1 /* Target code for NVPTX.
2 Copyright (C) 2014-2015 Free Software Foundation, Inc.
3 Contributed by Bernd Schmidt <bernds@codesourcery.com>
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published
9 by the Free Software Foundation; either version 3, or (at your
10 option) any later version.
11
12 GCC is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
15 License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #include "config.h"
22 #include <sstream>
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "cfghooks.h"
30 #include "df.h"
31 #include "tm_p.h"
32 #include "expmed.h"
33 #include "optabs.h"
34 #include "regs.h"
35 #include "emit-rtl.h"
36 #include "recog.h"
37 #include "diagnostic.h"
38 #include "alias.h"
39 #include "insn-flags.h"
40 #include "output.h"
41 #include "insn-attr.h"
42 #include "flags.h"
43 #include "dojump.h"
44 #include "explow.h"
45 #include "calls.h"
46 #include "varasm.h"
47 #include "stmt.h"
48 #include "expr.h"
49 #include "tm-preds.h"
50 #include "tm-constrs.h"
51 #include "langhooks.h"
52 #include "dbxout.h"
53 #include "cfgrtl.h"
54 #include "gimple.h"
55 #include "stor-layout.h"
56 #include "builtins.h"
57 #include "omp-low.h"
58 #include "gomp-constants.h"
59 #include "dumpfile.h"
60 #include "internal-fn.h"
61 #include "gimple-iterator.h"
62 #include "stringpool.h"
63 #include "tree-ssa-operands.h"
64 #include "tree-ssanames.h"
65 #include "gimplify.h"
66 #include "tree-phinodes.h"
67 #include "cfgloop.h"
68 #include "fold-const.h"
69
70 /* This file should be included last. */
71 #include "target-def.h"
72
73 /* The kind of shuffle instruction.  */
74 enum nvptx_shuffle_kind
75 {
76 SHUFFLE_UP,
77 SHUFFLE_DOWN,
78 SHUFFLE_BFLY,
79 SHUFFLE_IDX,
80 SHUFFLE_MAX
81 };
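/* These correspond to the PTX shfl instruction's .up, .down, .bfly
   and .idx modes; see the kinds[] table under the 'S' operand code in
   nvptx_print_operand below.  */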
82
83 /* The various PTX memory areas an object might reside in. */
84 enum nvptx_data_area
85 {
86 DATA_AREA_GENERIC,
87 DATA_AREA_GLOBAL,
88 DATA_AREA_SHARED,
89 DATA_AREA_LOCAL,
90 DATA_AREA_CONST,
91 DATA_AREA_PARAM,
92 DATA_AREA_MAX
93 };
94
95 /* We record the data area in the target symbol flags. */
96 #define SYMBOL_DATA_AREA(SYM) \
97 (nvptx_data_area)((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \
98 & 7)
99 #define SET_SYMBOL_DATA_AREA(SYM,AREA) \
100 (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)
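/* Three flag bits suffice for the area: DATA_AREA_MAX is 6.  */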
101
102 /* Record the function decls we've written, and the libfuncs and function
103 decls corresponding to them. */
104 static std::stringstream func_decls;
105
106 struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
107 {
108 static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
109 static bool equal (rtx a, rtx b) { return a == b; }
110 };
111
112 static GTY((cache))
113 hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;
114
115 struct tree_hasher : ggc_cache_ptr_hash<tree_node>
116 {
117 static hashval_t hash (tree t) { return htab_hash_pointer (t); }
118 static bool equal (tree a, tree b) { return a == b; }
119 };
120
121 static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
122 static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;
123
124 /* Buffer needed to broadcast across workers. This is used for both
125 worker-neutering and worker broadcasting. It is shared by all
126 functions emitted. The buffer is placed in shared memory. It'd be
127 nice if PTX supported common blocks, because then this could be
128 shared across TUs (taking the largest size). */
129 static unsigned worker_bcast_size;
130 static unsigned worker_bcast_align;
131 static GTY(()) rtx worker_bcast_sym;
132
133 /* Buffer needed for worker reductions. This has to be distinct from
134 the worker broadcast array, as both may be live concurrently. */
135 static unsigned worker_red_size;
136 static unsigned worker_red_align;
137 static GTY(()) rtx worker_red_sym;
138
139 /* Global lock variable, needed for 128-bit worker & gang reductions. */
140 static GTY(()) tree global_lock_var;
141
142 /* Allocate a new, cleared machine_function structure. */
143
144 static struct machine_function *
145 nvptx_init_machine_status (void)
146 {
147 struct machine_function *p = ggc_cleared_alloc<machine_function> ();
148 p->return_mode = VOIDmode;
149 return p;
150 }
151
152 /* Implement TARGET_OPTION_OVERRIDE. */
153
154 static void
155 nvptx_option_override (void)
156 {
157 init_machine_status = nvptx_init_machine_status;
158 /* Gives us a predictable order, which we need especially for variables. */
159 flag_toplevel_reorder = 1;
160 /* Assumes that it will see only hard registers. */
161 flag_var_tracking = 0;
162
163 if (write_symbols == DBX_DEBUG)
164 /* The stabs testcases want to know stabs isn't supported. */
165 sorry ("stabs debug format not supported");
166
167 /* Actually we don't have any debug format, but don't be
168 unnecessarily noisy. */
169 write_symbols = NO_DEBUG;
170 debug_info_level = DINFO_LEVEL_NONE;
171
172 if (nvptx_optimize < 0)
173 nvptx_optimize = optimize > 0;
174
175 declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
176 needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
177 declared_libfuncs_htab
178 = hash_table<declared_libfunc_hasher>::create_ggc (17);
179
180 worker_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_bcast");
181 SET_SYMBOL_DATA_AREA (worker_bcast_sym, DATA_AREA_SHARED);
182 worker_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
183
184 worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
185 SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
186 worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
187 }
188
189 /* Return a ptx type for MODE. If PROMOTE, then use .u32 for QImode to
190 deal with ptx idiosyncrasies. */
191
192 const char *
193 nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
194 {
195 switch (mode)
196 {
197 case BLKmode:
198 return ".b8";
199 case BImode:
200 return ".pred";
201 case QImode:
202 if (promote)
203 return ".u32";
204 else
205 return ".u8";
206 case HImode:
207 return ".u16";
208 case SImode:
209 return ".u32";
210 case DImode:
211 return ".u64";
212
213 case SFmode:
214 return ".f32";
215 case DFmode:
216 return ".f64";
217
218 default:
219 gcc_unreachable ();
220 }
221 }
222
223 /* Encode the PTX data area that DECL (which might not actually be a
224 _DECL) should reside in. */
225
226 static void
227 nvptx_encode_section_info (tree decl, rtx rtl, int first)
228 {
229 default_encode_section_info (decl, rtl, first);
230 if (first && MEM_P (rtl))
231 {
232 nvptx_data_area area = DATA_AREA_GENERIC;
233
234 if (TREE_CONSTANT (decl))
235 area = DATA_AREA_CONST;
236 else if (TREE_CODE (decl) == VAR_DECL)
237 /* TODO: This would be a good place to check for a .shared or
238 other section name. */
239 area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;
240
241 SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area);
242 }
243 }
244
245 /* Return the PTX name of the data area in which SYM should be
246 placed. The symbol must have already been processed by
247 nvptx_encode_section_info, or equivalent. */
248
249 static const char *
250 section_for_sym (rtx sym)
251 {
252 nvptx_data_area area = SYMBOL_DATA_AREA (sym);
253 /* Same order as nvptx_data_area enum. */
254 static char const *const areas[] =
255 {"", ".global", ".shared", ".local", ".const", ".param"};
256
257 return areas[area];
258 }
259
260 /* Similarly for a decl. */
261
262 static const char *
263 section_for_decl (const_tree decl)
264 {
265 return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree, decl)), 0));
266 }
267
268 /* Check NAME for special function names and redirect them by returning a
269 replacement. This applies to malloc, free and realloc, for which we
270 want to use libgcc wrappers, and call, which triggers a bug in ptxas. */
271
272 static const char *
273 nvptx_name_replacement (const char *name)
274 {
275 if (strcmp (name, "call") == 0)
276 return "__nvptx_call";
277 if (strcmp (name, "malloc") == 0)
278 return "__nvptx_malloc";
279 if (strcmp (name, "free") == 0)
280 return "__nvptx_free";
281 if (strcmp (name, "realloc") == 0)
282 return "__nvptx_realloc";
283 return name;
284 }
285
286 /* If MODE should be treated as two registers of an inner mode, return
287 that inner mode. Otherwise return VOIDmode. */
288
289 static machine_mode
290 maybe_split_mode (machine_mode mode)
291 {
292 if (COMPLEX_MODE_P (mode))
293 return GET_MODE_INNER (mode);
294
295 if (mode == TImode)
296 return DImode;
297
298 return VOIDmode;
299 }
300
301 /* Output a register, subreg, or register pair (with optional
302 enclosing braces). */
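/* SUBREG_OFFSET selects the form: -1 emits a brace-enclosed pair such
   as "{%r22$8,%r22$0}", -2 emits the same pair without braces (as used
   when declaring the pieces), and a non-negative value emits the single
   piece at that byte offset.  The register number 22 here is only an
   example.  */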
303
304 static void
305 output_reg (FILE *file, unsigned regno, machine_mode inner_mode,
306 int subreg_offset = -1)
307 {
308 if (inner_mode == VOIDmode)
309 {
310 if (HARD_REGISTER_NUM_P (regno))
311 fprintf (file, "%s", reg_names[regno]);
312 else
313 fprintf (file, "%%r%d", regno);
314 }
315 else if (subreg_offset >= 0)
316 {
317 output_reg (file, regno, VOIDmode);
318 fprintf (file, "$%d", subreg_offset);
319 }
320 else
321 {
322 if (subreg_offset == -1)
323 fprintf (file, "{");
324 output_reg (file, regno, inner_mode, GET_MODE_SIZE (inner_mode));
325 fprintf (file, ",");
326 output_reg (file, regno, inner_mode, 0);
327 if (subreg_offset == -1)
328 fprintf (file, "}");
329 }
330 }
331
332 /* Emit forking instructions for MASK. */
333
334 static void
335 nvptx_emit_forking (unsigned mask, bool is_call)
336 {
337 mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
338 | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
339 if (mask)
340 {
341 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
342
343 /* Emit fork at all levels. This helps form SESE regions, as
344 it creates a block with a single successor before entering a
345 partitioned region. That is a good candidate for the end of
346 an SESE region. */
347 if (!is_call)
348 emit_insn (gen_nvptx_fork (op));
349 emit_insn (gen_nvptx_forked (op));
350 }
351 }
352
353 /* Emit joining instructions for MASK. */
354
355 static void
356 nvptx_emit_joining (unsigned mask, bool is_call)
357 {
358 mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
359 | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
360 if (mask)
361 {
362 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
363
364 /* Emit joining for all non-call pars to ensure there's a single
365 predecessor for the block the join insn ends up in. This is
366 needed for skipping entire loops. */
367 if (!is_call)
368 emit_insn (gen_nvptx_joining (op));
369 emit_insn (gen_nvptx_join (op));
370 }
371 }
372
373 \f
374 /* Determine whether MODE and TYPE (possibly NULL) should be passed or
375 returned in memory. Integer and floating types supported by the
376 machine are passed in registers, everything else is passed in
377 memory. Complex types are split. */
378
379 static bool
380 pass_in_memory (machine_mode mode, const_tree type, bool for_return)
381 {
382 if (type)
383 {
384 if (AGGREGATE_TYPE_P (type))
385 return true;
386 if (TREE_CODE (type) == VECTOR_TYPE)
387 return true;
388 }
389
390 if (!for_return && COMPLEX_MODE_P (mode))
391 /* Complex types are passed as two underlying args. */
392 mode = GET_MODE_INNER (mode);
393
394 if (GET_MODE_CLASS (mode) != MODE_INT
395 && GET_MODE_CLASS (mode) != MODE_FLOAT)
396 return true;
397
398 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
399 return true;
400
401 return false;
402 }
403
404 /* A non-memory argument of mode MODE is being passed; determine the mode it
405 should be promoted to. This is also used for determining return
406 type promotion. */
407
408 static machine_mode
409 promote_arg (machine_mode mode, bool prototyped)
410 {
411 if (!prototyped && mode == SFmode)
412 /* K&R float promotion for unprototyped functions. */
413 mode = DFmode;
414 else if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode))
415 mode = SImode;
416
417 return mode;
418 }
419
420 /* A non-memory return type of MODE is being returned. Determine the
421 mode it should be promoted to. */
422
423 static machine_mode
424 promote_return (machine_mode mode)
425 {
426 return promote_arg (mode, true);
427 }
428
429 /* Implement TARGET_FUNCTION_ARG. */
430
431 static rtx
432 nvptx_function_arg (cumulative_args_t ARG_UNUSED (cum_v), machine_mode mode,
433 const_tree, bool named)
434 {
435 if (mode == VOIDmode || !named)
436 return NULL_RTX;
437
438 return gen_reg_rtx (mode);
439 }
440
441 /* Implement TARGET_FUNCTION_INCOMING_ARG. */
442
443 static rtx
444 nvptx_function_incoming_arg (cumulative_args_t cum_v, machine_mode mode,
445 const_tree, bool named)
446 {
447 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
448
449 if (mode == VOIDmode || !named)
450 return NULL_RTX;
451
452 /* No need to deal with split modes here, the only case that can
453 happen is complex modes and those are dealt with by
454 TARGET_SPLIT_COMPLEX_ARG. */
455 return gen_rtx_UNSPEC (mode,
456 gen_rtvec (1, GEN_INT (cum->count)),
457 UNSPEC_ARG_REG);
458 }
459
460 /* Implement TARGET_FUNCTION_ARG_ADVANCE. */
461
462 static void
463 nvptx_function_arg_advance (cumulative_args_t cum_v,
464 machine_mode ARG_UNUSED (mode),
465 const_tree ARG_UNUSED (type),
466 bool ARG_UNUSED (named))
467 {
468 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
469
470 cum->count++;
471 }
472
473 /* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.
474
475 For nvptx, we know how to handle functions declared as stdarg: by
476 passing an extra pointer to the unnamed arguments. However, the
477 Fortran frontend can produce a different situation, where a
478 function pointer is declared with no arguments, but the actual
479 function and calls to it take more arguments. In that case, we
480 want to ensure the call matches the definition of the function. */
481
482 static bool
483 nvptx_strict_argument_naming (cumulative_args_t cum_v)
484 {
485 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
486
487 return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
488 }
489
490 /* Implement TARGET_LIBCALL_VALUE. */
491
492 static rtx
493 nvptx_libcall_value (machine_mode mode, const_rtx)
494 {
495 if (!cfun->machine->doing_call)
496 /* Pretend to return in a hard reg for early uses before pseudos can be
497 generated. */
498 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
499
500 return gen_reg_rtx (mode);
501 }
502
503 /* TARGET_FUNCTION_VALUE implementation. Returns an RTX representing the place
504 where function FUNC returns or receives a value of data type TYPE. */
505
506 static rtx
507 nvptx_function_value (const_tree type, const_tree ARG_UNUSED (func),
508 bool outgoing)
509 {
510 machine_mode mode = promote_return (TYPE_MODE (type));
511
512 if (outgoing)
513 {
514 cfun->machine->return_mode = mode;
515 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
516 }
517
518 return nvptx_libcall_value (mode, NULL_RTX);
519 }
520
521 /* Implement TARGET_FUNCTION_VALUE_REGNO_P. */
522
523 static bool
524 nvptx_function_value_regno_p (const unsigned int regno)
525 {
526 return regno == NVPTX_RETURN_REGNUM;
527 }
528
529 /* Types with a mode other than those supported by the machine are passed by
530 reference in memory. */
531
532 static bool
533 nvptx_pass_by_reference (cumulative_args_t ARG_UNUSED (cum),
534 machine_mode mode, const_tree type,
535 bool ARG_UNUSED (named))
536 {
537 return pass_in_memory (mode, type, false);
538 }
539
540 /* Implement TARGET_RETURN_IN_MEMORY. */
541
542 static bool
543 nvptx_return_in_memory (const_tree type, const_tree)
544 {
545 return pass_in_memory (TYPE_MODE (type), type, true);
546 }
547
548 /* Implement TARGET_PROMOTE_FUNCTION_MODE. */
549
550 static machine_mode
551 nvptx_promote_function_mode (const_tree type, machine_mode mode,
552 int *ARG_UNUSED (punsignedp),
553 const_tree funtype, int for_return)
554 {
555 return promote_arg (mode, for_return || !type || TYPE_ARG_TYPES (funtype));
556 }
557
558 /* Helper for write_arg. Emit a single PTX argument of MODE, either
559 in a prototype, or as copy in a function prologue. ARGNO is the
560 index of this argument in the PTX function. FOR_REG is negative
561 if we're emitting the PTX prototype. It is zero if we're copying
562 to an argument register and it is greater than zero if we're
563 copying to a specific hard register. */
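/* For example, with MODE == SImode and ARGNO == 0 this appends
   " (.param.u32 %in_ar0" to a prototype (FOR_REG < 0), or emits
	.reg.u32 %ar0;
	ld.param.u32 %ar0, [%in_ar0];
   when copying to an argument register (FOR_REG == 0).  */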
564
565 static int
566 write_arg_mode (std::stringstream &s, int for_reg, int argno,
567 machine_mode mode)
568 {
569 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
570
571 if (for_reg < 0)
572 {
573 /* Writing PTX prototype. */
574 s << (argno ? ", " : " (");
575 s << ".param" << ptx_type << " %in_ar" << argno;
576 }
577 else
578 {
579 s << "\t.reg" << ptx_type << " ";
580 if (for_reg)
581 s << reg_names[for_reg];
582 else
583 s << "%ar" << argno;
584 s << ";\n";
585 if (argno >= 0)
586 {
587 s << "\tld.param" << ptx_type << " ";
588 if (for_reg)
589 s << reg_names[for_reg];
590 else
591 s << "%ar" << argno;
592 s << ", [%in_ar" << argno << "];\n";
593 }
594 }
595 return argno + 1;
596 }
597
598 /* Process function parameter TYPE to emit one or more PTX
599 arguments. S, FOR_REG and ARGNO are as for write_arg_mode. PROTOTYPED
600 is true if this is a prototyped function rather than an old-style
601 C declaration. Returns the next argument number to use.
602
603 The promotion behaviour here must match the regular GCC function
604 parameter marshalling machinery. */
605
606 static int
607 write_arg_type (std::stringstream &s, int for_reg, int argno,
608 tree type, bool prototyped)
609 {
610 machine_mode mode = TYPE_MODE (type);
611
612 if (mode == VOIDmode)
613 return argno;
614
615 if (pass_in_memory (mode, type, false))
616 mode = Pmode;
617 else
618 {
619 bool split = TREE_CODE (type) == COMPLEX_TYPE;
620
621 if (split)
622 {
623 /* Complex types are sent as two separate args. */
624 type = TREE_TYPE (type);
625 mode = TYPE_MODE (type);
626 prototyped = true;
627 }
628
629 mode = promote_arg (mode, prototyped);
630 if (split)
631 argno = write_arg_mode (s, for_reg, argno, mode);
632 }
633
634 return write_arg_mode (s, for_reg, argno, mode);
635 }
636
637 /* Emit a PTX return as a prototype or function prologue declaration
638 for MODE. */
639
640 static void
641 write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode)
642 {
643 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
644 const char *pfx = "\t.reg";
645 const char *sfx = ";\n";
646
647 if (for_proto)
648 pfx = "(.param", sfx = "_out) ";
649
650 s << pfx << ptx_type << " " << reg_names[NVPTX_RETURN_REGNUM] << sfx;
651 }
652
653 /* Process a function return TYPE to emit a PTX return as a prototype
654 or function prologue declaration. Returns true if return is via an
655 additional pointer parameter. The promotion behaviour here must
656 match the regular GCC function return marshalling. */
657
658 static bool
659 write_return_type (std::stringstream &s, bool for_proto, tree type)
660 {
661 machine_mode mode = TYPE_MODE (type);
662
663 if (mode == VOIDmode)
664 return false;
665
666 bool return_in_mem = pass_in_memory (mode, type, true);
667
668 if (return_in_mem)
669 {
670 if (for_proto)
671 return return_in_mem;
672
673 /* Named return values can cause us to return a pointer as well
674 as expect an argument for the return location. This is
675 optimization-level specific, so no caller can make use of
676 this data, but more importantly for us, we must ensure it
677 doesn't change the PTX prototype. */
678 mode = (machine_mode) cfun->machine->return_mode;
679
680 if (mode == VOIDmode)
681 return return_in_mem;
682
683 /* Clear return_mode to inhibit copy of retval to non-existent
684 retval parameter. */
685 cfun->machine->return_mode = VOIDmode;
686 }
687 else
688 mode = promote_return (mode);
689
690 write_return_mode (s, for_proto, mode);
691
692 return return_in_mem;
693 }
694
695 /* Look for attributes in ATTRS that would indicate we must write a function
696 as a .entry kernel rather than a .func. Return true if one is found. */
697
698 static bool
699 write_as_kernel (tree attrs)
700 {
701 return (lookup_attribute ("kernel", attrs) != NULL_TREE
702 || lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE);
703 }
704
705 /* Emit a linker marker for a function decl or defn. */
706
707 static void
708 write_fn_marker (std::stringstream &s, bool is_defn, bool globalize,
709 const char *name)
710 {
711 s << "\n// BEGIN";
712 if (globalize)
713 s << " GLOBAL";
714 s << " FUNCTION " << (is_defn ? "DEF: " : "DECL: ");
715 s << name << "\n";
716 }
717
718 /* Emit a linker marker for a variable decl or defn. */
719
720 static void
721 write_var_marker (FILE *file, bool is_defn, bool globalize, const char *name)
722 {
723 fprintf (file, "\n// BEGIN%s VAR %s: ",
724 globalize ? " GLOBAL" : "",
725 is_defn ? "DEF" : "DECL");
726 assemble_name_raw (file, name);
727 fputs ("\n", file);
728 }
729
730 /* Write a .func or .kernel declaration or definition along with
731 a helper comment for use by ld. S is the stream to write to, DECL
732 the decl for the function with name NAME. For definitions, emit
733 a declaration too. */
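/* As an illustration, a public "int foo (int)" comes out as (apart
   from the linker marker comment, and assuming the return register is
   named %value):
     .visible .func (.param.u32 %value_out) foo (.param.u32 %in_ar0);  */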
734
735 static const char *
736 write_fn_proto (std::stringstream &s, bool is_defn,
737 const char *name, const_tree decl)
738 {
739 if (is_defn)
740 /* Emit a declaration. The PTX assembler gets upset without it. */
741 name = write_fn_proto (s, false, name, decl);
742 else
743 {
744 /* Avoid repeating the name replacement. */
745 name = nvptx_name_replacement (name);
746 if (name[0] == '*')
747 name++;
748 }
749
750 write_fn_marker (s, is_defn, TREE_PUBLIC (decl), name);
751
752 /* PTX declaration. */
753 if (DECL_EXTERNAL (decl))
754 s << ".extern ";
755 else if (TREE_PUBLIC (decl))
756 s << (DECL_WEAK (decl) ? ".weak " : ".visible ");
757 s << (write_as_kernel (DECL_ATTRIBUTES (decl)) ? ".entry " : ".func ");
758
759 tree fntype = TREE_TYPE (decl);
760 tree result_type = TREE_TYPE (fntype);
761
762 /* Declare the result. */
763 bool return_in_mem = write_return_type (s, true, result_type);
764
765 s << name;
766
767 int argno = 0;
768
769 /* Emit argument list. */
770 if (return_in_mem)
771 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
772
773 /* We get:
774 NULL in TYPE_ARG_TYPES, for old-style functions
775 NULL in DECL_ARGUMENTS, for builtin functions without another
776 declaration.
777 So we have to pick the best one we have. */
778 tree args = TYPE_ARG_TYPES (fntype);
779 bool prototyped = true;
780 if (!args)
781 {
782 args = DECL_ARGUMENTS (decl);
783 prototyped = false;
784 }
785
786 for (; args; args = TREE_CHAIN (args))
787 {
788 tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
789
790 argno = write_arg_type (s, -1, argno, type, prototyped);
791 }
792
793 if (stdarg_p (fntype))
794 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
795
796 if (DECL_STATIC_CHAIN (decl))
797 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
798
799 if (!argno && strcmp (name, "main") == 0)
800 {
801 argno = write_arg_type (s, -1, argno, integer_type_node, true);
802 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
803 }
804
805 if (argno)
806 s << ")";
807
808 s << (is_defn ? "\n" : ";\n");
809
810 return name;
811 }
812
813 /* Construct a function declaration from a call insn. This can be
814 necessary for two reasons - either we have an indirect call which
815 requires a .callprototype declaration, or we have a libcall
816 generated by emit_library_call for which no decl exists. */
817
818 static void
819 write_fn_proto_from_insn (std::stringstream &s, const char *name,
820 rtx result, rtx pat)
821 {
822 if (!name)
823 {
824 s << "\t.callprototype ";
825 name = "_";
826 }
827 else
828 {
829 name = nvptx_name_replacement (name);
830 write_fn_marker (s, false, true, name);
831 s << "\t.extern .func ";
832 }
833
834 if (result != NULL_RTX)
835 write_return_mode (s, true, GET_MODE (result));
836
837 s << name;
838
839 int arg_end = XVECLEN (pat, 0);
840 for (int i = 1; i < arg_end; i++)
841 {
842 /* We don't have to deal with mode splitting & promotion here,
843 as that was already done when generating the call
844 sequence. */
845 machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0));
846
847 write_arg_mode (s, -1, i - 1, mode);
848 }
849 if (arg_end != 1)
850 s << ")";
851 s << ";\n";
852 }
853
854 /* DECL is an external FUNCTION_DECL; make sure it's in the fndecl hash
855 table and write a ptx prototype. These are emitted at end of
856 compilation. */
857
858 static void
859 nvptx_record_fndecl (tree decl)
860 {
861 tree *slot = declared_fndecls_htab->find_slot (decl, INSERT);
862 if (*slot == NULL)
863 {
864 *slot = decl;
865 const char *name = get_fnname_from_decl (decl);
866 write_fn_proto (func_decls, false, name, decl);
867 }
868 }
869
870 /* Record a libcall or unprototyped external function. CALLEE is the
871 SYMBOL_REF. Insert into the libfunc hash table and emit a ptx
872 declaration for it. */
873
874 static void
875 nvptx_record_libfunc (rtx callee, rtx retval, rtx pat)
876 {
877 rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
878 if (*slot == NULL)
879 {
880 *slot = callee;
881
882 const char *name = XSTR (callee, 0);
883 write_fn_proto_from_insn (func_decls, name, retval, pat);
884 }
885 }
886
887 /* DECL is an external FUNCTION_DECL that we're referencing. If it
888 is prototyped, record it now. Otherwise record it as needed at end
889 of compilation, when we might have more information about it. */
890
891 void
892 nvptx_record_needed_fndecl (tree decl)
893 {
894 if (TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE)
895 {
896 tree *slot = needed_fndecls_htab->find_slot (decl, INSERT);
897 if (*slot == NULL)
898 *slot = decl;
899 }
900 else
901 nvptx_record_fndecl (decl);
902 }
903
904 /* SYM is a SYMBOL_REF. If it refers to an external function, record
905 it as needed. */
906
907 static void
908 nvptx_maybe_record_fnsym (rtx sym)
909 {
910 tree decl = SYMBOL_REF_DECL (sym);
911
912 if (decl && TREE_CODE (decl) == FUNCTION_DECL && DECL_EXTERNAL (decl))
913 nvptx_record_needed_fndecl (decl);
914 }
915
916 /* Emit a local array to hold some part of a conventional stack frame
917 and initialize REGNO to point to it. If the size is zero, it'll
918 never be valid to dereference, so we can simply initialize to
919 zero. */
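/* For instance, a 16-byte frame aligned to 8 bytes produces, with
   64-bit pointers and assuming the register is named %frame:
	.local .align 8 .b8 %frame_ar[16];
	.reg.u64 %frame;
	cvta.local.u64 %frame, %frame_ar;  */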
920
921 static void
922 init_frame (FILE *file, int regno, unsigned align, unsigned size)
923 {
924 if (size)
925 fprintf (file, "\t.local .align %d .b8 %s_ar[%u];\n",
926 align, reg_names[regno], size);
927 fprintf (file, "\t.reg.u%d %s;\n",
928 POINTER_SIZE, reg_names[regno]);
929 fprintf (file, (size ? "\tcvta.local.u%d %s, %s_ar;\n"
930 : "\tmov.u%d %s, 0;\n"),
931 POINTER_SIZE, reg_names[regno], reg_names[regno]);
932 }
933
934 /* Emit code to initialize the REGNO predicate register to indicate
935 whether we are not lane zero on the NAME axis. */
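/* For the "y" axis and predicate register 27 (an illustrative number)
   this emits:
	{
		.reg.u32	%y;
		mov.u32	%y, %tid.y;
		setp.ne.u32	%r27, %y, 0;
	}  */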
936
937 static void
938 nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
939 {
940 fprintf (file, "\t{\n");
941 fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
942 fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
943 fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
944 fprintf (file, "\t}\n");
945 }
946
947 /* Implement ASM_DECLARE_FUNCTION_NAME. Writes the start of a ptx
948 function, including local var decls and copies from the arguments to
949 local regs. */
950
951 void
952 nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
953 {
954 tree fntype = TREE_TYPE (decl);
955 tree result_type = TREE_TYPE (fntype);
956 int argno = 0;
957
958 /* We construct the initial part of the function into a string
959 stream, in order to share the prototype writing code. */
960 std::stringstream s;
961 write_fn_proto (s, true, name, decl);
962 s << "{\n";
963
964 bool return_in_mem = write_return_type (s, false, result_type);
965 if (return_in_mem)
966 argno = write_arg_type (s, 0, argno, ptr_type_node, true);
967
968 /* Declare and initialize incoming arguments. */
969 tree args = TYPE_ARG_TYPES (fntype);
970 bool prototyped = true;
971 if (!args)
972 {
973 args = DECL_ARGUMENTS (decl);
974 prototyped = false;
975 }
976
977 for (; args != NULL_TREE; args = TREE_CHAIN (args))
978 {
979 tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
980
981 argno = write_arg_type (s, 0, argno, type, prototyped);
982 }
983
984 if (stdarg_p (fntype))
985 argno = write_arg_type (s, ARG_POINTER_REGNUM, argno, ptr_type_node,
986 true);
987
988 if (DECL_STATIC_CHAIN (decl) || cfun->machine->has_chain)
989 write_arg_type (s, STATIC_CHAIN_REGNUM,
990 DECL_STATIC_CHAIN (decl) ? argno : -1, ptr_type_node,
991 true);
992
993 fprintf (file, "%s", s.str().c_str());
994
995 /* Declare a local var for outgoing varargs. */
996 if (cfun->machine->has_varadic)
997 init_frame (file, STACK_POINTER_REGNUM,
998 UNITS_PER_WORD, crtl->outgoing_args_size);
999
1000 /* Declare a local variable for the frame. */
1001 HOST_WIDE_INT sz = get_frame_size ();
1002 if (sz || cfun->machine->has_chain)
1003 init_frame (file, FRAME_POINTER_REGNUM,
1004 crtl->stack_alignment_needed / BITS_PER_UNIT, sz);
1005
1006 /* Declare the pseudos we have as ptx registers. */
1007 int maxregs = max_reg_num ();
1008 for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
1009 {
1010 if (regno_reg_rtx[i] != const0_rtx)
1011 {
1012 machine_mode mode = PSEUDO_REGNO_MODE (i);
1013 machine_mode split = maybe_split_mode (mode);
1014
1015 if (split != VOIDmode)
1016 mode = split;
1017 fprintf (file, "\t.reg%s ", nvptx_ptx_type_from_mode (mode, true));
1018 output_reg (file, i, split, -2);
1019 fprintf (file, ";\n");
1020 }
1021 }
1022
1023 /* Emit axis predicates. */
1024 if (cfun->machine->axis_predicate[0])
1025 nvptx_init_axis_predicate (file,
1026 REGNO (cfun->machine->axis_predicate[0]), "y");
1027 if (cfun->machine->axis_predicate[1])
1028 nvptx_init_axis_predicate (file,
1029 REGNO (cfun->machine->axis_predicate[1]), "x");
1030 }
1031
1032 /* Output a return instruction. Also copy the return value to its outgoing
1033 location. */
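/* For an SImode return value this emits, assuming the return register
   is named %value:
	st.param.u32	[%value_out], %value;
	ret;  */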
1034
1035 const char *
1036 nvptx_output_return (void)
1037 {
1038 machine_mode mode = (machine_mode)cfun->machine->return_mode;
1039
1040 if (mode != VOIDmode)
1041 fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n",
1042 nvptx_ptx_type_from_mode (mode, false),
1043 reg_names[NVPTX_RETURN_REGNUM],
1044 reg_names[NVPTX_RETURN_REGNUM]);
1045
1046 return "ret;";
1047 }
1048
1049 /* Terminate a function by writing a closing brace to FILE. */
1050
1051 void
1052 nvptx_function_end (FILE *file)
1053 {
1054 fprintf (file, "}\n");
1055 }
1056 \f
1057 /* Decide whether we can make a sibling call to a function. For ptx, we
1058 can't. */
1059
1060 static bool
1061 nvptx_function_ok_for_sibcall (tree, tree)
1062 {
1063 return false;
1064 }
1065
1066 /* Return Dynamic ReAlignment Pointer RTX. For PTX there isn't any. */
1067
1068 static rtx
1069 nvptx_get_drap_rtx (void)
1070 {
1071 return NULL_RTX;
1072 }
1073
1074 /* Implement the TARGET_CALL_ARGS hook. Record information about one
1075 argument to the next call. */
1076
1077 static void
1078 nvptx_call_args (rtx arg, tree fntype)
1079 {
1080 if (!cfun->machine->doing_call)
1081 {
1082 cfun->machine->doing_call = true;
1083 cfun->machine->is_varadic = false;
1084 cfun->machine->num_args = 0;
1085
1086 if (fntype && stdarg_p (fntype))
1087 {
1088 cfun->machine->is_varadic = true;
1089 cfun->machine->has_varadic = true;
1090 cfun->machine->num_args++;
1091 }
1092 }
1093
1094 if (REG_P (arg) && arg != pc_rtx)
1095 {
1096 cfun->machine->num_args++;
1097 cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg,
1098 cfun->machine->call_args);
1099 }
1100 }
1101
1102 /* Implement the corresponding END_CALL_ARGS hook. Clear and free the
1103 information we recorded. */
1104
1105 static void
1106 nvptx_end_call_args (void)
1107 {
1108 cfun->machine->doing_call = false;
1109 free_EXPR_LIST_list (&cfun->machine->call_args);
1110 }
1111
1112 /* Emit the sequence for a call to ADDRESS, setting RETVAL. Keep
1113 track of whether calls involving static chains or varargs were seen
1114 in the current function.
1115 For libcalls, maintain a hash table of decls we have seen, and
1116 record a function decl for later when encountering a new one. */
1117
1118 void
1119 nvptx_expand_call (rtx retval, rtx address)
1120 {
1121 rtx callee = XEXP (address, 0);
1122 rtx varargs = NULL_RTX;
1123 unsigned parallel = 0;
1124
1125 if (!call_insn_operand (callee, Pmode))
1126 {
1127 callee = force_reg (Pmode, callee);
1128 address = change_address (address, QImode, callee);
1129 }
1130
1131 if (GET_CODE (callee) == SYMBOL_REF)
1132 {
1133 tree decl = SYMBOL_REF_DECL (callee);
1134 if (decl != NULL_TREE)
1135 {
1136 if (DECL_STATIC_CHAIN (decl))
1137 cfun->machine->has_chain = true;
1138
1139 tree attr = get_oacc_fn_attrib (decl);
1140 if (attr)
1141 {
1142 tree dims = TREE_VALUE (attr);
1143
1144 parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
1145 for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
1146 {
1147 if (TREE_PURPOSE (dims)
1148 && !integer_zerop (TREE_PURPOSE (dims)))
1149 break;
1150 /* Not on this axis. */
1151 parallel ^= GOMP_DIM_MASK (ix);
1152 dims = TREE_CHAIN (dims);
1153 }
1154 }
1155 }
1156 }
1157
1158 unsigned nargs = cfun->machine->num_args;
1159 if (cfun->machine->is_varadic)
1160 {
1161 varargs = gen_reg_rtx (Pmode);
1162 emit_move_insn (varargs, stack_pointer_rtx);
1163 }
1164
1165 rtvec vec = rtvec_alloc (nargs + 1);
1166 rtx pat = gen_rtx_PARALLEL (VOIDmode, vec);
1167 int vec_pos = 0;
1168
1169 rtx call = gen_rtx_CALL (VOIDmode, address, const0_rtx);
1170 rtx tmp_retval = retval;
1171 if (retval)
1172 {
1173 if (!nvptx_register_operand (retval, GET_MODE (retval)))
1174 tmp_retval = gen_reg_rtx (GET_MODE (retval));
1175 call = gen_rtx_SET (tmp_retval, call);
1176 }
1177 XVECEXP (pat, 0, vec_pos++) = call;
1178
1179 /* Construct the call insn, including a USE for each argument pseudo
1180 register. These will be used when printing the insn. */
1181 for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
1182 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, XEXP (arg, 0));
1183
1184 if (varargs)
1185 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);
1186
1187 gcc_assert (vec_pos == XVECLEN (pat, 0));
1188
1189 nvptx_emit_forking (parallel, true);
1190 emit_call_insn (pat);
1191 nvptx_emit_joining (parallel, true);
1192
1193 if (tmp_retval != retval)
1194 emit_move_insn (retval, tmp_retval);
1195 }
1196
1197 /* Emit a comparison COMPARE, and return the new test to be used in the
1198 jump. */
1199
1200 rtx
1201 nvptx_expand_compare (rtx compare)
1202 {
1203 rtx pred = gen_reg_rtx (BImode);
1204 rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
1205 XEXP (compare, 0), XEXP (compare, 1));
1206 emit_insn (gen_rtx_SET (pred, cmp));
1207 return gen_rtx_NE (BImode, pred, const0_rtx);
1208 }
1209
1210 /* Expand the oacc fork & join primitive into ptx-required unspecs. */
1211
1212 void
1213 nvptx_expand_oacc_fork (unsigned mode)
1214 {
1215 nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
1216 }
1217
1218 void
1219 nvptx_expand_oacc_join (unsigned mode)
1220 {
1221 nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
1222 }
1223
1224 /* Generate instruction(s) to unpack a 64-bit object into two 32-bit
1225 objects. */
1226
1227 static rtx
1228 nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
1229 {
1230 rtx res;
1231
1232 switch (GET_MODE (src))
1233 {
1234 case DImode:
1235 res = gen_unpackdisi2 (dst0, dst1, src);
1236 break;
1237 case DFmode:
1238 res = gen_unpackdfsi2 (dst0, dst1, src);
1239 break;
1240 default: gcc_unreachable ();
1241 }
1242 return res;
1243 }
1244
1245 /* Generate instruction(s) to pack two 32-bit objects into a 64-bit
1246 object. */
1247
1248 static rtx
1249 nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
1250 {
1251 rtx res;
1252
1253 switch (GET_MODE (dst))
1254 {
1255 case DImode:
1256 res = gen_packsidi2 (dst, src0, src1);
1257 break;
1258 case DFmode:
1259 res = gen_packsidf2 (dst, src0, src1);
1260 break;
1261 default: gcc_unreachable ();
1262 }
1263 return res;
1264 }
1265
1266 /* Generate an instruction or sequence to shuffle SRC into DST across
1267 the lanes of a warp, using lane offset or index IDX and kind KIND. */
1268
1269 static rtx
1270 nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
1271 {
1272 rtx res;
1273
1274 switch (GET_MODE (dst))
1275 {
1276 case SImode:
1277 res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
1278 break;
1279 case SFmode:
1280 res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
1281 break;
1282 case DImode:
1283 case DFmode:
1284 {
1285 rtx tmp0 = gen_reg_rtx (SImode);
1286 rtx tmp1 = gen_reg_rtx (SImode);
1287
1288 start_sequence ();
1289 emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
1290 emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
1291 emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
1292 emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
1293 res = get_insns ();
1294 end_sequence ();
1295 }
1296 break;
1297 case BImode:
1298 {
1299 rtx tmp = gen_reg_rtx (SImode);
1300
1301 start_sequence ();
1302 emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
1303 emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
1304 emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
1305 res = get_insns ();
1306 end_sequence ();
1307 }
1308 break;
1309
1310 default:
1311 gcc_unreachable ();
1312 }
1313 return res;
1314 }
1315
1316 /* Generate an instruction or sequence to broadcast register REG
1317 across the vectors of a single warp. */
1318
1319 static rtx
1320 nvptx_gen_vcast (rtx reg)
1321 {
1322 return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
1323 }
1324
1325 /* Structure used when generating a worker-level spill or fill. */
1326
1327 struct wcast_data_t
1328 {
1329 rtx base; /* Register holding base addr of buffer. */
1330 rtx ptr; /* Iteration var, if needed. */
1331 unsigned offset; /* Offset into worker buffer. */
1332 };
1333
1334 /* Direction of the spill/fill and looping setup/teardown indicator. */
1335
1336 enum propagate_mask
1337 {
1338 PM_read = 1 << 0,
1339 PM_write = 1 << 1,
1340 PM_loop_begin = 1 << 2,
1341 PM_loop_end = 1 << 3,
1342
1343 PM_read_write = PM_read | PM_write
1344 };
1345
1346 /* Generate instruction(s) to spill or fill register REG to/from the
1347 worker broadcast array. PM indicates what is to be done, REP
1348 how many loop iterations will be executed (0 for not a loop). */
1349
1350 static rtx
1351 nvptx_gen_wcast (rtx reg, propagate_mask pm, unsigned rep, wcast_data_t *data)
1352 {
1353 rtx res;
1354 machine_mode mode = GET_MODE (reg);
1355
1356 switch (mode)
1357 {
1358 case BImode:
1359 {
1360 rtx tmp = gen_reg_rtx (SImode);
1361
1362 start_sequence ();
1363 if (pm & PM_read)
1364 emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
1365 emit_insn (nvptx_gen_wcast (tmp, pm, rep, data));
1366 if (pm & PM_write)
1367 emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
1368 res = get_insns ();
1369 end_sequence ();
1370 }
1371 break;
1372
1373 default:
1374 {
1375 rtx addr = data->ptr;
1376
1377 if (!addr)
1378 {
1379 unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;
1380
1381 if (align > worker_bcast_align)
1382 worker_bcast_align = align;
1383 data->offset = (data->offset + align - 1) & ~(align - 1);
1384 addr = data->base;
1385 if (data->offset)
1386 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
1387 }
1388
1389 addr = gen_rtx_MEM (mode, addr);
1390 if (pm == PM_read)
1391 res = gen_rtx_SET (addr, reg);
1392 else if (pm == PM_write)
1393 res = gen_rtx_SET (reg, addr);
1394 else
1395 gcc_unreachable ();
1396
1397 if (data->ptr)
1398 {
1399 /* We're using a ptr, increment it. */
1400 start_sequence ();
1401
1402 emit_insn (res);
1403 emit_insn (gen_adddi3 (data->ptr, data->ptr,
1404 GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
1405 res = get_insns ();
1406 end_sequence ();
1407 }
1408 else
1409 rep = 1;
1410 data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
1411 }
1412 break;
1413 }
1414 return res;
1415 }
1416
1417 /* When loading an operand OP, verify whether an address space
1418 conversion to generic is required, and if so, perform it. Check
1419 for SYMBOL_REFs and record them if needed. Return either the
1420 original operand, or the converted one. */
1421
1422 rtx
1423 nvptx_maybe_convert_symbolic_operand (rtx op)
1424 {
1425 if (GET_MODE (op) != Pmode)
1426 return op;
1427
1428 rtx sym = op;
1429 if (GET_CODE (sym) == CONST)
1430 sym = XEXP (sym, 0);
1431 if (GET_CODE (sym) == PLUS)
1432 sym = XEXP (sym, 0);
1433
1434 if (GET_CODE (sym) != SYMBOL_REF)
1435 return op;
1436
1437 nvptx_maybe_record_fnsym (sym);
1438
1439 nvptx_data_area area = SYMBOL_DATA_AREA (sym);
1440 if (area == DATA_AREA_GENERIC)
1441 return op;
1442
1443 rtx dest = gen_reg_rtx (Pmode);
1444 emit_insn (gen_rtx_SET (dest,
1445 gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op),
1446 UNSPEC_TO_GENERIC)));
1447 return dest;
1448 }
1449 \f
1450 /* Returns true if X is a valid address for use in a memory reference. */
1451
1452 static bool
1453 nvptx_legitimate_address_p (machine_mode, rtx x, bool)
1454 {
1455 enum rtx_code code = GET_CODE (x);
1456
1457 switch (code)
1458 {
1459 case REG:
1460 return true;
1461
1462 case PLUS:
1463 if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1)))
1464 return true;
1465 return false;
1466
1467 case CONST:
1468 case SYMBOL_REF:
1469 case LABEL_REF:
1470 return true;
1471
1472 default:
1473 return false;
1474 }
1475 }
1476 \f
1477 /* Machinery to output constant initializers. When beginning an
1478 initializer, we decide on a fragment size (which is visible in ptx
1479 in the type used), and then all initializer data is buffered until
1480 a fragment is filled and ready to be written out. */
1481
1482 static struct
1483 {
1484 unsigned HOST_WIDE_INT mask; /* Mask for storing fragment. */
1485 unsigned HOST_WIDE_INT val; /* Current fragment value. */
1486 unsigned HOST_WIDE_INT remaining; /* Remaining bytes to be written
1487 out. */
1488 unsigned size; /* Fragment size to accumulate. */
1489 unsigned offset; /* Offset within current fragment. */
1490 bool started; /* Whether we've output any initializer. */
1491 } init_frag;
1492
1493 /* The current fragment is full, write it out. SYM may provide a
1494 symbolic reference we should output, in which case the fragment
1495 value is the addend. */
1496
1497 static void
1498 output_init_frag (rtx sym)
1499 {
1500 fprintf (asm_out_file, init_frag.started ? ", " : " = { ");
1501 unsigned HOST_WIDE_INT val = init_frag.val;
1502
1503 init_frag.started = true;
1504 init_frag.val = 0;
1505 init_frag.offset = 0;
1506 init_frag.remaining--;
1507
1508 if (sym)
1509 {
1510 fprintf (asm_out_file, "generic(");
1511 output_address (VOIDmode, sym);
1512 fprintf (asm_out_file, val ? ") + " : ")");
1513 }
1514
1515 if (!sym || val)
1516 fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val);
1517 }
1518
1519 /* Add value VAL of size SIZE to the data we're emitting, and keep
1520 writing out chunks as they fill up. */
1521
1522 static void
1523 nvptx_assemble_value (unsigned HOST_WIDE_INT val, unsigned size)
1524 {
1525 val &= ((unsigned HOST_WIDE_INT)2 << (size * BITS_PER_UNIT - 1)) - 1;
1526
1527 for (unsigned part = 0; size; size -= part)
1528 {
1529 val >>= part * BITS_PER_UNIT;
1530 part = init_frag.size - init_frag.offset;
1531 if (part > size)
1532 part = size;
1533
1534 unsigned HOST_WIDE_INT partial
1535 = val << (init_frag.offset * BITS_PER_UNIT);
1536 init_frag.val |= partial & init_frag.mask;
1537 init_frag.offset += part;
1538
1539 if (init_frag.offset == init_frag.size)
1540 output_init_frag (NULL);
1541 }
1542 }
1543
1544 /* Target hook for assembling integer object X of size SIZE. */
1545
1546 static bool
1547 nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p))
1548 {
1549 HOST_WIDE_INT val = 0;
1550
1551 switch (GET_CODE (x))
1552 {
1553 default:
1554 /* Let the generic machinery figure it out, usually for a
1555 CONST_WIDE_INT. */
1556 return false;
1557
1558 case CONST_INT:
1559 nvptx_assemble_value (INTVAL (x), size);
1560 break;
1561
1562 case CONST:
1563 x = XEXP (x, 0);
1564 gcc_assert (GET_CODE (x) == PLUS);
1565 val = INTVAL (XEXP (x, 1));
1566 x = XEXP (x, 0);
1567 gcc_assert (GET_CODE (x) == SYMBOL_REF);
1568 /* FALLTHROUGH */
1569
1570 case SYMBOL_REF:
1571 gcc_assert (size == init_frag.size);
1572 if (init_frag.offset)
1573 sorry ("cannot emit unaligned pointers in ptx assembly");
1574
1575 nvptx_maybe_record_fnsym (x);
1576 init_frag.val = val;
1577 output_init_frag (x);
1578 break;
1579 }
1580
1581 return true;
1582 }
1583
1584 /* Output SIZE zero bytes. We ignore the FILE argument since the
1585 functions we're calling to perform the output just use
1586 asm_out_file. */
1587
1588 void
1589 nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
1590 {
1591 /* Finish the current fragment, if it's started. */
1592 if (init_frag.offset)
1593 {
1594 unsigned part = init_frag.size - init_frag.offset;
1595 if (part > size)
1596 part = (unsigned) size;
1597 size -= part;
1598 nvptx_assemble_value (0, part);
1599 }
1600
1601 /* If this skip doesn't terminate the initializer, write as many
1602 remaining pieces as possible directly. */
1603 if (size < init_frag.remaining * init_frag.size)
1604 {
1605 while (size >= init_frag.size)
1606 {
1607 size -= init_frag.size;
1608 output_init_frag (NULL_RTX);
1609 }
1610 if (size)
1611 nvptx_assemble_value (0, size);
1612 }
1613 }
1614
1615 /* Output a string STR with length SIZE. As in nvptx_output_skip we
1616 ignore the FILE arg. */
1617
1618 void
1619 nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
1620 {
1621 for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
1622 nvptx_assemble_value (str[i], 1);
1623 }
1624
1625 /* Emit a PTX variable decl and prepare for emission of its
1626 initializer. NAME is the symbol name and SECTION the PTX data
1627 area. The type is TYPE, object size SIZE and alignment is ALIGN.
1628 The caller has already emitted any indentation and linkage
1629 specifier. It is responsible for any initializer, terminating ;
1630 and newline. SIZE is in bytes, ALIGN is in bits -- confusingly
1631 this is the opposite way round from what PTX wants them! */
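/* For example, a global "int x[3]" with 4-byte alignment comes out as
   ".global .align 4 .u32 x[3]"; any initializer is then appended by
   the fragment machinery above, giving e.g. " = { 1, 2, 3 };".  */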
1632
1633 static void
1634 nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section,
1635 const_tree type, HOST_WIDE_INT size, unsigned align)
1636 {
1637 while (TREE_CODE (type) == ARRAY_TYPE)
1638 type = TREE_TYPE (type);
1639
1640 if (TREE_CODE (type) == VECTOR_TYPE
1641 || TREE_CODE (type) == COMPLEX_TYPE)
1642 /* Neither vector nor complex types can contain the other. */
1643 type = TREE_TYPE (type);
1644
1645 unsigned elt_size = int_size_in_bytes (type);
1646
1647 /* Largest mode we're prepared to accept. For BLKmode types we
1648 don't know if it'll contain pointer constants, so have to choose
1649 pointer size, otherwise we can choose DImode. */
1650 machine_mode elt_mode = TYPE_MODE (type) == BLKmode ? Pmode : DImode;
1651
1652 elt_size |= GET_MODE_SIZE (elt_mode);
1653 elt_size &= -elt_size; /* Extract LSB set. */
1654
1655 init_frag.size = elt_size;
1656 /* Avoid undefined shift behaviour by using '2'. */
1657 init_frag.mask = ((unsigned HOST_WIDE_INT)2
1658 << (elt_size * BITS_PER_UNIT - 1)) - 1;
1659 init_frag.val = 0;
1660 init_frag.offset = 0;
1661 init_frag.started = false;
1662 /* Size might not be a multiple of elt_size if there's an
1663 initialized trailing struct array with a smaller type than
1664 elt_size. */
1665 init_frag.remaining = (size + elt_size - 1) / elt_size;
1666
1667 fprintf (file, "%s .align %d .u%d ",
1668 section, align / BITS_PER_UNIT,
1669 elt_size * BITS_PER_UNIT);
1670 assemble_name (file, name);
1671
1672 if (size)
1673 /* We make everything an array, to simplify any initialization
1674 emission. */
1675 fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]", init_frag.remaining);
1676 }
1677
1678 /* Called when the initializer for a decl has been completely output through
1679 combinations of the three functions above. */
1680
1681 static void
1682 nvptx_assemble_decl_end (void)
1683 {
1684 if (init_frag.offset)
1685 /* This can happen with a packed struct with trailing array member. */
1686 nvptx_assemble_value (0, init_frag.size - init_frag.offset);
1687 fprintf (asm_out_file, init_frag.started ? " };\n" : ";\n");
1688 }
1689
1690 /* Output an uninitialized common or file-scope variable. */
1691
1692 void
1693 nvptx_output_aligned_decl (FILE *file, const char *name,
1694 const_tree decl, HOST_WIDE_INT size, unsigned align)
1695 {
1696 write_var_marker (file, true, TREE_PUBLIC (decl), name);
1697
1698 /* If this is public, it is common. The nearest thing we have to
1699 common is weak. */
1700 fprintf (file, "\t%s", TREE_PUBLIC (decl) ? ".weak " : "");
1701
1702 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
1703 TREE_TYPE (decl), size, align);
1704 nvptx_assemble_decl_end ();
1705 }
1706
1707 /* Implement TARGET_ASM_DECLARE_CONSTANT_NAME. Begin the process of
1708 writing a constant variable EXP with NAME and SIZE and its
1709 initializer to FILE. */
1710
1711 static void
1712 nvptx_asm_declare_constant_name (FILE *file, const char *name,
1713 const_tree exp, HOST_WIDE_INT obj_size)
1714 {
1715 write_var_marker (file, true, false, name);
1716
1717 fprintf (file, "\t");
1718
1719 tree type = TREE_TYPE (exp);
1720 nvptx_assemble_decl_begin (file, name, ".const", type, obj_size,
1721 TYPE_ALIGN (type));
1722 }
1723
1724 /* Implement the ASM_DECLARE_OBJECT_NAME macro. Used to start writing
1725 a variable DECL with NAME to FILE. */
1726
1727 void
1728 nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
1729 {
1730 write_var_marker (file, true, TREE_PUBLIC (decl), name);
1731
1732 fprintf (file, "\t%s", (!TREE_PUBLIC (decl) ? ""
1733 : DECL_WEAK (decl) ? ".weak " : ".visible "));
1734
1735 tree type = TREE_TYPE (decl);
1736 HOST_WIDE_INT obj_size = tree_to_shwi (DECL_SIZE_UNIT (decl));
1737 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
1738 type, obj_size, DECL_ALIGN (decl));
1739 }
1740
1741 /* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing. */
1742
1743 static void
1744 nvptx_globalize_label (FILE *, const char *)
1745 {
1746 }
1747
1748 /* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL. Write an extern
1749 declaration only for variable DECL with NAME to FILE. */
1750
1751 static void
1752 nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
1753 {
1754 write_var_marker (file, false, TREE_PUBLIC (decl), name);
1755
1756 fprintf (file, "\t.extern ");
1757 tree size = DECL_SIZE_UNIT (decl);
1758 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
1759 TREE_TYPE (decl), size ? tree_to_shwi (size) : 0,
1760 DECL_ALIGN (decl));
1761 nvptx_assemble_decl_end ();
1762 }
1763
1764 /* Output a pattern for a move instruction. */
1765
1766 const char *
1767 nvptx_output_mov_insn (rtx dst, rtx src)
1768 {
1769 machine_mode dst_mode = GET_MODE (dst);
1770 machine_mode dst_inner = (GET_CODE (dst) == SUBREG
1771 ? GET_MODE (XEXP (dst, 0)) : dst_mode);
1772 machine_mode src_inner = (GET_CODE (src) == SUBREG
1773 ? GET_MODE (XEXP (src, 0)) : dst_mode);
1774
1775 if (src_inner == dst_inner)
1776 return "%.\tmov%t0\t%0, %1;";
1777
1778 if (CONSTANT_P (src))
1779 return (GET_MODE_CLASS (dst_inner) == MODE_INT
1780 && GET_MODE_CLASS (src_inner) != MODE_FLOAT
1781 ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");
1782
1783 if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner))
1784 return "%.\tmov.b%T0\t%0, %1;";
1785
1786 return "%.\tcvt%t0%t1\t%0, %1;";
1787 }
1788
1789 /* Output INSN, which is a call to CALLEE with result RESULT. For ptx, this
1790 involves writing .param declarations and in/out copies into them. For
1791 indirect calls, also write the .callprototype. */
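/* A sketch of the sequence for "r = foo (a)" with SImode values,
   assuming the return register is named %value and the argument and
   result live in pseudos %r23 and %r24:
	{
		.param.u32 %value_in;
		.param.u32 %out_arg1;
		st.param.u32 [%out_arg1], %r23;
		call (%value_in), foo, (%out_arg1);
		ld.param.u32	%r24, [%value_in];
	}  */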
1792
1793 const char *
1794 nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
1795 {
1796 char buf[16];
1797 static int labelno;
1798 bool needs_tgt = register_operand (callee, Pmode);
1799 rtx pat = PATTERN (insn);
1800 int arg_end = XVECLEN (pat, 0);
1801 tree decl = NULL_TREE;
1802
1803 fprintf (asm_out_file, "\t{\n");
1804 if (result != NULL)
1805 fprintf (asm_out_file, "\t\t.param%s %s_in;\n",
1806 nvptx_ptx_type_from_mode (GET_MODE (result), false),
1807 reg_names[NVPTX_RETURN_REGNUM]);
1808
1809 /* Ensure we have a ptx declaration in the output if necessary. */
1810 if (GET_CODE (callee) == SYMBOL_REF)
1811 {
1812 decl = SYMBOL_REF_DECL (callee);
1813 if (!decl
1814 || (DECL_EXTERNAL (decl) && !TYPE_ARG_TYPES (TREE_TYPE (decl))))
1815 nvptx_record_libfunc (callee, result, pat);
1816 else if (DECL_EXTERNAL (decl))
1817 nvptx_record_fndecl (decl);
1818 }
1819
1820 if (needs_tgt)
1821 {
1822 ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
1823 labelno++;
1824 ASM_OUTPUT_LABEL (asm_out_file, buf);
1825 std::stringstream s;
1826 write_fn_proto_from_insn (s, NULL, result, pat);
1827 fputs (s.str().c_str(), asm_out_file);
1828 }
1829
1830 for (int argno = 1; argno < arg_end; argno++)
1831 {
1832 rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
1833 machine_mode mode = GET_MODE (t);
1834
1835 /* Mode splitting has already been done. */
1836 fprintf (asm_out_file, "\t\t.param%s %%out_arg%d%s;\n",
1837 nvptx_ptx_type_from_mode (mode, false), argno,
1838 mode == QImode || mode == HImode ? "[1]" : "");
1839 fprintf (asm_out_file, "\t\tst.param%s [%%out_arg%d], %%r%d;\n",
1840 nvptx_ptx_type_from_mode (mode, false), argno,
1841 REGNO (t));
1842 }
1843
1844 fprintf (asm_out_file, "\t\tcall ");
1845 if (result != NULL_RTX)
1846 fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]);
1847
1848 if (decl)
1849 {
1850 const char *name = get_fnname_from_decl (decl);
1851 name = nvptx_name_replacement (name);
1852 assemble_name (asm_out_file, name);
1853 }
1854 else
1855 output_address (VOIDmode, callee);
1856
1857 const char *open = "(";
1858 for (int argno = 1; argno < arg_end; argno++)
1859 {
1860 fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
1861 open = "";
1862 }
1863 if (decl && DECL_STATIC_CHAIN (decl))
1864 {
1865 fprintf (asm_out_file, ", %s%s", open, reg_names [STATIC_CHAIN_REGNUM]);
1866 open = "";
1867 }
1868 if (!open[0])
1869 fprintf (asm_out_file, ")");
1870
1871 if (needs_tgt)
1872 {
1873 fprintf (asm_out_file, ", ");
1874 assemble_name (asm_out_file, buf);
1875 }
1876 fprintf (asm_out_file, ";\n");
1877
1878 if (find_reg_note (insn, REG_NORETURN, NULL))
1879 /* Noreturn functions confuse the PTX JIT, as it doesn't realize
1880 the flow control barrier they imply. It can seg fault if it
1881 encounters what looks like an unexitable loop. Emit a trailing
1882 trap, which it does grok. */
1883 fprintf (asm_out_file, "\t\ttrap; // (noreturn)\n");
1884
1885 if (result)
1886 {
1887 static char rval[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];
1888
1889 if (!rval[0])
1890 /* We must escape the '%' that starts RETURN_REGNUM. */
1891 sprintf (rval, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
1892 reg_names[NVPTX_RETURN_REGNUM]);
1893 return rval;
1894 }
1895
1896 return "}";
1897 }
1898
1899 /* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P. */
1900
1901 static bool
1902 nvptx_print_operand_punct_valid_p (unsigned char c)
1903 {
1904 return c == '.' || c == '#';
1905 }
1906
1907 static void nvptx_print_operand (FILE *, rtx, int);
1908
1909 /* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE. */
1910
1911 static void
1912 nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
1913 {
1914 rtx off;
1915 if (GET_CODE (x) == CONST)
1916 x = XEXP (x, 0);
1917 switch (GET_CODE (x))
1918 {
1919 case PLUS:
1920 off = XEXP (x, 1);
1921 output_address (VOIDmode, XEXP (x, 0));
1922 fprintf (file, "+");
1923 output_address (VOIDmode, off);
1924 break;
1925
1926 case SYMBOL_REF:
1927 case LABEL_REF:
1928 output_addr_const (file, x);
1929 break;
1930
1931 default:
1932 gcc_assert (GET_CODE (x) != MEM);
1933 nvptx_print_operand (file, x, 0);
1934 break;
1935 }
1936 }
1937
1938 /* Write assembly language output for the address ADDR to FILE. */
1939
1940 static void
1941 nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
1942 {
1943 nvptx_print_address_operand (file, addr, mode);
1944 }
1945
1946 /* Print an operand, X, to FILE, with an optional modifier in CODE.
1947
1948 Meaning of CODE:
1949 . -- print the predicate for the instruction or an empty string for an
1950 unconditional one.
1951 # -- print a rounding mode for the instruction
1952
1953 A -- print a data area for a MEM
1954 c -- print an opcode suffix for a comparison operator, including a type code
1955 D -- print a data area for a MEM operand
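j -- print the operand prefixed with "@", for use as a guard predicate
J -- print the operand prefixed with "@!", for a negated guard predicate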
1956 S -- print a shuffle kind specified by CONST_INT
1957 t -- print a type opcode suffix, promoting QImode to 32 bits
1958 T -- print a type size in bits
1959 u -- print a type opcode suffix without promotions. */
1960
1961 static void
1962 nvptx_print_operand (FILE *file, rtx x, int code)
1963 {
1964 if (code == '.')
1965 {
1966 x = current_insn_predicate;
1967 if (x)
1968 {
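/* The predicate is a comparison of a BI register against zero; an EQ
   comparison means the insn executes when the register is false, so
   negate it with '!'.  */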
1969 unsigned int regno = REGNO (XEXP (x, 0));
1970 fputs ("[", file);
1971 if (GET_CODE (x) == EQ)
1972 fputs ("!", file);
1973 fputs (reg_names [regno], file);
1974 fputs ("]", file);
1975 }
1976 return;
1977 }
1978 else if (code == '#')
1979 {
1980 fputs (".rn", file);
1981 return;
1982 }
1983
1984 enum rtx_code x_code = GET_CODE (x);
1985 machine_mode mode = GET_MODE (x);
1986
1987 switch (code)
1988 {
1989 case 'A':
1990 x = XEXP (x, 0);
1991 /* FALLTHROUGH. */
1992
1993 case 'D':
1994 if (GET_CODE (x) == CONST)
1995 x = XEXP (x, 0);
1996 if (GET_CODE (x) == PLUS)
1997 x = XEXP (x, 0);
1998
1999 if (GET_CODE (x) == SYMBOL_REF)
2000 fputs (section_for_sym (x), file);
2001 break;
2002
2003 case 't':
2004 case 'u':
2005 if (x_code == SUBREG)
2006 {
2007 mode = GET_MODE (SUBREG_REG (x));
2008 if (mode == TImode)
2009 mode = DImode;
2010 else if (COMPLEX_MODE_P (mode))
2011 mode = GET_MODE_INNER (mode);
2012 }
2013 fprintf (file, "%s", nvptx_ptx_type_from_mode (mode, code == 't'));
2014 break;
2015
2016 case 'S':
2017 {
2018 nvptx_shuffle_kind kind = (nvptx_shuffle_kind) UINTVAL (x);
2019 /* Same order as nvptx_shuffle_kind. */
2020 static const char *const kinds[] =
2021 {".up", ".down", ".bfly", ".idx"};
2022 fputs (kinds[kind], file);
2023 }
2024 break;
2025
2026 case 'T':
2027 fprintf (file, "%d", GET_MODE_BITSIZE (mode));
2028 break;
2029
2030 case 'j':
2031 fprintf (file, "@");
2032 goto common;
2033
2034 case 'J':
2035 fprintf (file, "@!");
2036 goto common;
2037
2038 case 'c':
2039 mode = GET_MODE (XEXP (x, 0));
2040 switch (x_code)
2041 {
2042 case EQ:
2043 fputs (".eq", file);
2044 break;
2045 case NE:
2046 if (FLOAT_MODE_P (mode))
2047 fputs (".neu", file);
2048 else
2049 fputs (".ne", file);
2050 break;
2051 case LE:
2052 fputs (".le", file);
2053 break;
2054 case GE:
2055 fputs (".ge", file);
2056 break;
2057 case LT:
2058 fputs (".lt", file);
2059 break;
2060 case GT:
2061 fputs (".gt", file);
2062 break;
2063 case LEU:
2064 fputs (".ls", file);
2065 break;
2066 case GEU:
2067 fputs (".hs", file);
2068 break;
2069 case LTU:
2070 fputs (".lo", file);
2071 break;
2072 case GTU:
2073 fputs (".hi", file);
2074 break;
2075 case LTGT:
2076 fputs (".ne", file);
2077 break;
2078 case UNEQ:
2079 fputs (".equ", file);
2080 break;
2081 case UNLE:
2082 fputs (".leu", file);
2083 break;
2084 case UNGE:
2085 fputs (".geu", file);
2086 break;
2087 case UNLT:
2088 fputs (".ltu", file);
2089 break;
2090 case UNGT:
2091 fputs (".gtu", file);
2092 break;
2093 case UNORDERED:
2094 fputs (".nan", file);
2095 break;
2096 case ORDERED:
2097 fputs (".num", file);
2098 break;
2099 default:
2100 gcc_unreachable ();
2101 }
2102 if (FLOAT_MODE_P (mode)
2103 || x_code == EQ || x_code == NE
2104 || x_code == GEU || x_code == GTU
2105 || x_code == LEU || x_code == LTU)
2106 fputs (nvptx_ptx_type_from_mode (mode, true), file);
2107 else
2108 fprintf (file, ".s%d", GET_MODE_BITSIZE (mode));
2109 break;
2110 default:
2111 common:
2112 switch (x_code)
2113 {
2114 case SUBREG:
2115 {
2116 rtx inner_x = SUBREG_REG (x);
2117 machine_mode inner_mode = GET_MODE (inner_x);
2118 machine_mode split = maybe_split_mode (inner_mode);
2119
2120 if (split != VOIDmode
2121 && (GET_MODE_SIZE (inner_mode) == GET_MODE_SIZE (mode)))
2122 output_reg (file, REGNO (inner_x), split);
2123 else
2124 output_reg (file, REGNO (inner_x), split, SUBREG_BYTE (x));
2125 }
2126 break;
2127
2128 case REG:
2129 output_reg (file, REGNO (x), maybe_split_mode (mode));
2130 break;
2131
2132 case MEM:
2133 fputc ('[', file);
2134 nvptx_print_address_operand (file, XEXP (x, 0), mode);
2135 fputc (']', file);
2136 break;
2137
2138 case CONST_INT:
2139 output_addr_const (file, x);
2140 break;
2141
2142 case CONST:
2143 case SYMBOL_REF:
2144 case LABEL_REF:
2145 /* We could use output_addr_const, but that can print things like
2146 "x-8", which breaks ptxas. Need to ensure it is output as
2147 "x+-8". */
2148 nvptx_print_address_operand (file, x, VOIDmode);
2149 break;
2150
2151 case CONST_DOUBLE:
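/* PTX writes floating-point constants as a hex image of the bits:
   "0f" plus 8 hex digits for SFmode, "0d" plus 16 hex digits for
   DFmode, as emitted below.  */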
2152 long vals[2];
2153 real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), mode);
2154 vals[0] &= 0xffffffff;
2155 vals[1] &= 0xffffffff;
2156 if (mode == SFmode)
2157 fprintf (file, "0f%08lx", vals[0]);
2158 else
2159 fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
2160 break;
2161
2162 default:
2163 output_addr_const (file, x);
2164 }
2165 }
2166 }
2167 \f
2168 /* Record replacement regs used to deal with subreg operands. */
2169 struct reg_replace
2170 {
2171 rtx replacement[MAX_RECOG_OPERANDS];
2172 machine_mode mode;
2173 int n_allocated;
2174 int n_in_use;
2175 };
2176
2177 /* Allocate or reuse a replacement in R and return the rtx. */
2178
2179 static rtx
2180 get_replacement (struct reg_replace *r)
2181 {
2182 if (r->n_allocated == r->n_in_use)
2183 r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
2184 return r->replacement[r->n_in_use++];
2185 }
2186
2187 /* Clean up subreg operands. In ptx assembly, everything is typed, and
2188 the presence of subregs would break the rules for most instructions.
2189 Replace them with a suitable new register of the right size, plus
2190 conversion copyin/copyout instructions. */
2191
2192 static void
2193 nvptx_reorg_subreg (void)
2194 {
2195 struct reg_replace qiregs, hiregs, siregs, diregs;
2196 rtx_insn *insn, *next;
2197
2198 qiregs.n_allocated = 0;
2199 hiregs.n_allocated = 0;
2200 siregs.n_allocated = 0;
2201 diregs.n_allocated = 0;
2202 qiregs.mode = QImode;
2203 hiregs.mode = HImode;
2204 siregs.mode = SImode;
2205 diregs.mode = DImode;
2206
2207 for (insn = get_insns (); insn; insn = next)
2208 {
2209 next = NEXT_INSN (insn);
2210 if (!NONDEBUG_INSN_P (insn)
2211 || asm_noperands (PATTERN (insn)) >= 0
2212 || GET_CODE (PATTERN (insn)) == USE
2213 || GET_CODE (PATTERN (insn)) == CLOBBER)
2214 continue;
2215
2216 qiregs.n_in_use = 0;
2217 hiregs.n_in_use = 0;
2218 siregs.n_in_use = 0;
2219 diregs.n_in_use = 0;
2220 extract_insn (insn);
2221 enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);
2222
2223 for (int i = 0; i < recog_data.n_operands; i++)
2224 {
2225 rtx op = recog_data.operand[i];
2226 if (GET_CODE (op) != SUBREG)
2227 continue;
2228
2229 rtx inner = SUBREG_REG (op);
2230
2231 machine_mode outer_mode = GET_MODE (op);
2232 machine_mode inner_mode = GET_MODE (inner);
2233 gcc_assert (s_ok);
2234 if (s_ok
2235 && (GET_MODE_PRECISION (inner_mode)
2236 >= GET_MODE_PRECISION (outer_mode)))
2237 continue;
2238 gcc_assert (SCALAR_INT_MODE_P (outer_mode));
2239 struct reg_replace *r = (outer_mode == QImode ? &qiregs
2240 : outer_mode == HImode ? &hiregs
2241 : outer_mode == SImode ? &siregs
2242 : &diregs);
2243 rtx new_reg = get_replacement (r);
2244
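/* The operand is read: copy the inner reg into the replacement
   before the insn, zero-extending or truncating as needed.  */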
2245 if (recog_data.operand_type[i] != OP_OUT)
2246 {
2247 enum rtx_code code;
2248 if (GET_MODE_PRECISION (inner_mode)
2249 < GET_MODE_PRECISION (outer_mode))
2250 code = ZERO_EXTEND;
2251 else
2252 code = TRUNCATE;
2253
2254 rtx pat = gen_rtx_SET (new_reg,
2255 gen_rtx_fmt_e (code, outer_mode, inner));
2256 emit_insn_before (pat, insn);
2257 }
2258
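/* The operand is written: copy the replacement back into the
   inner reg after the insn, truncating or zero-extending as needed.  */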
2259 if (recog_data.operand_type[i] != OP_IN)
2260 {
2261 enum rtx_code code;
2262 if (GET_MODE_PRECISION (inner_mode)
2263 < GET_MODE_PRECISION (outer_mode))
2264 code = TRUNCATE;
2265 else
2266 code = ZERO_EXTEND;
2267
2268 rtx pat = gen_rtx_SET (inner,
2269 gen_rtx_fmt_e (code, inner_mode, new_reg));
2270 emit_insn_after (pat, insn);
2271 }
2272 validate_change (insn, recog_data.operand_loc[i], new_reg, false);
2273 }
2274 }
2275 }
2276
2277 /* Loop structure of the function. The entire function is described as
2278 a NULL loop. */
2279
2280 struct parallel
2281 {
2282 /* Parent parallel. */
2283 parallel *parent;
2284
2285 /* Next sibling parallel. */
2286 parallel *next;
2287
2288 /* First child parallel. */
2289 parallel *inner;
2290
2291 /* Partitioning mask of the parallel. */
2292 unsigned mask;
2293
2294 /* Partitioning used within inner parallels. */
2295 unsigned inner_mask;
2296
2297 /* Location of parallel forked and join.  The forked block is the first
2298 block in the parallel and the join block is the first block after
2299 the partition.  */
2300 basic_block forked_block;
2301 basic_block join_block;
2302
2303 rtx_insn *forked_insn;
2304 rtx_insn *join_insn;
2305
2306 rtx_insn *fork_insn;
2307 rtx_insn *joining_insn;
2308
2309 /* Basic blocks in this parallel, but not in child parallels. The
2310 FORKED and JOINING blocks are in the partition. The FORK and JOIN
2311 blocks are not. */
2312 auto_vec<basic_block> blocks;
2313
2314 public:
2315 parallel (parallel *parent, unsigned mode);
2316 ~parallel ();
2317 };
2318
2319 /* Constructor links the new parallel into its parent's chain of
2320 children. */
2321
2322 parallel::parallel (parallel *parent_, unsigned mask_)
2323 :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
2324 {
2325 forked_block = join_block = 0;
2326 forked_insn = join_insn = 0;
2327 fork_insn = joining_insn = 0;
2328
2329 if (parent)
2330 {
2331 next = parent->inner;
2332 parent->inner = this;
2333 }
2334 }
2335
2336 parallel::~parallel ()
2337 {
2338 delete inner;
2339 delete next;
2340 }
2341
2342 /* Map of basic blocks to insns */
2343 typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;
2344
2345 /* A tuple of an insn of interest and the BB in which it resides. */
2346 typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
2347 typedef auto_vec<insn_bb_t> insn_bb_vec_t;
2348
2349 /* Split basic blocks such that each forked and join unspec is at
2350 the start of its basic block.  Thus afterwards each block will
2351 have a single partitioning mode.  We also do the same for return
2352 insns, as they are executed by every thread.  Populate MAP with
2353 head and tail blocks.  We also clear the BB visited flag, which is
2354 used when finding partitions.  */
2356
2357 static void
2358 nvptx_split_blocks (bb_insn_map_t *map)
2359 {
2360 insn_bb_vec_t worklist;
2361 basic_block block;
2362 rtx_insn *insn;
2363
2364 /* Locate all the reorg instructions of interest. */
2365 FOR_ALL_BB_FN (block, cfun)
2366 {
2367 bool seen_insn = false;
2368
2369 /* Clear visited flag, for use by parallel locator */
2370 block->flags &= ~BB_VISITED;
2371
2372 FOR_BB_INSNS (block, insn)
2373 {
2374 if (!INSN_P (insn))
2375 continue;
2376 switch (recog_memoized (insn))
2377 {
2378 default:
2379 seen_insn = true;
2380 continue;
2381 case CODE_FOR_nvptx_forked:
2382 case CODE_FOR_nvptx_join:
2383 break;
2384
2385 case CODE_FOR_return:
2386 /* We also need to split just before return insns, as
2387 that insn needs executing by all threads, but the
2388 block it is in probably does not. */
2389 break;
2390 }
2391
2392 if (seen_insn)
2393 /* We've found an instruction that must be at the start of
2394 a block, but isn't. Add it to the worklist. */
2395 worklist.safe_push (insn_bb_t (insn, block));
2396 else
2397 /* It was already the first instruction. Just add it to
2398 the map. */
2399 map->get_or_insert (block) = insn;
2400 seen_insn = true;
2401 }
2402 }
2403
2404 /* Split blocks on the worklist. */
2405 unsigned ix;
2406 insn_bb_t *elt;
2407 basic_block remap = 0;
2408 for (ix = 0; worklist.iterate (ix, &elt); ix++)
2409 {
2410 if (remap != elt->second)
2411 {
2412 block = elt->second;
2413 remap = block;
2414 }
2415
2416 /* Split block before insn.  The insn is in the new block.  */
2417 edge e = split_block (block, PREV_INSN (elt->first));
2418
2419 block = e->dest;
2420 map->get_or_insert (block) = elt->first;
2421 }
2422 }
2423
2424 /* BLOCK is a basic block containing a head or tail instruction.
2425 Locate the associated prehead or pretail instruction, which must be
2426 in the single predecessor block. */
2427
2428 static rtx_insn *
2429 nvptx_discover_pre (basic_block block, int expected)
2430 {
2431 gcc_assert (block->preds->length () == 1);
2432 basic_block pre_block = (*block->preds)[0]->src;
2433 rtx_insn *pre_insn;
2434
2435 for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
2436 pre_insn = PREV_INSN (pre_insn))
2437 gcc_assert (pre_insn != BB_HEAD (pre_block));
2438
2439 gcc_assert (recog_memoized (pre_insn) == expected);
2440 return pre_insn;
2441 }
2442
2443 /* Dump this parallel and all its inner parallels. */
2444
2445 static void
2446 nvptx_dump_pars (parallel *par, unsigned depth)
2447 {
2448 fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
2449 depth, par->mask,
2450 par->forked_block ? par->forked_block->index : -1,
2451 par->join_block ? par->join_block->index : -1);
2452
2453 fprintf (dump_file, " blocks:");
2454
2455 basic_block block;
2456 for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
2457 fprintf (dump_file, " %d", block->index);
2458 fprintf (dump_file, "\n");
2459 if (par->inner)
2460 nvptx_dump_pars (par->inner, depth + 1);
2461
2462 if (par->next)
2463 nvptx_dump_pars (par->next, depth);
2464 }
2465
2466 /* If BLOCK contains a fork/join marker, process it to create or
2467 terminate a loop structure. Add this block to the current loop,
2468 and then walk successor blocks. */
2469
2470 static parallel *
2471 nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
2472 {
2473 if (block->flags & BB_VISITED)
2474 return par;
2475 block->flags |= BB_VISITED;
2476
2477 if (rtx_insn **endp = map->get (block))
2478 {
2479 rtx_insn *end = *endp;
2480
2481 /* This is a block head or tail, or return instruction. */
2482 switch (recog_memoized (end))
2483 {
2484 case CODE_FOR_return:
2485 /* Return instructions are in their own block, and we
2486 don't need to do anything more. */
2487 return par;
2488
2489 case CODE_FOR_nvptx_forked:
2490 /* Loop head, create a new inner loop and add it into
2491 our parent's child list. */
2492 {
2493 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
2494
2495 gcc_assert (mask);
2496 par = new parallel (par, mask);
2497 par->forked_block = block;
2498 par->forked_insn = end;
2499 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
2500 && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
2501 par->fork_insn
2502 = nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
2503 }
2504 break;
2505
2506 case CODE_FOR_nvptx_join:
2507 /* A loop tail. Finish the current loop and return to
2508 parent. */
2509 {
2510 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
2511
2512 gcc_assert (par->mask == mask);
2513 par->join_block = block;
2514 par->join_insn = end;
2515 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
2516 && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
2517 par->joining_insn
2518 = nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
2519 par = par->parent;
2520 }
2521 break;
2522
2523 default:
2524 gcc_unreachable ();
2525 }
2526 }
2527
2528 if (par)
2529 /* Add this block onto the current loop's list of blocks. */
2530 par->blocks.safe_push (block);
2531 else
2532 /* This must be the entry block. Create a NULL parallel. */
2533 par = new parallel (0, 0);
2534
2535 /* Walk successor blocks. */
2536 edge e;
2537 edge_iterator ei;
2538
2539 FOR_EACH_EDGE (e, ei, block->succs)
2540 nvptx_find_par (map, par, e->dest);
2541
2542 return par;
2543 }
2544
2545 /* DFS walk the CFG looking for fork & join markers. Construct
2546 loop structures as we go. MAP is a mapping of basic blocks
2547 to head & tail markers, discovered when splitting blocks. This
2548 speeds up the discovery. We rely on the BB visited flag having
2549 been cleared when splitting blocks. */
2550
2551 static parallel *
2552 nvptx_discover_pars (bb_insn_map_t *map)
2553 {
2554 basic_block block;
2555
2556 /* Mark exit blocks as visited. */
2557 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
2558 block->flags |= BB_VISITED;
2559
2560 /* And entry block as not. */
2561 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
2562 block->flags &= ~BB_VISITED;
2563
2564 parallel *par = nvptx_find_par (map, 0, block);
2565
2566 if (dump_file)
2567 {
2568 fprintf (dump_file, "\nLoops\n");
2569 nvptx_dump_pars (par, 0);
2570 fprintf (dump_file, "\n");
2571 }
2572
2573 return par;
2574 }
2575
2576 /* Analyse a group of BBs within a partitioned region and create N
2577 Single-Entry-Single-Exit regions. Some of those regions will be
2578 trivial ones consisting of a single BB. The blocks of a
2579 partitioned region might form a set of disjoint graphs -- because
2580 the region encloses a differently partitioned sub-region.
2581
2582 We use the linear time algorithm described in 'Finding Regions Fast:
2583 Single Entry Single Exit and control Regions in Linear Time'
2584 Johnson, Pearson & Pingali. That algorithm deals with complete
2585 CFGs, where a back edge is inserted from END to START, and thus the
2586 problem becomes one of finding equivalent loops.
2587
2588 In this case we have a partial CFG. We complete it by redirecting
2589 any incoming edge to the graph to be from an arbitrary external BB,
2590 and similarly redirecting any outgoing edge to be to that BB.
2591 Thus we end up with a closed graph.
2592
2593 The algorithm works by building a spanning tree of an undirected
2594 graph and keeping track of back edges from nodes further from the
2595 root in the tree to nodes nearer to the root in the tree. In the
2596 description below, the root is up and the tree grows downwards.
2597
2598 We avoid having to deal with degenerate back-edges to the same
2599 block, by splitting each BB into 3 -- one for input edges, one for
2600 the node itself and one for the output edges. Such back edges are
2601 referred to as 'Brackets'. Cycle equivalent nodes will have the
2602 same set of brackets.
2603
2604 Determining bracket equivalency is done by maintaining a list of
2605 brackets in such a manner that the list length and final bracket
2606 uniquely identify the set.
2607
2608 We use coloring to mark all BBs with cycle equivalency with the
2609 same color. This is the output of the 'Finding Regions Fast'
2610 algorithm. Notice it doesn't actually find the set of nodes within
2611 a particular region, just unordered sets of nodes that are the
2612 entries and exits of SESE regions.
2613
2614 After determining cycle equivalency, we need to find the minimal
2615 set of SESE regions. Do this with a DFS coloring walk of the
2616 complete graph. We're either 'looking' or 'coloring'. When
2617 looking, and we're in the subgraph, we start coloring the color of
2618 the current node, and remember that node as the start of the
2619 current color's SESE region. Every time we go to a new node, we
2620 decrement the count of nodes with that color.  If it reaches zero,
2621 we remember that node as the end of the current color's SESE region
2622 and return to 'looking'. Otherwise we color the node the current
2623 color.
2624
2625 This way we end up coloring the inside of non-trivial SESE
2626 regions with the color of that region. */
2627
2628 /* A pair of BBs. We use this to represent SESE regions. */
2629 typedef std::pair<basic_block, basic_block> bb_pair_t;
2630 typedef auto_vec<bb_pair_t> bb_pair_vec_t;
2631
2632 /* A node in the undirected CFG. The discriminator SECOND indicates just
2633 above or just below the BB indicated by FIRST.  */
2634 typedef std::pair<basic_block, int> pseudo_node_t;
2635
2636 /* A bracket indicates an edge towards the root of the spanning tree of the
2637 undirected graph. Each bracket has a color, determined
2638 from the current set of brackets.  */
2639 struct bracket
2640 {
2641 pseudo_node_t back; /* Back target */
2642
2643 /* Current color and size of set. */
2644 unsigned color;
2645 unsigned size;
2646
2647 bracket (pseudo_node_t back_)
2648 : back (back_), color (~0u), size (~0u)
2649 {
2650 }
2651
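/* Return this bracket's color for a bracket list of LENGTH entries,
   allocating a fresh color (and a slot in COLOR_COUNTS) whenever the
   length changes -- the list length and final bracket uniquely
   identify the set.  */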
2652 unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
2653 {
2654 if (length != size)
2655 {
2656 size = length;
2657 color = color_counts.length ();
2658 color_counts.quick_push (0);
2659 }
2660 color_counts[color]++;
2661 return color;
2662 }
2663 };
2664
2665 typedef auto_vec<bracket> bracket_vec_t;
2666
2667 /* Basic block info for finding SESE regions. */
2668
2669 struct bb_sese
2670 {
2671 int node; /* Node number in spanning tree. */
2672 int parent; /* Parent node number. */
2673
2674 /* The algorithm splits each node A into Ai, A', Ao. The incoming
2675 edges arrive at pseudo-node Ai and the outgoing edges leave at
2676 pseudo-node Ao. We have to remember which way we arrived at a
2677 particular node when generating the spanning tree. dir > 0 means
2678 we arrived at Ai, dir < 0 means we arrived at Ao. */
2679 int dir;
2680
2681 /* Lowest numbered pseudo-node reached via a backedge from this
2682 node, or any descendant. */
2683 pseudo_node_t high;
2684
2685 int color; /* Cycle-equivalence color */
2686
2687 /* Stack of brackets for this node. */
2688 bracket_vec_t brackets;
2689
2690 bb_sese (unsigned node_, unsigned p, int dir_)
2691 :node (node_), parent (p), dir (dir_)
2692 {
2693 }
2694 ~bb_sese ();
2695
2696 /* Push a bracket ending at BACK. */
2697 void push (const pseudo_node_t &back)
2698 {
2699 if (dump_file)
2700 fprintf (dump_file, "Pushing backedge %d:%+d\n",
2701 back.first ? back.first->index : 0, back.second);
2702 brackets.safe_push (bracket (back));
2703 }
2704
2705 void append (bb_sese *child);
2706 void remove (const pseudo_node_t &);
2707
2708 /* Set node's color. */
2709 void set_color (auto_vec<unsigned> &color_counts)
2710 {
2711 color = brackets.last ().get_color (color_counts, brackets.length ());
2712 }
2713 };
2714
2715 bb_sese::~bb_sese ()
2716 {
2717 }
2718
2719 /* Destructively append CHILD's brackets. */
2720
2721 void
2722 bb_sese::append (bb_sese *child)
2723 {
2724 if (int len = child->brackets.length ())
2725 {
2726 int ix;
2727
2728 if (dump_file)
2729 {
2730 for (ix = 0; ix < len; ix++)
2731 {
2732 const pseudo_node_t &pseudo = child->brackets[ix].back;
2733 fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
2734 child->node, pseudo.first ? pseudo.first->index : 0,
2735 pseudo.second);
2736 }
2737 }
2738 if (!brackets.length ())
2739 std::swap (brackets, child->brackets);
2740 else
2741 {
2742 brackets.reserve (len);
2743 for (ix = 0; ix < len; ix++)
2744 brackets.quick_push (child->brackets[ix]);
2745 }
2746 }
2747 }
2748
2749 /* Remove brackets that terminate at PSEUDO. */
2750
2751 void
2752 bb_sese::remove (const pseudo_node_t &pseudo)
2753 {
2754 unsigned removed = 0;
2755 int len = brackets.length ();
2756
2757 for (int ix = 0; ix < len; ix++)
2758 {
2759 if (brackets[ix].back == pseudo)
2760 {
2761 if (dump_file)
2762 fprintf (dump_file, "Removing backedge %d:%+d\n",
2763 pseudo.first ? pseudo.first->index : 0, pseudo.second);
2764 removed++;
2765 }
2766 else if (removed)
2767 brackets[ix-removed] = brackets[ix];
2768 }
2769 while (removed--)
2770 brackets.pop ();
2771 }
2772
2773 /* Accessors for BB's aux pointer. */
2774 #define BB_SET_SESE(B, S) ((B)->aux = (S))
2775 #define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
2776
2777 /* DFS walk creating SESE data structures. Only cover nodes with
2778 BB_VISITED set. Append discovered blocks to LIST. We number in
2779 increments of 3 so that the above and below pseudo nodes can be
2780 implicitly numbered too. */
2781
2782 static int
2783 nvptx_sese_number (int n, int p, int dir, basic_block b,
2784 auto_vec<basic_block> *list)
2785 {
2786 if (BB_GET_SESE (b))
2787 return n;
2788
2789 if (dump_file)
2790 fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
2791 b->index, n, p, dir);
2792
2793 BB_SET_SESE (b, new bb_sese (n, p, dir));
2794 p = n;
2795
2796 n += 3;
2797 list->quick_push (b);
2798
2799 /* First walk the nodes on the 'other side' of this node, then walk
2800 the nodes on the same side. */
2801 for (unsigned ix = 2; ix; ix--)
2802 {
2803 vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
2804 size_t offset = (dir > 0 ? offsetof (edge_def, dest)
2805 : offsetof (edge_def, src));
2806 edge e;
2807 edge_iterator (ei);
2808
2809 FOR_EACH_EDGE (e, ei, edges)
2810 {
2811 basic_block target = *(basic_block *)((char *)e + offset);
2812
2813 if (target->flags & BB_VISITED)
2814 n = nvptx_sese_number (n, p, dir, target, list);
2815 }
2816 dir = -dir;
2817 }
2818 return n;
2819 }
2820
2821 /* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
2822 EDGES are the outgoing edges and OFFSET is the offset to the src
2823 or dst block on the edges. */
2824
2825 static void
2826 nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
2827 vec<edge, va_gc> *edges, size_t offset)
2828 {
2829 edge e;
2830 edge_iterator (ei);
2831 int hi_back = depth;
2832 pseudo_node_t node_back (0, depth);
2833 int hi_child = depth;
2834 pseudo_node_t node_child (0, depth);
2835 basic_block child = NULL;
2836 unsigned num_children = 0;
2837 int usd = -dir * sese->dir;
2838
2839 if (dump_file)
2840 fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
2841 me->index, sese->node, dir);
2842
2843 if (dir < 0)
2844 {
2845 /* This is the above pseudo-child. It has the BB itself as an
2846 additional child node. */
2847 node_child = sese->high;
2848 hi_child = node_child.second;
2849 if (node_child.first)
2850 hi_child += BB_GET_SESE (node_child.first)->node;
2851 num_children++;
2852 }
2853
2854 /* Examine each edge.
2855 - if it is a child (a) append its bracket list and (b) record
2856 whether it is the child with the highest reaching bracket.
2857 - if it is an edge to ancestor, record whether it's the highest
2858 reaching backlink. */
2859 FOR_EACH_EDGE (e, ei, edges)
2860 {
2861 basic_block target = *(basic_block *)((char *)e + offset);
2862
2863 if (bb_sese *t_sese = BB_GET_SESE (target))
2864 {
2865 if (t_sese->parent == sese->node && !(t_sese->dir + usd))
2866 {
2867 /* Child node. Append its bracket list. */
2868 num_children++;
2869 sese->append (t_sese);
2870
2871 /* Compare its hi value.  */
2872 int t_hi = t_sese->high.second;
2873
2874 if (basic_block child_hi_block = t_sese->high.first)
2875 t_hi += BB_GET_SESE (child_hi_block)->node;
2876
2877 if (hi_child > t_hi)
2878 {
2879 hi_child = t_hi;
2880 node_child = t_sese->high;
2881 child = target;
2882 }
2883 }
2884 else if (t_sese->node < sese->node + dir
2885 && !(dir < 0 && sese->parent == t_sese->node))
2886 {
2887 /* Non-parental ancestor node -- a backlink. */
2888 int d = usd * t_sese->dir;
2889 int back = t_sese->node + d;
2890
2891 if (hi_back > back)
2892 {
2893 hi_back = back;
2894 node_back = pseudo_node_t (target, d);
2895 }
2896 }
2897 }
2898 else
2899 { /* Fallen off graph, backlink to entry node. */
2900 hi_back = 0;
2901 node_back = pseudo_node_t (0, 0);
2902 }
2903 }
2904
2905 /* Remove any brackets that terminate at this pseudo node. */
2906 sese->remove (pseudo_node_t (me, dir));
2907
2908 /* Now push any backlinks from this pseudo node. */
2909 FOR_EACH_EDGE (e, ei, edges)
2910 {
2911 basic_block target = *(basic_block *)((char *)e + offset);
2912 if (bb_sese *t_sese = BB_GET_SESE (target))
2913 {
2914 if (t_sese->node < sese->node + dir
2915 && !(dir < 0 && sese->parent == t_sese->node))
2916 /* Non-parental ancestor node - backedge from me. */
2917 sese->push (pseudo_node_t (target, usd * t_sese->dir));
2918 }
2919 else
2920 {
2921 /* back edge to entry node */
2922 sese->push (pseudo_node_t (0, 0));
2923 }
2924 }
2925
2926 /* If this node leads directly or indirectly to a no-return region of
2927 the graph, then fake a backedge to entry node. */
2928 if (!sese->brackets.length () || !edges || !edges->length ())
2929 {
2930 hi_back = 0;
2931 node_back = pseudo_node_t (0, 0);
2932 sese->push (node_back);
2933 }
2934
2935 /* Record the highest reaching backedge from us or a descendant. */
2936 sese->high = hi_back < hi_child ? node_back : node_child;
2937
2938 if (num_children > 1)
2939 {
2940 /* There is more than one child -- this is a Y shaped piece of
2941 spanning tree. We have to insert a fake backedge from this
2942 node to the highest ancestor reached by not-the-highest
2943 reaching child. Note that there may be multiple children
2944 with backedges to the same highest node. That's ok and we
2945 insert the edge to that highest node. */
2946 hi_child = depth;
2947 if (dir < 0 && child)
2948 {
2949 node_child = sese->high;
2950 hi_child = node_child.second;
2951 if (node_child.first)
2952 hi_child += BB_GET_SESE (node_child.first)->node;
2953 }
2954
2955 FOR_EACH_EDGE (e, ei, edges)
2956 {
2957 basic_block target = *(basic_block *)((char *)e + offset);
2958
2959 if (target == child)
2960 /* Ignore the highest child. */
2961 continue;
2962
2963 bb_sese *t_sese = BB_GET_SESE (target);
2964 if (!t_sese)
2965 continue;
2966 if (t_sese->parent != sese->node)
2967 /* Not a child. */
2968 continue;
2969
2970 /* Compare its hi value. */
2971 int t_hi = t_sese->high.second;
2972
2973 if (basic_block child_hi_block = t_sese->high.first)
2974 t_hi += BB_GET_SESE (child_hi_block)->node;
2975
2976 if (hi_child > t_hi)
2977 {
2978 hi_child = t_hi;
2979 node_child = t_sese->high;
2980 }
2981 }
2982
2983 sese->push (node_child);
2984 }
2985 }
2986
2987
2988 /* DFS walk of BB graph. Color node BLOCK according to COLORING then
2989 proceed to successors. Set SESE entry and exit nodes of
2990 REGIONS. */
2991
2992 static void
2993 nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
2994 basic_block block, int coloring)
2995 {
2996 bb_sese *sese = BB_GET_SESE (block);
2997
2998 if (block->flags & BB_VISITED)
2999 {
3000 /* If we've already encountered this block, either we must not
3001 be coloring, or it must have been colored the current color. */
3002 gcc_assert (coloring < 0 || (sese && coloring == sese->color));
3003 return;
3004 }
3005
3006 block->flags |= BB_VISITED;
3007
3008 if (sese)
3009 {
3010 if (coloring < 0)
3011 {
3012 /* Start coloring a region. */
3013 regions[sese->color].first = block;
3014 coloring = sese->color;
3015 }
3016
3017 if (!--color_counts[sese->color] && sese->color == coloring)
3018 {
3019 /* Found final block of SESE region. */
3020 regions[sese->color].second = block;
3021 coloring = -1;
3022 }
3023 else
3024 /* Color the node, so we can assert on revisiting the node
3025 that the graph is indeed SESE. */
3026 sese->color = coloring;
3027 }
3028 else
3029 /* Fallen off the subgraph, we cannot be coloring. */
3030 gcc_assert (coloring < 0);
3031
3032 /* Walk each successor block. */
3033 if (block->succs && block->succs->length ())
3034 {
3035 edge e;
3036 edge_iterator ei;
3037
3038 FOR_EACH_EDGE (e, ei, block->succs)
3039 nvptx_sese_color (color_counts, regions, e->dest, coloring);
3040 }
3041 else
3042 gcc_assert (coloring < 0);
3043 }
3044
3045 /* Find minimal set of SESE regions covering BLOCKS. REGIONS might
3046 end up with NULL entries in it. */
3047
3048 static void
3049 nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
3050 {
3051 basic_block block;
3052 int ix;
3053
3054 /* First clear each BB of the whole function. */
3055 FOR_EACH_BB_FN (block, cfun)
3056 {
3057 block->flags &= ~BB_VISITED;
3058 BB_SET_SESE (block, 0);
3059 }
3060 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
3061 block->flags &= ~BB_VISITED;
3062 BB_SET_SESE (block, 0);
3063 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
3064 block->flags &= ~BB_VISITED;
3065 BB_SET_SESE (block, 0);
3066
3067 /* Mark blocks in the function that are in this graph. */
3068 for (ix = 0; blocks.iterate (ix, &block); ix++)
3069 block->flags |= BB_VISITED;
3070
3071 /* Counts of nodes assigned to each color. There cannot be more
3072 colors than blocks (and hopefully there will be fewer). */
3073 auto_vec<unsigned> color_counts;
3074 color_counts.reserve (blocks.length ());
3075
3076 /* Worklist of nodes in the spanning tree. Again, there cannot be
3077 more nodes in the tree than blocks (there will be fewer if the
3078 CFG of blocks is disjoint). */
3079 auto_vec<basic_block> spanlist;
3080 spanlist.reserve (blocks.length ());
3081
3082 /* Make sure every block has its cycle class determined. */
3083 for (ix = 0; blocks.iterate (ix, &block); ix++)
3084 {
3085 if (BB_GET_SESE (block))
3086 /* We already met this block in an earlier graph solve. */
3087 continue;
3088
3089 if (dump_file)
3090 fprintf (dump_file, "Searching graph starting at %d\n", block->index);
3091
3092 /* Number the nodes reachable from block in initial DFS order.  */
3093 int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);
3094
3095 /* Now walk in reverse DFS order to find cycle equivalents. */
3096 while (spanlist.length ())
3097 {
3098 block = spanlist.pop ();
3099 bb_sese *sese = BB_GET_SESE (block);
3100
3101 /* Do the pseudo node below. */
3102 nvptx_sese_pseudo (block, sese, depth, +1,
3103 sese->dir > 0 ? block->succs : block->preds,
3104 (sese->dir > 0 ? offsetof (edge_def, dest)
3105 : offsetof (edge_def, src)));
3106 sese->set_color (color_counts);
3107 /* Do the pseudo node above. */
3108 nvptx_sese_pseudo (block, sese, depth, -1,
3109 sese->dir < 0 ? block->succs : block->preds,
3110 (sese->dir < 0 ? offsetof (edge_def, dest)
3111 : offsetof (edge_def, src)));
3112 }
3113 if (dump_file)
3114 fprintf (dump_file, "\n");
3115 }
3116
3117 if (dump_file)
3118 {
3119 unsigned count;
3120 const char *comma = "";
3121
3122 fprintf (dump_file, "Found %d cycle equivalents\n",
3123 color_counts.length ());
3124 for (ix = 0; color_counts.iterate (ix, &count); ix++)
3125 {
3126 fprintf (dump_file, "%s%d[%d]={", comma, ix, count);
3127
3128 comma = "";
3129 for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
3130 if (BB_GET_SESE (block)->color == ix)
3131 {
3132 block->flags |= BB_VISITED;
3133 fprintf (dump_file, "%s%d", comma, block->index);
3134 comma=",";
3135 }
3136 fprintf (dump_file, "}");
3137 comma = ", ";
3138 }
3139 fprintf (dump_file, "\n");
3140 }
3141
3142 /* Now we've colored every block in the subgraph. We now need to
3143 determine the minimal set of SESE regions that cover that
3144 subgraph. Do this with a DFS walk of the complete function.
3145 During the walk we're either 'looking' or 'coloring'. When we
3146 reach the last node of a particular color, we stop coloring and
3147 return to looking. */
3148
3149 /* There cannot be more SESE regions than colors. */
3150 regions.reserve (color_counts.length ());
3151 for (ix = color_counts.length (); ix--;)
3152 regions.quick_push (bb_pair_t (0, 0));
3153
3154 for (ix = 0; blocks.iterate (ix, &block); ix++)
3155 block->flags &= ~BB_VISITED;
3156
3157 nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);
3158
3159 if (dump_file)
3160 {
3161 const char *comma = "";
3162 int len = regions.length ();
3163
3164 fprintf (dump_file, "SESE regions:");
3165 for (ix = 0; ix != len; ix++)
3166 {
3167 basic_block from = regions[ix].first;
3168 basic_block to = regions[ix].second;
3169
3170 if (from)
3171 {
3172 fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
3173 if (to != from)
3174 fprintf (dump_file, "->%d", to->index);
3175
3176 int color = BB_GET_SESE (from)->color;
3177
3178 /* Print the blocks within the region (excluding ends). */
3179 FOR_EACH_BB_FN (block, cfun)
3180 {
3181 bb_sese *sese = BB_GET_SESE (block);
3182
3183 if (sese && sese->color == color
3184 && block != from && block != to)
3185 fprintf (dump_file, ".%d", block->index);
3186 }
3187 fprintf (dump_file, "}");
3188 }
3189 comma = ",";
3190 }
3191 fprintf (dump_file, "\n\n");
3192 }
3193
3194 for (ix = 0; blocks.iterate (ix, &block); ix++)
3195 delete BB_GET_SESE (block);
3196 }
3197
3198 #undef BB_SET_SESE
3199 #undef BB_GET_SESE
3200
3201 /* Propagate live state at the start of a partitioned region. BLOCK
3202 provides the live register information, and might not contain
3203 INSN. Propagation is inserted just after INSN. RW indicates whether
3204 we are reading and/or writing state.  This separation is needed
3205 for worker-level propagation where we essentially do a spill &
3206 fill.  FN is the underlying worker function to generate the
3207 propagation instructions for a single register.  DATA is user
3208 data.
3209
3210 We propagate the live register set and the entire frame. We could
3211 do better by (a) propagating just the live set that is used within
3212 the partitioned regions and (b) only propagating stack entries that
3213 are used. The latter might be quite hard to determine. */
3214
3215 typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *);
3216
3217 static void
3218 nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw,
3219 propagator_fn fn, void *data)
3220 {
3221 bitmap live = DF_LIVE_IN (block);
3222 bitmap_iterator iterator;
3223 unsigned ix;
3224
3225 /* Copy the frame array. */
3226 HOST_WIDE_INT fs = get_frame_size ();
3227 if (fs)
3228 {
3229 rtx tmp = gen_reg_rtx (DImode);
3230 rtx idx = NULL_RTX;
3231 rtx ptr = gen_reg_rtx (Pmode);
3232 rtx pred = NULL_RTX;
3233 rtx_code_label *label = NULL;
3234
3235 gcc_assert (!(fs & (GET_MODE_SIZE (DImode) - 1)));
3236 fs /= GET_MODE_SIZE (DImode);
3237 /* Detect single iteration loop. */
3238 if (fs == 1)
3239 fs = 0;
3240
3241 start_sequence ();
3242 emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
3243 if (fs)
3244 {
3245 idx = gen_reg_rtx (SImode);
3246 pred = gen_reg_rtx (BImode);
3247 label = gen_label_rtx ();
3248
3249 emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
3250 /* Allow worker function to initialize anything needed. */
3251 rtx init = fn (tmp, PM_loop_begin, fs, data);
3252 if (init)
3253 emit_insn (init);
3254 emit_label (label);
3255 LABEL_NUSES (label)++;
3256 emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
3257 }
3258 if (rw & PM_read)
3259 emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
3260 emit_insn (fn (tmp, rw, fs, data));
3261 if (rw & PM_write)
3262 emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
3263 if (fs)
3264 {
3265 emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
3266 emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
3267 emit_insn (gen_br_true_uni (pred, label));
3268 rtx fini = fn (tmp, PM_loop_end, fs, data);
3269 if (fini)
3270 emit_insn (fini);
3271 emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
3272 }
3273 emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
3274 emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
3275 rtx cpy = get_insns ();
3276 end_sequence ();
3277 insn = emit_insn_after (cpy, insn);
3278 }
3279
3280 /* Copy live registers. */
3281 EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
3282 {
3283 rtx reg = regno_reg_rtx[ix];
3284
3285 if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
3286 {
3287 rtx bcast = fn (reg, rw, 0, data);
3288
3289 insn = emit_insn_after (bcast, insn);
3290 }
3291 }
3292 }
3293
3294 /* Worker for nvptx_vpropagate. */
3295
3296 static rtx
3297 vprop_gen (rtx reg, propagate_mask pm,
3298 unsigned ARG_UNUSED (count), void *ARG_UNUSED (data))
3299 {
3300 if (!(pm & PM_read_write))
3301 return 0;
3302
3303 return nvptx_gen_vcast (reg);
3304 }
3305
3306 /* Propagate state that is live at start of BLOCK across the vectors
3307 of a single warp. Propagation is inserted just after INSN. */
3308
3309 static void
3310 nvptx_vpropagate (basic_block block, rtx_insn *insn)
3311 {
3312 nvptx_propagate (block, insn, PM_read_write, vprop_gen, 0);
3313 }
3314
3315 /* Worker for nvptx_wpropagate. */
3316
3317 static rtx
3318 wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
3319 {
3320 wcast_data_t *data = (wcast_data_t *)data_;
3321
3322 if (pm & PM_loop_begin)
3323 {
3324 /* Starting a loop, initialize pointer. */
3325 unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
3326
3327 if (align > worker_bcast_align)
3328 worker_bcast_align = align;
3329 data->offset = (data->offset + align - 1) & ~(align - 1);
3330
3331 data->ptr = gen_reg_rtx (Pmode);
3332
3333 return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
3334 }
3335 else if (pm & PM_loop_end)
3336 {
3337 rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
3338 data->ptr = NULL_RTX;
3339 return clobber;
3340 }
3341 else
3342 return nvptx_gen_wcast (reg, pm, rep, data);
3343 }
3344
3345 /* Spill or fill live state that is live at start of BLOCK. PRE_P
3346 indicates if this is just before partitioned mode (do spill), or
3347 just after it starts (do fill). Sequence is inserted just after
3348 INSN. */
3349
3350 static void
3351 nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn)
3352 {
3353 wcast_data_t data;
3354
3355 data.base = gen_reg_rtx (Pmode);
3356 data.offset = 0;
3357 data.ptr = NULL_RTX;
3358
3359 nvptx_propagate (block, insn, pre_p ? PM_read : PM_write, wprop_gen, &data);
3360 if (data.offset)
3361 {
3362 /* Stuff was emitted, initialize the base pointer now. */
3363 rtx init = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, worker_bcast_sym),
3364 UNSPEC_TO_GENERIC);
3365 init = gen_rtx_SET (data.base, init);
3366 emit_insn_after (init, insn);
3367
3368 if (worker_bcast_size < data.offset)
3369 worker_bcast_size = data.offset;
3370 }
3371 }
3372
3373 /* Emit a worker-level synchronization barrier. We use different
3374 markers for before and after synchronizations. */
3375
3376 static rtx
3377 nvptx_wsync (bool after)
3378 {
3379 return gen_nvptx_barsync (GEN_INT (after));
3380 }
3381
3382 /* Single neutering according to MASK. FROM is the incoming block and
3383 TO is the outgoing block. These may be the same block. Insert at
3384 start of FROM:
3385
3386 if (tid.<axis>) goto end.
3387
3388 and insert before ending branch of TO (if there is such an insn):
3389
3390 end:
3391 <possibly-broadcast-cond>
3392 <branch>
3393
3394 We currently only use different FROM and TO when skipping an entire
3395 loop. We could do more if we detected superblocks. */
3396
3397 static void
3398 nvptx_single (unsigned mask, basic_block from, basic_block to)
3399 {
3400 rtx_insn *head = BB_HEAD (from);
3401 rtx_insn *tail = BB_END (to);
3402 unsigned skip_mask = mask;
3403
3404 /* Find the first insn of the FROM block.  */
3405 while (head != BB_END (from) && !INSN_P (head))
3406 head = NEXT_INSN (head);
3407
3408 /* Find the last insn of the TO block.  */
3409 rtx_insn *limit = from == to ? head : BB_HEAD (to);
3410 while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
3411 tail = PREV_INSN (tail);
3412
3413 /* Detect if tail is a branch. */
3414 rtx tail_branch = NULL_RTX;
3415 rtx cond_branch = NULL_RTX;
3416 if (tail && INSN_P (tail))
3417 {
3418 tail_branch = PATTERN (tail);
3419 if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
3420 tail_branch = NULL_RTX;
3421 else
3422 {
3423 cond_branch = SET_SRC (tail_branch);
3424 if (GET_CODE (cond_branch) != IF_THEN_ELSE)
3425 cond_branch = NULL_RTX;
3426 }
3427 }
3428
3429 if (tail == head)
3430 {
3431 /* If this is empty, do nothing. */
3432 if (!head || !INSN_P (head))
3433 return;
3434
3435 /* If this is a dummy insn, do nothing. */
3436 switch (recog_memoized (head))
3437 {
3438 default:
3439 break;
3440 case CODE_FOR_nvptx_fork:
3441 case CODE_FOR_nvptx_forked:
3442 case CODE_FOR_nvptx_joining:
3443 case CODE_FOR_nvptx_join:
3444 return;
3445 }
3446
3447 if (cond_branch)
3448 {
3449 /* If we're only doing vector single, there's no need to
3450 emit skip code because we'll not insert anything. */
3451 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
3452 skip_mask = 0;
3453 }
3454 else if (tail_branch)
3455 /* Block with only unconditional branch. Nothing to do. */
3456 return;
3457 }
3458
3459 /* Insert the vector test inside the worker test. */
3460 unsigned mode;
3461 rtx_insn *before = tail;
3462 for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
3463 if (GOMP_DIM_MASK (mode) & skip_mask)
3464 {
3465 rtx_code_label *label = gen_label_rtx ();
3466 rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
3467
3468 if (!pred)
3469 {
3470 pred = gen_reg_rtx (BImode);
3471 cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
3472 }
3473
3474 rtx br;
3475 if (mode == GOMP_DIM_VECTOR)
3476 br = gen_br_true (pred, label);
3477 else
3478 br = gen_br_true_uni (pred, label);
3479 emit_insn_before (br, head);
3480
3481 LABEL_NUSES (label)++;
3482 if (tail_branch)
3483 before = emit_label_before (label, before);
3484 else
3485 emit_label_after (label, tail);
3486 }
3487
3488 /* Now deal with propagating the branch condition. */
3489 if (cond_branch)
3490 {
3491 rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
3492
3493 if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask)
3494 {
3495 /* Vector mode only, do a shuffle. */
3496 emit_insn_before (nvptx_gen_vcast (pvar), tail);
3497 }
3498 else
3499 {
3500 /* Includes worker mode, do spill & fill. By construction
3501 we should never have worker mode only. */
3502 wcast_data_t data;
3503
3504 data.base = worker_bcast_sym;
3505 data.ptr = 0;
3506
3507 if (worker_bcast_size < GET_MODE_SIZE (SImode))
3508 worker_bcast_size = GET_MODE_SIZE (SImode);
3509
3510 data.offset = 0;
3511 emit_insn_before (nvptx_gen_wcast (pvar, PM_read, 0, &data),
3512 before);
3513 /* Barrier so other workers can see the write. */
3514 emit_insn_before (nvptx_wsync (false), tail);
3515 data.offset = 0;
3516 emit_insn_before (nvptx_gen_wcast (pvar, PM_write, 0, &data), tail);
3517 /* This barrier is needed to avoid worker zero clobbering
3518 the broadcast buffer before all the other workers have
3519 had a chance to read this instance of it. */
3520 emit_insn_before (nvptx_wsync (true), tail);
3521 }
3522
3523 extract_insn (tail);
3524 rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
3525 UNSPEC_BR_UNIFIED);
3526 validate_change (tail, recog_data.operand_loc[0], unsp, false);
3527 }
3528 }
3529
3530 /* PAR is a parallel that is being skipped in its entirety according to
3531 MASK. Treat this as skipping a superblock starting at forked
3532 and ending at joining. */
3533
3534 static void
3535 nvptx_skip_par (unsigned mask, parallel *par)
3536 {
3537 basic_block tail = par->join_block;
3538 gcc_assert (tail->preds->length () == 1);
3539
3540 basic_block pre_tail = (*tail->preds)[0]->src;
3541 gcc_assert (pre_tail->succs->length () == 1);
3542
3543 nvptx_single (mask, par->forked_block, pre_tail);
3544 }
3545
3546 /* If PAR has a single inner parallel and PAR itself only contains
3547 empty entry and exit blocks, swallow the inner PAR. */
3548
3549 static void
3550 nvptx_optimize_inner (parallel *par)
3551 {
3552 parallel *inner = par->inner;
3553
3554 /* We mustn't be the outer dummy par. */
3555 if (!par->mask)
3556 return;
3557
3558 /* We must have a single inner par. */
3559 if (!inner || inner->next)
3560 return;
3561
3562 /* We must only contain 2 blocks ourselves -- the head and tail of
3563 the inner par. */
3564 if (par->blocks.length () != 2)
3565 return;
3566
3567 /* The two partitionings must be disjoint.  As we only have vector and
3568 worker partitioning, this is sufficient to guarantee the pars
3569 have adjacent partitioning. */
3570 if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
3571 /* This indicates malformed code generation. */
3572 return;
3573
3574 /* The outer forked insn should be immediately followed by the inner
3575 fork insn. */
3576 rtx_insn *forked = par->forked_insn;
3577 rtx_insn *fork = BB_END (par->forked_block);
3578
3579 if (NEXT_INSN (forked) != fork)
3580 return;
3581 gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);
3582
3583 /* The outer joining insn must immediately follow the inner join
3584 insn. */
3585 rtx_insn *joining = par->joining_insn;
3586 rtx_insn *join = inner->join_insn;
3587 if (NEXT_INSN (join) != joining)
3588 return;
3589
3590 /* Preconditions met. Swallow the inner par. */
3591 if (dump_file)
3592 fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
3593 inner->mask, inner->forked_block->index,
3594 inner->join_block->index,
3595 par->mask, par->forked_block->index, par->join_block->index);
3596
3597 par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);
3598
3599 par->blocks.reserve (inner->blocks.length ());
3600 while (inner->blocks.length ())
3601 par->blocks.quick_push (inner->blocks.pop ());
3602
3603 par->inner = inner->inner;
3604 inner->inner = NULL;
3605
3606 delete inner;
3607 }
3608
3609 /* Process the parallel PAR and all its contained
3610 parallels. We do everything but the neutering. Return mask of
3611 partitioned modes used within this parallel. */
3612
3613 static unsigned
3614 nvptx_process_pars (parallel *par)
3615 {
3616 if (nvptx_optimize)
3617 nvptx_optimize_inner (par);
3618
3619 unsigned inner_mask = par->mask;
3620
3621 /* Do the inner parallels first. */
3622 if (par->inner)
3623 {
3624 par->inner_mask = nvptx_process_pars (par->inner);
3625 inner_mask |= par->inner_mask;
3626 }
3627
3628 if (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
3629 /* No propagation needed for a call. */;
3630 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
3631 {
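/* Fill live state just after the forked insn and spill it just
   after the fork insn, via the worker broadcast buffer.  */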
3632 nvptx_wpropagate (false, par->forked_block, par->forked_insn);
3633 nvptx_wpropagate (true, par->forked_block, par->fork_insn);
3634 /* Insert begin and end synchronizations. */
3635 emit_insn_after (nvptx_wsync (false), par->forked_insn);
3636 emit_insn_before (nvptx_wsync (true), par->joining_insn);
3637 }
3638 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
3639 nvptx_vpropagate (par->forked_block, par->forked_insn);
3640
3641 /* Now do siblings. */
3642 if (par->next)
3643 inner_mask |= nvptx_process_pars (par->next);
3644 return inner_mask;
3645 }
3646
3647 /* Neuter the parallel described by PAR. We recurse in depth-first
3648 order. MODES are the partitioning of the execution and OUTER is
3649 the partitioning of the parallels we are contained in. */
3650
3651 static void
3652 nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
3653 {
3654 unsigned me = (par->mask
3655 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
3656 | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
3657 unsigned skip_mask = 0, neuter_mask = 0;
3658
3659 if (par->inner)
3660 nvptx_neuter_pars (par->inner, modes, outer | me);
3661
3662 for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
3663 {
3664 if ((outer | me) & GOMP_DIM_MASK (mode))
3665 {} /* Mode is partitioned: no neutering. */
3666 else if (!(modes & GOMP_DIM_MASK (mode)))
3667 {} /* Mode is not used: nothing to do. */
3668 else if (par->inner_mask & GOMP_DIM_MASK (mode)
3669 || !par->forked_insn)
3670 /* Partitioned in inner parallels, or we're not partitioned
3671 at all: neuter individual blocks.  */
3672 neuter_mask |= GOMP_DIM_MASK (mode);
3673 else if (!par->parent || !par->parent->forked_insn
3674 || par->parent->inner_mask & GOMP_DIM_MASK (mode))
3675 /* Parent isn't a parallel, or it already contains this
3676 partitioning: skip the parallel at this level.  */
3677 skip_mask |= GOMP_DIM_MASK (mode);
3678 else
3679 {} /* Parent will skip this parallel itself. */
3680 }
3681
3682 if (neuter_mask)
3683 {
3684 int ix, len;
3685
3686 if (nvptx_optimize)
3687 {
3688 /* Neuter whole SESE regions. */
3689 bb_pair_vec_t regions;
3690
3691 nvptx_find_sese (par->blocks, regions);
3692 len = regions.length ();
3693 for (ix = 0; ix != len; ix++)
3694 {
3695 basic_block from = regions[ix].first;
3696 basic_block to = regions[ix].second;
3697
3698 if (from)
3699 nvptx_single (neuter_mask, from, to);
3700 else
3701 gcc_assert (!to);
3702 }
3703 }
3704 else
3705 {
3706 /* Neuter each BB individually. */
3707 len = par->blocks.length ();
3708 for (ix = 0; ix != len; ix++)
3709 {
3710 basic_block block = par->blocks[ix];
3711
3712 nvptx_single (neuter_mask, block, block);
3713 }
3714 }
3715 }
3716
3717 if (skip_mask)
3718 nvptx_skip_par (skip_mask, par);
3719
3720 if (par->next)
3721 nvptx_neuter_pars (par->next, modes, outer);
3722 }
3723
3724 /* PTX-specific reorganization
3725 - Split blocks at fork and join instructions
3726 - Compute live registers
3727 - Mark now-unused registers, so function begin doesn't declare
3728 unused registers.
3729 - Insert state propagation when entering partitioned mode
3730 - Insert neutering instructions when in single mode
3731 - Replace subregs with suitable sequences.
3732 */
3733
3734 static void
3735 nvptx_reorg (void)
3736 {
3737 /* We are freeing block_for_insn in the toplev to keep compatibility
3738 with old MDEP_REORGS that are not CFG based. Recompute it now. */
3739 compute_bb_for_insn ();
3740
3741 thread_prologue_and_epilogue_insns ();
3742
3743 /* Split blocks and record interesting unspecs. */
3744 bb_insn_map_t bb_insn_map;
3745
3746 nvptx_split_blocks (&bb_insn_map);
3747
3748 /* Compute live regs */
3749 df_clear_flags (DF_LR_RUN_DCE);
3750 df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
3751 df_live_add_problem ();
3752 df_live_set_all_dirty ();
3753 df_analyze ();
3754 regstat_init_n_sets_and_refs ();
3755
3756 if (dump_file)
3757 df_dump (dump_file);
3758
3759 /* Mark unused regs as unused. */
3760 int max_regs = max_reg_num ();
3761 for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
3762 if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
3763 regno_reg_rtx[i] = const0_rtx;
3764
3765 /* Determine launch dimensions of the function. If it is not an
3766 offloaded function (i.e. this is a regular compiler), the
3767 function has no neutering. */
3768 tree attr = get_oacc_fn_attrib (current_function_decl);
3769 if (attr)
3770 {
3771 /* If we determined this mask before RTL expansion, we could
3772 elide emission of some levels of forks and joins. */
3773 unsigned mask = 0;
3774 tree dims = TREE_VALUE (attr);
3775 unsigned ix;
3776
3777 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
3778 {
3779 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
3780 tree allowed = TREE_PURPOSE (dims);
3781
3782 if (size != 1 && !(allowed && integer_zerop (allowed)))
3783 mask |= GOMP_DIM_MASK (ix);
3784 }
3785 /* If there is worker neutering, there must be vector
3786 neutering. Otherwise the hardware will fail. */
3787 gcc_assert (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
3788 || (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
3789
3790 /* Discover & process partitioned regions. */
3791 parallel *pars = nvptx_discover_pars (&bb_insn_map);
3792 nvptx_process_pars (pars);
3793 nvptx_neuter_pars (pars, mask, 0);
3794 delete pars;
3795 }
3796
3797 /* Replace subregs. */
3798 nvptx_reorg_subreg ();
3799
3800 regstat_free_n_sets_and_refs ();
3801
3802 df_finish_pass (true);
3803 }
3804 \f
3805 /* Handle a "kernel" attribute; arguments as in
3806 struct attribute_spec.handler. */
3807
3808 static tree
3809 nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
3810 int ARG_UNUSED (flags), bool *no_add_attrs)
3811 {
3812 tree decl = *node;
3813
3814 if (TREE_CODE (decl) != FUNCTION_DECL)
3815 {
3816 error ("%qE attribute only applies to functions", name);
3817 *no_add_attrs = true;
3818 }
3819 else if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl))))
3820 {
3821 error ("%qE attribute requires a void return type", name);
3822 *no_add_attrs = true;
3823 }
3824
3825 return NULL_TREE;
3826 }
3827
3828 /* Table of valid machine attributes. */
3829 static const struct attribute_spec nvptx_attribute_table[] =
3830 {
3831 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
3832 affects_type_identity } */
3833 { "kernel", 0, 0, true, false, false, nvptx_handle_kernel_attribute, false },
3834 { NULL, 0, 0, false, false, false, NULL, false }
3835 };
3836 \f
3837 /* Limit vector alignments to BIGGEST_ALIGNMENT. */
3838
3839 static HOST_WIDE_INT
3840 nvptx_vector_alignment (const_tree type)
3841 {
3842 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
3843
3844 return MIN (align, BIGGEST_ALIGNMENT);
3845 }
3846
3847 /* Indicate that INSN cannot be duplicated. */
3848
3849 static bool
3850 nvptx_cannot_copy_insn_p (rtx_insn *insn)
3851 {
3852 switch (recog_memoized (insn))
3853 {
3854 case CODE_FOR_nvptx_shufflesi:
3855 case CODE_FOR_nvptx_shufflesf:
3856 case CODE_FOR_nvptx_barsync:
3857 case CODE_FOR_nvptx_fork:
3858 case CODE_FOR_nvptx_forked:
3859 case CODE_FOR_nvptx_joining:
3860 case CODE_FOR_nvptx_join:
3861 return true;
3862 default:
3863 return false;
3864 }
3865 }
3866
3867 /* Section anchors do not work. Initialization for flag_section_anchors
3868 probes the existence of the anchoring target hooks and prevents
3869 anchoring if they don't exist. However, this compiler may be used
3870 together with a host-side compiler that does support anchoring, and
3871 hence see the anchor flag set (as it's not recalculated). So provide
3872 an implementation denying anchoring. */
3873
3874 static bool
3875 nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a))
3876 {
3877 return false;
3878 }
3879 \f
3880 /* Record a symbol for mkoffload to enter into the mapping table. */
3881
3882 static void
3883 nvptx_record_offload_symbol (tree decl)
3884 {
3885 switch (TREE_CODE (decl))
3886 {
3887 case VAR_DECL:
3888 fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n",
3889 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
3890 break;
3891
3892 case FUNCTION_DECL:
3893 {
3894 tree attr = get_oacc_fn_attrib (decl);
3895 tree dims = TREE_VALUE (attr);
3896 unsigned ix;
3897
3898 fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
3899 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
3900
3901 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
3902 {
3903 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
3904
3905 gcc_assert (!TREE_PURPOSE (dims));
3906 fprintf (asm_out_file, ", %#x", size);
3907 }
3908
3909 fprintf (asm_out_file, "\n");
3910 }
3911 break;
3912
3913 default:
3914 gcc_unreachable ();
3915 }
3916 }
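/* For illustration only (names and sizes are hypothetical): for a
   mapped variable and an offloaded function with a (1, 1, 32)
   gang/worker/vector geometry, the lines emitted above would look
   roughly like:

     //:VAR_MAP "some_mapped_variable"
     //:FUNC_MAP "some_offloaded_function", 0x1, 0x1, 0x20

   mkoffload scans the assembler output for these comment lines to
   build its mapping table.  */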
3917
3918 /* Implement TARGET_ASM_FILE_START. Write the kinds of things ptxas expects
3919 at the start of a file. */
3920
3921 static void
3922 nvptx_file_start (void)
3923 {
3924 fputs ("// BEGIN PREAMBLE\n", asm_out_file);
3925 fputs ("\t.version\t3.1\n", asm_out_file);
3926 fputs ("\t.target\tsm_30\n", asm_out_file);
3927 fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
3928 fputs ("// END PREAMBLE\n", asm_out_file);
3929 }
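/* For example, assuming a 64-bit configuration (GET_MODE_BITSIZE
   (Pmode) == 64), the preamble written above comes out roughly as:

     // BEGIN PREAMBLE
     .version 3.1
     .target sm_30
     .address_size 64
     // END PREAMBLE
 */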
3930
3931 /* Emit a declaration for a worker-level buffer in .shared memory. */
3932
3933 static void
3934 write_worker_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
3935 {
3936 const char *name = XSTR (sym, 0);
3937
3938 write_var_marker (file, true, false, name);
3939 fprintf (file, ".shared .align %d .u8 %s[%d];\n",
3940 align, name, size);
3941 }
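/* As an illustration (alignment, size and symbol name are made up),
   a broadcast buffer recorded with 8-byte alignment and 128 bytes of
   storage would be emitted as:

     .shared .align 8 .u8 __worker_bcast[128];
 */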
3942
3943 /* Write out the function declarations we've collected and declare storage
3944 for the worker broadcast and reduction buffers. */
3945
3946 static void
3947 nvptx_file_end (void)
3948 {
3949 hash_table<tree_hasher>::iterator iter;
3950 tree decl;
3951 FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
3952 nvptx_record_fndecl (decl);
3953 fputs (func_decls.str().c_str(), asm_out_file);
3954
3955 if (worker_bcast_size)
3956 write_worker_buffer (asm_out_file, worker_bcast_sym,
3957 worker_bcast_align, worker_bcast_size);
3958
3959 if (worker_red_size)
3960 write_worker_buffer (asm_out_file, worker_red_sym,
3961 worker_red_align, worker_red_size);
3962 }
3963
3964 /* Expander for the shuffle builtins. */
3965
3966 static rtx
3967 nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
3968 {
3969 if (ignore)
3970 return target;
3971
3972 rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
3973 NULL_RTX, mode, EXPAND_NORMAL);
3974 if (!REG_P (src))
3975 src = copy_to_mode_reg (mode, src);
3976
3977 rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
3978 NULL_RTX, SImode, EXPAND_NORMAL);
3979 rtx op = expand_expr (CALL_EXPR_ARG (exp, 2),
3980 NULL_RTX, SImode, EXPAND_NORMAL);
3981
3982 if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
3983 idx = copy_to_mode_reg (SImode, idx);
3984
3985 rtx pat = nvptx_gen_shuffle (target, src, idx,
3986 (nvptx_shuffle_kind) INTVAL (op));
3987 if (pat)
3988 emit_insn (pat);
3989
3990 return target;
3991 }
3992
3993 /* Worker reduction address expander. */
3994
3995 static rtx
3996 nvptx_expand_worker_addr (tree exp, rtx target,
3997 machine_mode ARG_UNUSED (mode), int ignore)
3998 {
3999 if (ignore)
4000 return target;
4001
4002 unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
4003 if (align > worker_red_align)
4004 worker_red_align = align;
4005
4006 unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
4007 unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
4008 if (size + offset > worker_red_size)
4009 worker_red_size = size + offset;
4010
4011 rtx addr = worker_red_sym;
4012 if (offset)
4013 {
4014 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset));
4015 addr = gen_rtx_CONST (Pmode, addr);
4016 }
4017
4018 emit_move_insn (target, addr);
4019
4020 return target;
4021 }
4022
4023 /* Expand the CMP_SWAP PTX builtins. We have our own versions that do
4024 not require taking the address of any object, other than the memory
4025 cell being operated on. */
4026
4027 static rtx
4028 nvptx_expand_cmp_swap (tree exp, rtx target,
4029 machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
4030 {
4031 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
4032
4033 if (!target)
4034 target = gen_reg_rtx (mode);
4035
4036 rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
4037 NULL_RTX, Pmode, EXPAND_NORMAL);
4038 rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
4039 NULL_RTX, mode, EXPAND_NORMAL);
4040 rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
4041 NULL_RTX, mode, EXPAND_NORMAL);
4042 rtx pat;
4043
4044 mem = gen_rtx_MEM (mode, mem);
4045 if (!REG_P (cmp))
4046 cmp = copy_to_mode_reg (mode, cmp);
4047 if (!REG_P (src))
4048 src = copy_to_mode_reg (mode, src);
4049
4050 if (mode == SImode)
4051 pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
4052 else
4053 pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);
4054
4055 emit_insn (pat);
4056
4057 return target;
4058 }
4059
4060
4061 /* Codes for all the NVPTX builtins. */
4062 enum nvptx_builtins
4063 {
4064 NVPTX_BUILTIN_SHUFFLE,
4065 NVPTX_BUILTIN_SHUFFLELL,
4066 NVPTX_BUILTIN_WORKER_ADDR,
4067 NVPTX_BUILTIN_CMP_SWAP,
4068 NVPTX_BUILTIN_CMP_SWAPLL,
4069 NVPTX_BUILTIN_MAX
4070 };
4071
4072 static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];
4073
4074 /* Return the NVPTX builtin for CODE. */
4075
4076 static tree
4077 nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
4078 {
4079 if (code >= NVPTX_BUILTIN_MAX)
4080 return error_mark_node;
4081
4082 return nvptx_builtin_decls[code];
4083 }
4084
4085 /* Set up all builtin functions for this target. */
4086
4087 static void
4088 nvptx_init_builtins (void)
4089 {
4090 #define DEF(ID, NAME, T) \
4091 (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID] \
4092 = add_builtin_function ("__builtin_nvptx_" NAME, \
4093 build_function_type_list T, \
4094 NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
4095 #define ST sizetype
4096 #define UINT unsigned_type_node
4097 #define LLUINT long_long_unsigned_type_node
4098 #define PTRVOID ptr_type_node
4099
4100 DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
4101 DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
4102 DEF (WORKER_ADDR, "worker_addr",
4103 (PTRVOID, ST, UINT, UINT, NULL_TREE));
4104 DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
4105 DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));
4106
4107 #undef DEF
4108 #undef ST
4109 #undef UINT
4110 #undef LLUINT
4111 #undef PTRVOID
4112 }
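/* A minimal sketch of the prototypes the DEFs above create, spelled
   as they would appear to user code (C types used for clarity;
   sizetype is written as size_t):

     unsigned __builtin_nvptx_shuffle (unsigned, unsigned, unsigned);
     unsigned long long __builtin_nvptx_shufflell (unsigned long long,
                                                   unsigned, unsigned);
     void *__builtin_nvptx_worker_addr (size_t, unsigned, unsigned);
     unsigned __builtin_nvptx_cmp_swap (void *, unsigned, unsigned);
     unsigned long long __builtin_nvptx_cmp_swapll (void *,
                                                    unsigned long long,
                                                    unsigned long long);
 */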
4113
4114 /* Expand an expression EXP that calls a built-in function,
4115 with result going to TARGET if that's convenient
4116 (and in mode MODE if that's convenient).
4117 SUBTARGET may be used as the target for computing one of EXP's operands.
4118 IGNORE is nonzero if the value is to be ignored. */
4119
4120 static rtx
4121 nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
4122 machine_mode mode, int ignore)
4123 {
4124 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
4125 switch (DECL_FUNCTION_CODE (fndecl))
4126 {
4127 case NVPTX_BUILTIN_SHUFFLE:
4128 case NVPTX_BUILTIN_SHUFFLELL:
4129 return nvptx_expand_shuffle (exp, target, mode, ignore);
4130
4131 case NVPTX_BUILTIN_WORKER_ADDR:
4132 return nvptx_expand_worker_addr (exp, target, mode, ignore);
4133
4134 case NVPTX_BUILTIN_CMP_SWAP:
4135 case NVPTX_BUILTIN_CMP_SWAPLL:
4136 return nvptx_expand_cmp_swap (exp, target, mode, ignore);
4137
4138 default: gcc_unreachable ();
4139 }
4140 }
4141 \f
4142 /* Define dimension sizes for known hardware. */
4143 #define PTX_VECTOR_LENGTH 32
4144 #define PTX_WORKER_LENGTH 32
4145
4146 /* Validate compute dimensions of an OpenACC offload or routine, fill
4147 in non-unity defaults. FN_LEVEL indicates the level at which a
4148 routine might spawn a loop. It is negative for non-routines. */
4149
4150 static bool
4151 nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
4152 {
4153 bool changed = false;
4154
4155 /* The vector size must be 32, unless this is a SEQ routine. */
4156 if (fn_level <= GOMP_DIM_VECTOR
4157 && dims[GOMP_DIM_VECTOR] != PTX_VECTOR_LENGTH)
4158 {
4159 if (dims[GOMP_DIM_VECTOR] >= 0 && fn_level < 0)
4160 warning_at (DECL_SOURCE_LOCATION (decl), 0,
4161 dims[GOMP_DIM_VECTOR]
4162 ? "using vector_length (%d), ignoring %d"
4163 : "using vector_length (%d), ignoring runtime setting",
4164 PTX_VECTOR_LENGTH, dims[GOMP_DIM_VECTOR]);
4165 dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
4166 changed = true;
4167 }
4168
4169 /* Check that the requested number of workers is not too large. */
4170 if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
4171 {
4172 warning_at (DECL_SOURCE_LOCATION (decl), 0,
4173 "using num_workers (%d), ignoring %d",
4174 PTX_WORKER_LENGTH, dims[GOMP_DIM_WORKER]);
4175 dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
4176 changed = true;
4177 }
4178
4179 return changed;
4180 }
4181
4182 /* Return maximum dimension size, or zero for unbounded. */
4183
4184 static int
4185 nvptx_dim_limit (int axis)
4186 {
4187 switch (axis)
4188 {
4189 case GOMP_DIM_WORKER:
4190 return PTX_WORKER_LENGTH;
4191
4192 case GOMP_DIM_VECTOR:
4193 return PTX_VECTOR_LENGTH;
4194
4195 default:
4196 break;
4197 }
4198 return 0;
4199 }
4200
4201 /* Determine whether forks & joins are needed. */
4202
4203 static bool
4204 nvptx_goacc_fork_join (gcall *call, const int dims[],
4205 bool ARG_UNUSED (is_fork))
4206 {
4207 tree arg = gimple_call_arg (call, 2);
4208 unsigned axis = TREE_INT_CST_LOW (arg);
4209
4210 /* We only care about worker and vector partitioning. */
4211 if (axis < GOMP_DIM_WORKER)
4212 return false;
4213
4214 /* If the size is 1, there's no partitioning. */
4215 if (dims[axis] == 1)
4216 return false;
4217
4218 return true;
4219 }
4220
4221 /* Generate a PTX builtin function call that returns the address in
4222 the worker reduction buffer at OFFSET. TYPE is the type of the
4223 data at that location. */
4224
4225 static tree
4226 nvptx_get_worker_red_addr (tree type, tree offset)
4227 {
4228 machine_mode mode = TYPE_MODE (type);
4229 tree fndecl = nvptx_builtin_decl (NVPTX_BUILTIN_WORKER_ADDR, true);
4230 tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
4231 tree align = build_int_cst (unsigned_type_node,
4232 GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
4233 tree call = build_call_expr (fndecl, 3, offset, size, align);
4234
4235 return fold_convert (build_pointer_type (type), call);
4236 }
4237
4238 /* Emit a SHFL.DOWN of VAR into DEST_VAR using shuffle index SHIFT. This
4239 function will cast the variable if necessary. */
4240
4241 static void
4242 nvptx_generate_vector_shuffle (location_t loc,
4243 tree dest_var, tree var, unsigned shift,
4244 gimple_seq *seq)
4245 {
4246 unsigned fn = NVPTX_BUILTIN_SHUFFLE;
4247 tree_code code = NOP_EXPR;
4248 tree arg_type = unsigned_type_node;
4249 tree var_type = TREE_TYPE (var);
4250 tree dest_type = var_type;
4251
4252 if (TREE_CODE (var_type) == COMPLEX_TYPE)
4253 var_type = TREE_TYPE (var_type);
4254
4255 if (TREE_CODE (var_type) == REAL_TYPE)
4256 code = VIEW_CONVERT_EXPR;
4257
4258 if (TYPE_SIZE (var_type)
4259 == TYPE_SIZE (long_long_unsigned_type_node))
4260 {
4261 fn = NVPTX_BUILTIN_SHUFFLELL;
4262 arg_type = long_long_unsigned_type_node;
4263 }
4264
4265 tree call = nvptx_builtin_decl (fn, true);
4266 tree bits = build_int_cst (unsigned_type_node, shift);
4267 tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
4268 tree expr;
4269
4270 if (var_type != dest_type)
4271 {
4272 /* Do real and imaginary parts separately. */
4273 tree real = fold_build1 (REALPART_EXPR, var_type, var);
4274 real = fold_build1 (code, arg_type, real);
4275 real = build_call_expr_loc (loc, call, 3, real, bits, kind);
4276 real = fold_build1 (code, var_type, real);
4277
4278 tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
4279 imag = fold_build1 (code, arg_type, imag);
4280 imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
4281 imag = fold_build1 (code, var_type, imag);
4282
4283 expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
4284 }
4285 else
4286 {
4287 expr = fold_build1 (code, arg_type, var);
4288 expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
4289 expr = fold_build1 (code, dest_type, expr);
4290 }
4291
4292 gimplify_assign (dest_var, expr, seq);
4293 }
4294
4295 /* Lazily generate the global lock var decl and return its address. */
4296
4297 static tree
4298 nvptx_global_lock_addr ()
4299 {
4300 tree v = global_lock_var;
4301
4302 if (!v)
4303 {
4304 tree name = get_identifier ("__reduction_lock");
4305 tree type = build_qualified_type (unsigned_type_node,
4306 TYPE_QUAL_VOLATILE);
4307 v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
4308 global_lock_var = v;
4309 DECL_ARTIFICIAL (v) = 1;
4310 DECL_EXTERNAL (v) = 1;
4311 TREE_STATIC (v) = 1;
4312 TREE_PUBLIC (v) = 1;
4313 TREE_USED (v) = 1;
4314 mark_addressable (v);
4315 mark_decl_referenced (v);
4316 }
4317
4318 return build_fold_addr_expr (v);
4319 }
4320
4321 /* Insert code to locklessly update *PTR with *PTR OP VAR just before
4322 GSI. We use a lockless scheme for nearly all cases, which looks
4323 like:
4324 actual = initval(OP);
4325 do {
4326 guess = actual;
4327 write = guess OP myval;
4328 actual = cmp&swap (ptr, guess, write)
4329 } while (actual bit-different-to guess);
4330 return write;
4331
4332 This relies on a cmp&swap instruction, which is available for 32-
4333 and 64-bit types. Larger types must use a locking scheme. */
4334
4335 static tree
4336 nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
4337 tree ptr, tree var, tree_code op)
4338 {
4339 unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
4340 tree_code code = NOP_EXPR;
4341 tree arg_type = unsigned_type_node;
4342 tree var_type = TREE_TYPE (var);
4343
4344 if (TREE_CODE (var_type) == COMPLEX_TYPE
4345 || TREE_CODE (var_type) == REAL_TYPE)
4346 code = VIEW_CONVERT_EXPR;
4347
4348 if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
4349 {
4350 arg_type = long_long_unsigned_type_node;
4351 fn = NVPTX_BUILTIN_CMP_SWAPLL;
4352 }
4353
4354 tree swap_fn = nvptx_builtin_decl (fn, true);
4355
4356 gimple_seq init_seq = NULL;
4357 tree init_var = make_ssa_name (arg_type);
4358 tree init_expr = omp_reduction_init_op (loc, op, var_type);
4359 init_expr = fold_build1 (code, arg_type, init_expr);
4360 gimplify_assign (init_var, init_expr, &init_seq);
4361 gimple *init_end = gimple_seq_last (init_seq);
4362
4363 gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);
4364
4365 /* Split the block just after the init stmts. */
4366 basic_block pre_bb = gsi_bb (*gsi);
4367 edge pre_edge = split_block (pre_bb, init_end);
4368 basic_block loop_bb = pre_edge->dest;
4369 pre_bb = pre_edge->src;
4370 /* Reset the iterator. */
4371 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
4372
4373 tree expect_var = make_ssa_name (arg_type);
4374 tree actual_var = make_ssa_name (arg_type);
4375 tree write_var = make_ssa_name (arg_type);
4376
4377 /* Build and insert the reduction calculation. */
4378 gimple_seq red_seq = NULL;
4379 tree write_expr = fold_build1 (code, var_type, expect_var);
4380 write_expr = fold_build2 (op, var_type, write_expr, var);
4381 write_expr = fold_build1 (code, arg_type, write_expr);
4382 gimplify_assign (write_var, write_expr, &red_seq);
4383
4384 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
4385
4386 /* Build & insert the cmp&swap sequence. */
4387 gimple_seq latch_seq = NULL;
4388 tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
4389 ptr, expect_var, write_var);
4390 gimplify_assign (actual_var, swap_expr, &latch_seq);
4391
4392 gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
4393 NULL_TREE, NULL_TREE);
4394 gimple_seq_add_stmt (&latch_seq, cond);
4395
4396 gimple *latch_end = gimple_seq_last (latch_seq);
4397 gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);
4398
4399 /* Split the block just after the latch stmts. */
4400 edge post_edge = split_block (loop_bb, latch_end);
4401 basic_block post_bb = post_edge->dest;
4402 loop_bb = post_edge->src;
4403 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
4404
4405 post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
4406 edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
4407 set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
4408 set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);
4409
4410 gphi *phi = create_phi_node (expect_var, loop_bb);
4411 add_phi_arg (phi, init_var, pre_edge, loc);
4412 add_phi_arg (phi, actual_var, loop_edge, loc);
4413
4414 loop *loop = alloc_loop ();
4415 loop->header = loop_bb;
4416 loop->latch = loop_bb;
4417 add_loop (loop, loop_bb->loop_father);
4418
4419 return fold_build1 (code, var_type, write_var);
4420 }
4421
4422 /* Insert code to lockfully update *PTR with *PTR OP VAR just before
4423 GSI. This is necessary for types larger than 64 bits, where there
4424 is no cmp&swap instruction to implement a lockless scheme. We use
4425 a lock variable in global memory.
4426
4427 while (cmp&swap (&lock_var, 0, 1))
4428 continue;
4429 T accum = *ptr;
4430 accum = accum OP var;
4431 *ptr = accum;
4432 cmp&swap (&lock_var, 1, 0);
4433 return accum;
4434
4435 A lock in global memory is necessary to force execution engine
4436 descheduling and avoid resource starvation that can occur if the
4437 lock is in .shared memory. */
4438
4439 static tree
4440 nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
4441 tree ptr, tree var, tree_code op)
4442 {
4443 tree var_type = TREE_TYPE (var);
4444 tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
4445 tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
4446 tree uns_locked = build_int_cst (unsigned_type_node, 1);
4447
4448 /* Split the block just before the gsi. Insert a gimple nop to make
4449 this easier. */
4450 gimple *nop = gimple_build_nop ();
4451 gsi_insert_before (gsi, nop, GSI_SAME_STMT);
4452 basic_block entry_bb = gsi_bb (*gsi);
4453 edge entry_edge = split_block (entry_bb, nop);
4454 basic_block lock_bb = entry_edge->dest;
4455 /* Reset the iterator. */
4456 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
4457
4458 /* Build and insert the locking sequence. */
4459 gimple_seq lock_seq = NULL;
4460 tree lock_var = make_ssa_name (unsigned_type_node);
4461 tree lock_expr = nvptx_global_lock_addr ();
4462 lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
4463 uns_unlocked, uns_locked);
4464 gimplify_assign (lock_var, lock_expr, &lock_seq);
4465 gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
4466 NULL_TREE, NULL_TREE);
4467 gimple_seq_add_stmt (&lock_seq, cond);
4468 gimple *lock_end = gimple_seq_last (lock_seq);
4469 gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);
4470
4471 /* Split the block just after the lock sequence. */
4472 edge locked_edge = split_block (lock_bb, lock_end);
4473 basic_block update_bb = locked_edge->dest;
4474 lock_bb = locked_edge->src;
4475 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
4476
4477 /* Create the lock loop ... */
4478 locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
4479 make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
4480 set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
4481 set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);
4482
4483 /* ... and the loop structure. */
4484 loop *lock_loop = alloc_loop ();
4485 lock_loop->header = lock_bb;
4486 lock_loop->latch = lock_bb;
4487 lock_loop->nb_iterations_estimate = 1;
4488 lock_loop->any_estimate = true;
4489 add_loop (lock_loop, entry_bb->loop_father);
4490
4491 /* Build and insert the reduction calculation. */
4492 gimple_seq red_seq = NULL;
4493 tree acc_in = make_ssa_name (var_type);
4494 tree ref_in = build_simple_mem_ref (ptr);
4495 TREE_THIS_VOLATILE (ref_in) = 1;
4496 gimplify_assign (acc_in, ref_in, &red_seq);
4497
4498 tree acc_out = make_ssa_name (var_type);
4499 tree update_expr = fold_build2 (op, var_type, ref_in, var);
4500 gimplify_assign (acc_out, update_expr, &red_seq);
4501
4502 tree ref_out = build_simple_mem_ref (ptr);
4503 TREE_THIS_VOLATILE (ref_out) = 1;
4504 gimplify_assign (ref_out, acc_out, &red_seq);
4505
4506 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
4507
4508 /* Build & insert the unlock sequence. */
4509 gimple_seq unlock_seq = NULL;
4510 tree unlock_expr = nvptx_global_lock_addr ();
4511 unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
4512 uns_locked, uns_unlocked);
4513 gimplify_and_add (unlock_expr, &unlock_seq);
4514 gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);
4515
4516 return acc_out;
4517 }
4518
4519 /* Emit a sequence to update a reduction accumulator at *PTR with the
4520 value held in VAR using operator OP. Return the updated value.
4521
4522 TODO: optimize for atomic ops and independent complex ops. */
4523
4524 static tree
4525 nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
4526 tree ptr, tree var, tree_code op)
4527 {
4528 tree type = TREE_TYPE (var);
4529 tree size = TYPE_SIZE (type);
4530
4531 if (size == TYPE_SIZE (unsigned_type_node)
4532 || size == TYPE_SIZE (long_long_unsigned_type_node))
4533 return nvptx_lockless_update (loc, gsi, ptr, var, op);
4534 else
4535 return nvptx_lockfull_update (loc, gsi, ptr, var, op);
4536 }
4537
4538 /* NVPTX implementation of GOACC_REDUCTION_SETUP. */
4539
4540 static void
4541 nvptx_goacc_reduction_setup (gcall *call)
4542 {
4543 gimple_stmt_iterator gsi = gsi_for_stmt (call);
4544 tree lhs = gimple_call_lhs (call);
4545 tree var = gimple_call_arg (call, 2);
4546 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
4547 gimple_seq seq = NULL;
4548
4549 push_gimplify_context (true);
4550
4551 if (level != GOMP_DIM_GANG)
4552 {
4553 /* Copy the receiver object. */
4554 tree ref_to_res = gimple_call_arg (call, 1);
4555
4556 if (!integer_zerop (ref_to_res))
4557 var = build_simple_mem_ref (ref_to_res);
4558 }
4559
4560 if (level == GOMP_DIM_WORKER)
4561 {
4562 /* Store incoming value to worker reduction buffer. */
4563 tree offset = gimple_call_arg (call, 5);
4564 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
4565 tree ptr = make_ssa_name (TREE_TYPE (call));
4566
4567 gimplify_assign (ptr, call, &seq);
4568 tree ref = build_simple_mem_ref (ptr);
4569 TREE_THIS_VOLATILE (ref) = 1;
4570 gimplify_assign (ref, var, &seq);
4571 }
4572
4573 if (lhs)
4574 gimplify_assign (lhs, var, &seq);
4575
4576 pop_gimplify_context (NULL);
4577 gsi_replace_with_seq (&gsi, seq, true);
4578 }
4579
4580 /* NVPTX implementation of GOACC_REDUCTION_INIT. */
4581
4582 static void
4583 nvptx_goacc_reduction_init (gcall *call)
4584 {
4585 gimple_stmt_iterator gsi = gsi_for_stmt (call);
4586 tree lhs = gimple_call_lhs (call);
4587 tree var = gimple_call_arg (call, 2);
4588 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
4589 enum tree_code rcode
4590 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
4591 tree init = omp_reduction_init_op (gimple_location (call), rcode,
4592 TREE_TYPE (var));
4593 gimple_seq seq = NULL;
4594
4595 push_gimplify_context (true);
4596
4597 if (level == GOMP_DIM_VECTOR)
4598 {
4599 /* Initialize the non-zero vector lanes to INIT_VAL (OP). */
4600 tree tid = make_ssa_name (integer_type_node);
4601 tree dim_vector = gimple_call_arg (call, 3);
4602 gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
4603 dim_vector);
4604 gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
4605 NULL_TREE, NULL_TREE);
4606
4607 gimple_call_set_lhs (tid_call, tid);
4608 gimple_seq_add_stmt (&seq, tid_call);
4609 gimple_seq_add_stmt (&seq, cond_stmt);
4610
4611 /* Split the block just after the call. */
4612 edge init_edge = split_block (gsi_bb (gsi), call);
4613 basic_block init_bb = init_edge->dest;
4614 basic_block call_bb = init_edge->src;
4615
4616 /* Fixup flags from call_bb to init_bb. */
4617 init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
4618
4619 /* Set the initialization stmts. */
4620 gimple_seq init_seq = NULL;
4621 tree init_var = make_ssa_name (TREE_TYPE (var));
4622 gimplify_assign (init_var, init, &init_seq);
4623 gsi = gsi_start_bb (init_bb);
4624 gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);
4625
4626 /* Split block just after the init stmt. */
4627 gsi_prev (&gsi);
4628 edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
4629 basic_block dst_bb = inited_edge->dest;
4630
4631 /* Create false edge from call_bb to dst_bb. */
4632 edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);
4633
4634 /* Create phi node in dst block. */
4635 gphi *phi = create_phi_node (lhs, dst_bb);
4636 add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
4637 add_phi_arg (phi, var, nop_edge, gimple_location (call));
4638
4639 /* Reset dominator of dst bb. */
4640 set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);
4641
4642 /* Reset the gsi. */
4643 gsi = gsi_for_stmt (call);
4644 }
4645 else
4646 {
4647 if (level == GOMP_DIM_GANG)
4648 {
4649 /* If there's no receiver object, propagate the incoming VAR. */
4650 tree ref_to_res = gimple_call_arg (call, 1);
4651 if (integer_zerop (ref_to_res))
4652 init = var;
4653 }
4654
4655 gimplify_assign (lhs, init, &seq);
4656 }
4657
4658 pop_gimplify_context (NULL);
4659 gsi_replace_with_seq (&gsi, seq, true);
4660 }
4661
4662 /* NVPTX implementation of GOACC_REDUCTION_FINI. */
4663
4664 static void
4665 nvptx_goacc_reduction_fini (gcall *call)
4666 {
4667 gimple_stmt_iterator gsi = gsi_for_stmt (call);
4668 tree lhs = gimple_call_lhs (call);
4669 tree ref_to_res = gimple_call_arg (call, 1);
4670 tree var = gimple_call_arg (call, 2);
4671 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
4672 enum tree_code op
4673 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
4674 gimple_seq seq = NULL;
4675 tree r = NULL_TREE;
4676
4677 push_gimplify_context (true);
4678
4679 if (level == GOMP_DIM_VECTOR)
4680 {
4681 /* Emit binary shuffle tree. TODO. Emit this as an actual loop,
4682 but that requires a method of emitting a unified jump at the
4683 gimple level. */
4684 for (int shfl = PTX_VECTOR_LENGTH / 2; shfl > 0; shfl = shfl >> 1)
4685 {
4686 tree other_var = make_ssa_name (TREE_TYPE (var));
4687 nvptx_generate_vector_shuffle (gimple_location (call),
4688 other_var, var, shfl, &seq);
4689
4690 r = make_ssa_name (TREE_TYPE (var));
4691 gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
4692 var, other_var), &seq);
4693 var = r;
4694 }
4695 }
4696 else
4697 {
4698 tree accum = NULL_TREE;
4699
4700 if (level == GOMP_DIM_WORKER)
4701 {
4702 /* Get reduction buffer address. */
4703 tree offset = gimple_call_arg (call, 5);
4704 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
4705 tree ptr = make_ssa_name (TREE_TYPE (call));
4706
4707 gimplify_assign (ptr, call, &seq);
4708 accum = ptr;
4709 }
4710 else if (integer_zerop (ref_to_res))
4711 r = var;
4712 else
4713 accum = ref_to_res;
4714
4715 if (accum)
4716 {
4717 /* UPDATE the accumulator. */
4718 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
4719 seq = NULL;
4720 r = nvptx_reduction_update (gimple_location (call), &gsi,
4721 accum, var, op);
4722 }
4723 }
4724
4725 if (lhs)
4726 gimplify_assign (lhs, r, &seq);
4727 pop_gimplify_context (NULL);
4728
4729 gsi_replace_with_seq (&gsi, seq, true);
4730 }
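/* To make the vector case above concrete: with PTX_VECTOR_LENGTH == 32
   the loop emits shuffles with shift values 16, 8, 4, 2 and 1, so the
   generated sequence behaves roughly like the pseudo-C below (casts
   for non-integer types and the 64-bit shufflell variant omitted):

     for (shfl = 16; shfl > 0; shfl >>= 1)
       var = var OP __builtin_nvptx_shuffle (var, shfl, SHUFFLE_DOWN);

   after which lane 0 of the vector holds the reduction of all 32
   lanes.  */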
4731
4732 /* NVPTX implementation of GOACC_REDUCTION_TEARDOWN. */
4733
4734 static void
4735 nvptx_goacc_reduction_teardown (gcall *call)
4736 {
4737 gimple_stmt_iterator gsi = gsi_for_stmt (call);
4738 tree lhs = gimple_call_lhs (call);
4739 tree var = gimple_call_arg (call, 2);
4740 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
4741 gimple_seq seq = NULL;
4742
4743 push_gimplify_context (true);
4744 if (level == GOMP_DIM_WORKER)
4745 {
4746 /* Read the worker reduction buffer. */
4747 tree offset = gimple_call_arg (call, 5);
4748 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
4749 tree ptr = make_ssa_name (TREE_TYPE (call));
4750
4751 gimplify_assign (ptr, call, &seq);
4752 var = build_simple_mem_ref (ptr);
4753 TREE_THIS_VOLATILE (var) = 1;
4754 }
4755
4756 if (level != GOMP_DIM_GANG)
4757 {
4758 /* Write to the receiver object. */
4759 tree ref_to_res = gimple_call_arg (call, 1);
4760
4761 if (!integer_zerop (ref_to_res))
4762 gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
4763 }
4764
4765 if (lhs)
4766 gimplify_assign (lhs, var, &seq);
4767
4768 pop_gimplify_context (NULL);
4769
4770 gsi_replace_with_seq (&gsi, seq, true);
4771 }
4772
4773 /* NVPTX reduction expander. */
4774
4775 static void
4776 nvptx_goacc_reduction (gcall *call)
4777 {
4778 unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
4779
4780 switch (code)
4781 {
4782 case IFN_GOACC_REDUCTION_SETUP:
4783 nvptx_goacc_reduction_setup (call);
4784 break;
4785
4786 case IFN_GOACC_REDUCTION_INIT:
4787 nvptx_goacc_reduction_init (call);
4788 break;
4789
4790 case IFN_GOACC_REDUCTION_FINI:
4791 nvptx_goacc_reduction_fini (call);
4792 break;
4793
4794 case IFN_GOACC_REDUCTION_TEARDOWN:
4795 nvptx_goacc_reduction_teardown (call);
4796 break;
4797
4798 default:
4799 gcc_unreachable ();
4800 }
4801 }
4802
4803 #undef TARGET_OPTION_OVERRIDE
4804 #define TARGET_OPTION_OVERRIDE nvptx_option_override
4805
4806 #undef TARGET_ATTRIBUTE_TABLE
4807 #define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table
4808
4809 #undef TARGET_LEGITIMATE_ADDRESS_P
4810 #define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p
4811
4812 #undef TARGET_PROMOTE_FUNCTION_MODE
4813 #define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode
4814
4815 #undef TARGET_FUNCTION_ARG
4816 #define TARGET_FUNCTION_ARG nvptx_function_arg
4817 #undef TARGET_FUNCTION_INCOMING_ARG
4818 #define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
4819 #undef TARGET_FUNCTION_ARG_ADVANCE
4820 #define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
4821 #undef TARGET_PASS_BY_REFERENCE
4822 #define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
4823 #undef TARGET_FUNCTION_VALUE_REGNO_P
4824 #define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
4825 #undef TARGET_FUNCTION_VALUE
4826 #define TARGET_FUNCTION_VALUE nvptx_function_value
4827 #undef TARGET_LIBCALL_VALUE
4828 #define TARGET_LIBCALL_VALUE nvptx_libcall_value
4829 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
4830 #define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
4831 #undef TARGET_GET_DRAP_RTX
4832 #define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
4833 #undef TARGET_SPLIT_COMPLEX_ARG
4834 #define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
4835 #undef TARGET_RETURN_IN_MEMORY
4836 #define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
4837 #undef TARGET_OMIT_STRUCT_RETURN_REG
4838 #define TARGET_OMIT_STRUCT_RETURN_REG true
4839 #undef TARGET_STRICT_ARGUMENT_NAMING
4840 #define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
4841 #undef TARGET_CALL_ARGS
4842 #define TARGET_CALL_ARGS nvptx_call_args
4843 #undef TARGET_END_CALL_ARGS
4844 #define TARGET_END_CALL_ARGS nvptx_end_call_args
4845
4846 #undef TARGET_ASM_FILE_START
4847 #define TARGET_ASM_FILE_START nvptx_file_start
4848 #undef TARGET_ASM_FILE_END
4849 #define TARGET_ASM_FILE_END nvptx_file_end
4850 #undef TARGET_ASM_GLOBALIZE_LABEL
4851 #define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
4852 #undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
4853 #define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
4854 #undef TARGET_PRINT_OPERAND
4855 #define TARGET_PRINT_OPERAND nvptx_print_operand
4856 #undef TARGET_PRINT_OPERAND_ADDRESS
4857 #define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
4858 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
4859 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
4860 #undef TARGET_ASM_INTEGER
4861 #define TARGET_ASM_INTEGER nvptx_assemble_integer
4862 #undef TARGET_ASM_DECL_END
4863 #define TARGET_ASM_DECL_END nvptx_assemble_decl_end
4864 #undef TARGET_ASM_DECLARE_CONSTANT_NAME
4865 #define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
4866 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
4867 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
4868 #undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
4869 #define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true
4870
4871 #undef TARGET_MACHINE_DEPENDENT_REORG
4872 #define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
4873 #undef TARGET_NO_REGISTER_ALLOCATION
4874 #define TARGET_NO_REGISTER_ALLOCATION true
4875
4876 #undef TARGET_ENCODE_SECTION_INFO
4877 #define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info
4878 #undef TARGET_RECORD_OFFLOAD_SYMBOL
4879 #define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol
4880
4881 #undef TARGET_VECTOR_ALIGNMENT
4882 #define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment
4883
4884 #undef TARGET_CANNOT_COPY_INSN_P
4885 #define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p
4886
4887 #undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
4888 #define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p
4889
4890 #undef TARGET_INIT_BUILTINS
4891 #define TARGET_INIT_BUILTINS nvptx_init_builtins
4892 #undef TARGET_EXPAND_BUILTIN
4893 #define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
4894 #undef TARGET_BUILTIN_DECL
4895 #define TARGET_BUILTIN_DECL nvptx_builtin_decl
4896
4897 #undef TARGET_GOACC_VALIDATE_DIMS
4898 #define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims
4899
4900 #undef TARGET_GOACC_DIM_LIMIT
4901 #define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit
4902
4903 #undef TARGET_GOACC_FORK_JOIN
4904 #define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join
4905
4906 #undef TARGET_GOACC_REDUCTION
4907 #define TARGET_GOACC_REDUCTION nvptx_goacc_reduction
4908
4909 struct gcc_target targetm = TARGET_INITIALIZER;
4910
4911 #include "gt-nvptx.h"