/* Target code for NVPTX.
   Copyright (C) 2014-2016 Free Software Foundation, Inc.
   Contributed by Bernd Schmidt <bernds@codesourcery.com>

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published
   by the Free Software Foundation; either version 3, or (at your
   option) any later version.

   GCC is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
   License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */

#include "config.h"
#include <sstream>
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "cfghooks.h"
#include "df.h"
#include "tm_p.h"
#include "expmed.h"
#include "optabs.h"
#include "regs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "diagnostic.h"
#include "alias.h"
#include "insn-flags.h"
#include "output.h"
#include "insn-attr.h"
#include "flags.h"
#include "dojump.h"
#include "explow.h"
#include "calls.h"
#include "varasm.h"
#include "stmt.h"
#include "expr.h"
#include "tm-preds.h"
#include "tm-constrs.h"
#include "langhooks.h"
#include "dbxout.h"
#include "cfgrtl.h"
#include "gimple.h"
#include "stor-layout.h"
#include "builtins.h"
#include "omp-low.h"
#include "gomp-constants.h"
#include "dumpfile.h"
#include "internal-fn.h"
#include "gimple-iterator.h"
#include "stringpool.h"
#include "tree-ssa-operands.h"
#include "tree-ssanames.h"
#include "gimplify.h"
#include "tree-phinodes.h"
#include "cfgloop.h"
#include "fold-const.h"

/* This file should be included last.  */
#include "target-def.h"

/* The kind of shuffle instruction.  */
enum nvptx_shuffle_kind
{
  SHUFFLE_UP,
  SHUFFLE_DOWN,
  SHUFFLE_BFLY,
  SHUFFLE_IDX,
  SHUFFLE_MAX
};

/* The various PTX memory areas an object might reside in.  */
enum nvptx_data_area
{
  DATA_AREA_GENERIC,
  DATA_AREA_GLOBAL,
  DATA_AREA_SHARED,
  DATA_AREA_LOCAL,
  DATA_AREA_CONST,
  DATA_AREA_PARAM,
  DATA_AREA_MAX
};

/* We record the data area in the target symbol flags.  */
#define SYMBOL_DATA_AREA(SYM) \
  (nvptx_data_area)((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \
                    & 7)
#define SET_SYMBOL_DATA_AREA(SYM,AREA) \
  (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)

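/* Illustrative sketch (not in the original source): the three
   machine-dependent flag bits hold the nvptx_data_area value, so for
   a symbol SYM placed in shared memory we would expect

     SET_SYMBOL_DATA_AREA (sym, DATA_AREA_SHARED);
     gcc_checking_assert (SYMBOL_DATA_AREA (sym) == DATA_AREA_SHARED);

   Note that SET_SYMBOL_DATA_AREA only ORs bits in; it assumes the
   field was previously zero (DATA_AREA_GENERIC).  */
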
/* Record the function decls we've written, and the libfuncs and function
   decls corresponding to them.  */
static std::stringstream func_decls;

struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
{
  static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
  static bool equal (rtx a, rtx b) { return a == b; }
};

static GTY((cache))
  hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;

struct tree_hasher : ggc_cache_ptr_hash<tree_node>
{
  static hashval_t hash (tree t) { return htab_hash_pointer (t); }
  static bool equal (tree a, tree b) { return a == b; }
};

static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;

/* Buffer needed to broadcast across workers.  This is used for both
   worker-neutering and worker broadcasting.  It is shared by all
   functions emitted.  The buffer is placed in shared memory.  It'd be
   nice if PTX supported common blocks, because then this could be
   shared across TUs (taking the largest size).  */
static unsigned worker_bcast_size;
static unsigned worker_bcast_align;
static GTY(()) rtx worker_bcast_sym;

/* Buffer needed for worker reductions.  This has to be distinct from
   the worker broadcast array, as both may be live concurrently.  */
static unsigned worker_red_size;
static unsigned worker_red_align;
static GTY(()) rtx worker_red_sym;

/* Global lock variable, needed for 128bit worker & gang reductions.  */
static GTY(()) tree global_lock_var;

/* Allocate a new, cleared machine_function structure.  */

static struct machine_function *
nvptx_init_machine_status (void)
{
  struct machine_function *p = ggc_cleared_alloc<machine_function> ();
  p->return_mode = VOIDmode;
  return p;
}

/* Implement TARGET_OPTION_OVERRIDE.  */

static void
nvptx_option_override (void)
{
  init_machine_status = nvptx_init_machine_status;
  /* Gives us a predictable order, which we need especially for variables.  */
  flag_toplevel_reorder = 1;
  /* Assumes that it will see only hard registers.  */
  flag_var_tracking = 0;

  if (write_symbols == DBX_DEBUG)
    /* The stabs testcases want to know stabs isn't supported.  */
    sorry ("stabs debug format not supported");

  /* Actually we don't have any debug format, but don't be
     unnecessarily noisy.  */
  write_symbols = NO_DEBUG;
  debug_info_level = DINFO_LEVEL_NONE;

  if (nvptx_optimize < 0)
    nvptx_optimize = optimize > 0;

  declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  declared_libfuncs_htab
    = hash_table<declared_libfunc_hasher>::create_ggc (17);

  worker_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_bcast");
  SET_SYMBOL_DATA_AREA (worker_bcast_sym, DATA_AREA_SHARED);
  worker_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;

  worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
  SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
  worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
}

/* Return a ptx type for MODE.  If PROMOTE, then use .u32 for QImode to
   deal with ptx idiosyncrasies.  */

const char *
nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
{
  switch (mode)
    {
    case BLKmode:
      return ".b8";
    case BImode:
      return ".pred";
    case QImode:
      if (promote)
        return ".u32";
      else
        return ".u8";
    case HImode:
      return ".u16";
    case SImode:
      return ".u32";
    case DImode:
      return ".u64";

    case SFmode:
      return ".f32";
    case DFmode:
      return ".f64";

    default:
      gcc_unreachable ();
    }
}

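/* Illustrative examples (not in the original source) of the mapping:
   SImode -> ".u32", DFmode -> ".f64", and QImode -> ".u32" when
   PROMOTE is set but ".u8" otherwise, presumably because 8-bit
   arithmetic is awkward in PTX.  */
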
/* Encode the PTX data area that DECL (which might not actually be a
   _DECL) should reside in.  */

static void
nvptx_encode_section_info (tree decl, rtx rtl, int first)
{
  default_encode_section_info (decl, rtl, first);
  if (first && MEM_P (rtl))
    {
      nvptx_data_area area = DATA_AREA_GENERIC;

      if (TREE_CONSTANT (decl))
        area = DATA_AREA_CONST;
      else if (TREE_CODE (decl) == VAR_DECL)
        /* TODO: This would be a good place to check for a .shared or
           other section name.  */
        area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;

      SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area);
    }
}

/* Return the PTX name of the data area in which SYM should be
   placed.  The symbol must have already been processed by
   nvptx_encode_section_info, or equivalent.  */

static const char *
section_for_sym (rtx sym)
{
  nvptx_data_area area = SYMBOL_DATA_AREA (sym);
  /* Same order as nvptx_data_area enum.  */
  static char const *const areas[] =
    {"", ".global", ".shared", ".local", ".const", ".param"};

  return areas[area];
}

/* Similarly for a decl.  */

static const char *
section_for_decl (const_tree decl)
{
  return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree, decl)), 0));
}

/* Check NAME for special function names and redirect them by returning a
   replacement.  This applies to malloc, free and realloc, for which we
   want to use libgcc wrappers, and call, which triggers a bug in ptxas.  */

static const char *
nvptx_name_replacement (const char *name)
{
  if (strcmp (name, "call") == 0)
    return "__nvptx_call";
  if (strcmp (name, "malloc") == 0)
    return "__nvptx_malloc";
  if (strcmp (name, "free") == 0)
    return "__nvptx_free";
  if (strcmp (name, "realloc") == 0)
    return "__nvptx_realloc";
  return name;
}

/* If MODE should be treated as two registers of an inner mode, return
   that inner mode.  Otherwise return VOIDmode.  */

static machine_mode
maybe_split_mode (machine_mode mode)
{
  if (COMPLEX_MODE_P (mode))
    return GET_MODE_INNER (mode);

  if (mode == TImode)
    return DImode;

  return VOIDmode;
}

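/* For illustration (not in the original source): TImode splits into a
   DImode pair and DCmode into two DFmode halves, while SImode is left
   alone and yields VOIDmode.  */
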
/* Output a register, subreg, or register pair (with optional
   enclosing braces).  */

static void
output_reg (FILE *file, unsigned regno, machine_mode inner_mode,
            int subreg_offset = -1)
{
  if (inner_mode == VOIDmode)
    {
      if (HARD_REGISTER_NUM_P (regno))
        fprintf (file, "%s", reg_names[regno]);
      else
        fprintf (file, "%%r%d", regno);
    }
  else if (subreg_offset >= 0)
    {
      output_reg (file, regno, VOIDmode);
      fprintf (file, "$%d", subreg_offset);
    }
  else
    {
      if (subreg_offset == -1)
        fprintf (file, "{");
      output_reg (file, regno, inner_mode, GET_MODE_SIZE (inner_mode));
      fprintf (file, ",");
      output_reg (file, regno, inner_mode, 0);
      if (subreg_offset == -1)
        fprintf (file, "}");
    }
}

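/* Illustrative sketch (not in the original source): for pseudo 23
   split into a DImode pair, the default SUBREG_OFFSET of -1 prints
   "{%r23$8,%r23$0}"; passing -2, as the prologue code does, prints
   the same pair without the enclosing braces.  */
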
/* Emit forking instructions for MASK.  */

static void
nvptx_emit_forking (unsigned mask, bool is_call)
{
  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
           | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
  if (mask)
    {
      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));

      /* Emit fork at all levels.  This helps form SESE regions, as
         it creates a block with a single successor before entering a
         partitioned region.  That is a good candidate for the end of
         an SESE region.  */
      if (!is_call)
        emit_insn (gen_nvptx_fork (op));
      emit_insn (gen_nvptx_forked (op));
    }
}

/* Emit joining instructions for MASK.  */

static void
nvptx_emit_joining (unsigned mask, bool is_call)
{
  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
           | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
  if (mask)
    {
      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));

      /* Emit joining for all non-call pars to ensure there's a single
         predecessor for the block the join insn ends up in.  This is
         needed for skipping entire loops.  */
      if (!is_call)
        emit_insn (gen_nvptx_joining (op));
      emit_insn (gen_nvptx_join (op));
    }
}


/* Determine whether MODE and TYPE (possibly NULL) should be passed or
   returned in memory.  Integer and floating types supported by the
   machine are passed in registers, everything else is passed in
   memory.  Complex types are split.  */

static bool
pass_in_memory (machine_mode mode, const_tree type, bool for_return)
{
  if (type)
    {
      if (AGGREGATE_TYPE_P (type))
        return true;
      if (TREE_CODE (type) == VECTOR_TYPE)
        return true;
    }

  if (!for_return && COMPLEX_MODE_P (mode))
    /* Complex types are passed as two underlying args.  */
    mode = GET_MODE_INNER (mode);

  if (GET_MODE_CLASS (mode) != MODE_INT
      && GET_MODE_CLASS (mode) != MODE_FLOAT)
    return true;

  if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
    return true;

  return false;
}

/* A non-memory argument of mode MODE is being passed, determine the mode it
   should be promoted to.  This is also used for determining return
   type promotion.  */

static machine_mode
promote_arg (machine_mode mode, bool prototyped)
{
  if (!prototyped && mode == SFmode)
    /* K&R float promotion for unprototyped functions.  */
    mode = DFmode;
  else if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode))
    mode = SImode;

  return mode;
}

/* A non-memory return type of MODE is being returned.  Determine the
   mode it should be promoted to.  */

static machine_mode
promote_return (machine_mode mode)
{
  return promote_arg (mode, true);
}

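/* Worked examples (not in the original source): QImode and HImode
   arguments promote to SImode; an SFmode argument to an unprototyped
   function promotes to DFmode under the K&R rules; an SFmode argument
   to a prototyped function, or an SFmode return value, stays
   SFmode.  */
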
/* Implement TARGET_FUNCTION_ARG.  */

static rtx
nvptx_function_arg (cumulative_args_t ARG_UNUSED (cum_v), machine_mode mode,
                    const_tree, bool named)
{
  if (mode == VOIDmode || !named)
    return NULL_RTX;

  return gen_reg_rtx (mode);
}

/* Implement TARGET_FUNCTION_INCOMING_ARG.  */

static rtx
nvptx_function_incoming_arg (cumulative_args_t cum_v, machine_mode mode,
                             const_tree, bool named)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  if (mode == VOIDmode || !named)
    return NULL_RTX;

  /* No need to deal with split modes here, the only case that can
     happen is complex modes and those are dealt with by
     TARGET_SPLIT_COMPLEX_ARG.  */
  return gen_rtx_UNSPEC (mode,
                         gen_rtvec (1, GEN_INT (cum->count)),
                         UNSPEC_ARG_REG);
}

/* Implement TARGET_FUNCTION_ARG_ADVANCE.  */

static void
nvptx_function_arg_advance (cumulative_args_t cum_v,
                            machine_mode ARG_UNUSED (mode),
                            const_tree ARG_UNUSED (type),
                            bool ARG_UNUSED (named))
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  cum->count++;
}

/* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.

   For nvptx, we know how to handle functions declared as stdarg: by
   passing an extra pointer to the unnamed arguments.  However, the
   Fortran frontend can produce a different situation, where a
   function pointer is declared with no arguments, but the actual
   function and calls to it take more arguments.  In that case, we
   want to ensure the call matches the definition of the function.  */

static bool
nvptx_strict_argument_naming (cumulative_args_t cum_v)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
}

/* Implement TARGET_LIBCALL_VALUE.  */

static rtx
nvptx_libcall_value (machine_mode mode, const_rtx)
{
  if (!cfun->machine->doing_call)
    /* Pretend to return in a hard reg for early uses before pseudos can be
       generated.  */
    return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);

  return gen_reg_rtx (mode);
}

/* TARGET_FUNCTION_VALUE implementation.  Returns an RTX representing the place
   where function FUNC returns or receives a value of data type TYPE.  */

static rtx
nvptx_function_value (const_tree type, const_tree ARG_UNUSED (func),
                      bool outgoing)
{
  machine_mode mode = promote_return (TYPE_MODE (type));

  if (outgoing)
    {
      cfun->machine->return_mode = mode;
      return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
    }

  return nvptx_libcall_value (mode, NULL_RTX);
}

/* Implement TARGET_FUNCTION_VALUE_REGNO_P.  */

static bool
nvptx_function_value_regno_p (const unsigned int regno)
{
  return regno == NVPTX_RETURN_REGNUM;
}

/* Types with a mode other than those supported by the machine are passed by
   reference in memory.  */

static bool
nvptx_pass_by_reference (cumulative_args_t ARG_UNUSED (cum),
                         machine_mode mode, const_tree type,
                         bool ARG_UNUSED (named))
{
  return pass_in_memory (mode, type, false);
}

/* Implement TARGET_RETURN_IN_MEMORY.  */

static bool
nvptx_return_in_memory (const_tree type, const_tree)
{
  return pass_in_memory (TYPE_MODE (type), type, true);
}

/* Implement TARGET_PROMOTE_FUNCTION_MODE.  */

static machine_mode
nvptx_promote_function_mode (const_tree type, machine_mode mode,
                             int *ARG_UNUSED (punsignedp),
                             const_tree funtype, int for_return)
{
  return promote_arg (mode, for_return || !type || TYPE_ARG_TYPES (funtype));
}

/* Helper for write_arg.  Emit a single PTX argument of MODE, either
   in a prototype, or as copy in a function prologue.  ARGNO is the
   index of this argument in the PTX function.  FOR_REG is negative,
   if we're emitting the PTX prototype.  It is zero if we're copying
   to an argument register and it is greater than zero if we're
   copying to a specific hard register.  */

static int
write_arg_mode (std::stringstream &s, int for_reg, int argno,
                machine_mode mode)
{
  const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);

  if (for_reg < 0)
    {
      /* Writing PTX prototype.  */
      s << (argno ? ", " : " (");
      s << ".param" << ptx_type << " %in_ar" << argno;
    }
  else
    {
      s << "\t.reg" << ptx_type << " ";
      if (for_reg)
        s << reg_names[for_reg];
      else
        s << "%ar" << argno;
      s << ";\n";
      if (argno >= 0)
        {
          s << "\tld.param" << ptx_type << " ";
          if (for_reg)
            s << reg_names[for_reg];
          else
            s << "%ar" << argno;
          s << ", [%in_ar" << argno << "];\n";
        }
    }
  return argno + 1;
}

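/* Illustrative output (not in the original source): for an SImode
   argument at index 0, a prototype (FOR_REG < 0) appends
   " (.param.u32 %in_ar0", while a prologue copy (FOR_REG == 0) emits
   ".reg.u32 %ar0;" followed by "ld.param.u32 %ar0, [%in_ar0];".  */
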
/* Process function parameter TYPE to emit one or more PTX
   arguments.  S, FOR_REG and ARGNO as for write_arg_mode.  PROTOTYPED
   is true, if this is a prototyped function, rather than an old-style
   C declaration.  Returns the next argument number to use.

   The promotion behaviour here must match the regular GCC function
   parameter marshalling machinery.  */

static int
write_arg_type (std::stringstream &s, int for_reg, int argno,
                tree type, bool prototyped)
{
  machine_mode mode = TYPE_MODE (type);

  if (mode == VOIDmode)
    return argno;

  if (pass_in_memory (mode, type, false))
    mode = Pmode;
  else
    {
      bool split = TREE_CODE (type) == COMPLEX_TYPE;

      if (split)
        {
          /* Complex types are sent as two separate args.  */
          type = TREE_TYPE (type);
          mode = TYPE_MODE (type);
          prototyped = true;
        }

      mode = promote_arg (mode, prototyped);
      if (split)
        argno = write_arg_mode (s, for_reg, argno, mode);
    }

  return write_arg_mode (s, for_reg, argno, mode);
}

/* Emit a PTX return as a prototype or function prologue declaration
   for MODE.  */

static void
write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode)
{
  const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
  const char *pfx = "\t.reg";
  const char *sfx = ";\n";

  if (for_proto)
    pfx = "(.param", sfx = "_out) ";

  s << pfx << ptx_type << " " << reg_names[NVPTX_RETURN_REGNUM] << sfx;
}

/* Process a function return TYPE to emit a PTX return as a prototype
   or function prologue declaration.  Returns true if return is via an
   additional pointer parameter.  The promotion behaviour here must
   match the regular GCC function return marshalling.  */

static bool
write_return_type (std::stringstream &s, bool for_proto, tree type)
{
  machine_mode mode = TYPE_MODE (type);

  if (mode == VOIDmode)
    return false;

  bool return_in_mem = pass_in_memory (mode, type, true);

  if (return_in_mem)
    {
      if (for_proto)
        return return_in_mem;

      /* Named return values can cause us to return a pointer as well
         as expect an argument for the return location.  This is
         optimization-level specific, so no caller can make use of
         this data, but more importantly for us, we must ensure it
         doesn't change the PTX prototype.  */
      mode = (machine_mode) cfun->machine->return_mode;

      if (mode == VOIDmode)
        return return_in_mem;

      /* Clear return_mode to inhibit copy of retval to non-existent
         retval parameter.  */
      cfun->machine->return_mode = VOIDmode;
    }
  else
    mode = promote_return (mode);

  write_return_mode (s, for_proto, mode);

  return return_in_mem;
}

/* Look for attributes in ATTRS that would indicate we must write a function
   as a .entry kernel rather than a .func.  Return true if one is found.  */

static bool
write_as_kernel (tree attrs)
{
  return (lookup_attribute ("kernel", attrs) != NULL_TREE
          || lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE);
}

/* Emit a linker marker for a function decl or defn.  */

static void
write_fn_marker (std::stringstream &s, bool is_defn, bool globalize,
                 const char *name)
{
  s << "\n// BEGIN";
  if (globalize)
    s << " GLOBAL";
  s << " FUNCTION " << (is_defn ? "DEF: " : "DECL: ");
  s << name << "\n";
}

/* Emit a linker marker for a variable decl or defn.  */

static void
write_var_marker (FILE *file, bool is_defn, bool globalize, const char *name)
{
  fprintf (file, "\n// BEGIN%s VAR %s: ",
           globalize ? " GLOBAL" : "",
           is_defn ? "DEF" : "DECL");
  assemble_name_raw (file, name);
  fputs ("\n", file);
}

/* Write a .func or .kernel declaration or definition along with
   a helper comment for use by ld.  S is the stream to write to, DECL
   the decl for the function with name NAME.  For definitions, emit
   a declaration too.  */

static const char *
write_fn_proto (std::stringstream &s, bool is_defn,
                const char *name, const_tree decl)
{
  if (is_defn)
    /* Emit a declaration.  The PTX assembler gets upset without it.  */
    name = write_fn_proto (s, false, name, decl);
  else
    {
      /* Avoid repeating the name replacement.  */
      name = nvptx_name_replacement (name);
      if (name[0] == '*')
        name++;
    }

  write_fn_marker (s, is_defn, TREE_PUBLIC (decl), name);

  /* PTX declaration.  */
  if (DECL_EXTERNAL (decl))
    s << ".extern ";
  else if (TREE_PUBLIC (decl))
    s << (DECL_WEAK (decl) ? ".weak " : ".visible ");
  s << (write_as_kernel (DECL_ATTRIBUTES (decl)) ? ".entry " : ".func ");

  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);

  /* Declare the result.  */
  bool return_in_mem = write_return_type (s, true, result_type);

  s << name;

  int argno = 0;

  /* Emit argument list.  */
  if (return_in_mem)
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  /* We get:
     NULL in TYPE_ARG_TYPES, for old-style functions
     NULL in DECL_ARGUMENTS, for builtin functions without another
     declaration.
     So we have to pick the best one we have.  */
  tree args = TYPE_ARG_TYPES (fntype);
  bool prototyped = true;
  if (!args)
    {
      args = DECL_ARGUMENTS (decl);
      prototyped = false;
    }

  for (; args; args = TREE_CHAIN (args))
    {
      tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);

      argno = write_arg_type (s, -1, argno, type, prototyped);
    }

  if (stdarg_p (fntype))
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  if (DECL_STATIC_CHAIN (decl))
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  if (!argno && strcmp (name, "main") == 0)
    {
      argno = write_arg_type (s, -1, argno, integer_type_node, true);
      argno = write_arg_type (s, -1, argno, ptr_type_node, true);
    }

  if (argno)
    s << ")";

  s << (is_defn ? "\n" : ";\n");

  return name;
}

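/* Illustrative sketch (not in the original source): assuming the
   return register is named "%value", a public, prototyped
   "int foo (int)" would produce a declaration along the lines of

     // BEGIN GLOBAL FUNCTION DECL: foo
     .visible .func (.param.u32 %value_out) foo (.param.u32 %in_ar0);

   with the definition repeating that line minus the trailing
   semicolon.  */
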
/* Construct a function declaration from a call insn.  This can be
   necessary for two reasons - either we have an indirect call which
   requires a .callprototype declaration, or we have a libcall
   generated by emit_library_call for which no decl exists.  */

static void
write_fn_proto_from_insn (std::stringstream &s, const char *name,
                          rtx result, rtx pat)
{
  if (!name)
    {
      s << "\t.callprototype ";
      name = "_";
    }
  else
    {
      name = nvptx_name_replacement (name);
      write_fn_marker (s, false, true, name);
      s << "\t.extern .func ";
    }

  if (result != NULL_RTX)
    write_return_mode (s, true, GET_MODE (result));

  s << name;

  int arg_end = XVECLEN (pat, 0);
  for (int i = 1; i < arg_end; i++)
    {
      /* We don't have to deal with mode splitting & promotion here,
         as that was already done when generating the call
         sequence.  */
      machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0));

      write_arg_mode (s, -1, i - 1, mode);
    }
  if (arg_end != 1)
    s << ")";
  s << ";\n";
}

/* DECL is an external FUNCTION_DECL, make sure it's in the fndecl hash
   table and write a ptx prototype.  These are emitted at end of
   compilation.  */

static void
nvptx_record_fndecl (tree decl)
{
  tree *slot = declared_fndecls_htab->find_slot (decl, INSERT);
  if (*slot == NULL)
    {
      *slot = decl;
      const char *name = get_fnname_from_decl (decl);
      write_fn_proto (func_decls, false, name, decl);
    }
}

/* Record a libcall or unprototyped external function.  CALLEE is the
   SYMBOL_REF.  Insert into the libfunc hash table and emit a ptx
   declaration for it.  */

static void
nvptx_record_libfunc (rtx callee, rtx retval, rtx pat)
{
  rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
  if (*slot == NULL)
    {
      *slot = callee;

      const char *name = XSTR (callee, 0);
      write_fn_proto_from_insn (func_decls, name, retval, pat);
    }
}

/* DECL is an external FUNCTION_DECL, that we're referencing.  If it
   is prototyped, record it now.  Otherwise record it as needed at end
   of compilation, when we might have more information about it.  */

void
nvptx_record_needed_fndecl (tree decl)
{
  if (TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE)
    {
      tree *slot = needed_fndecls_htab->find_slot (decl, INSERT);
      if (*slot == NULL)
        *slot = decl;
    }
  else
    nvptx_record_fndecl (decl);
}

/* SYM is a SYMBOL_REF.  If it refers to an external function, record
   it as needed.  */

static void
nvptx_maybe_record_fnsym (rtx sym)
{
  tree decl = SYMBOL_REF_DECL (sym);

  if (decl && TREE_CODE (decl) == FUNCTION_DECL && DECL_EXTERNAL (decl))
    nvptx_record_needed_fndecl (decl);
}

/* Emit a local array to hold some part of a conventional stack frame
   and initialize REGNO to point to it.  If the size is zero, it'll
   never be valid to dereference, so we can simply initialize to
   zero.  */

static void
init_frame (FILE *file, int regno, unsigned align, unsigned size)
{
  if (size)
    fprintf (file, "\t.local .align %d .b8 %s_ar[%u];\n",
             align, reg_names[regno], size);
  fprintf (file, "\t.reg.u%d %s;\n",
           POINTER_SIZE, reg_names[regno]);
  fprintf (file, (size ? "\tcvta.local.u%d %s, %s_ar;\n"
                  : "\tmov.u%d %s, 0;\n"),
           POINTER_SIZE, reg_names[regno], reg_names[regno]);
}

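/* Illustrative output (not in the original source), assuming a 64-bit
   target, a frame register named "%frame", 8-byte alignment and a
   16-byte frame:

     .local .align 8 .b8 %frame_ar[16];
     .reg.u64 %frame;
     cvta.local.u64 %frame, %frame_ar;

   A zero-sized frame instead emits "mov.u64 %frame, 0;".  */
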
/* Emit code to initialize the REGNO predicate register to indicate
   whether we are not lane zero on the NAME axis.  */

static void
nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
{
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
  fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
  fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
  fprintf (file, "\t}\n");
}

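/* Illustrative output (not in the original source) for REGNO 42 on
   the "y" (worker) axis:

     {
       .reg.u32 %y;
       mov.u32 %y, %tid.y;
       setp.ne.u32 %r42, %y, 0;
     }

   i.e. %r42 becomes true for every thread that is not lane zero on
   that axis.  */
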
/* Implement ASM_DECLARE_FUNCTION_NAME.  Writes the start of a ptx
   function, including local var decls and copies from the arguments to
   local regs.  */

void
nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
{
  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);
  int argno = 0;

  /* We construct the initial part of the function into a string
     stream, in order to share the prototype writing code.  */
  std::stringstream s;
  write_fn_proto (s, true, name, decl);
  s << "{\n";

  bool return_in_mem = write_return_type (s, false, result_type);
  if (return_in_mem)
    argno = write_arg_type (s, 0, argno, ptr_type_node, true);

  /* Declare and initialize incoming arguments.  */
  tree args = TYPE_ARG_TYPES (fntype);
  bool prototyped = true;
  if (!args)
    {
      args = DECL_ARGUMENTS (decl);
      prototyped = false;
    }

  for (; args != NULL_TREE; args = TREE_CHAIN (args))
    {
      tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);

      argno = write_arg_type (s, 0, argno, type, prototyped);
    }

  if (stdarg_p (fntype))
    argno = write_arg_type (s, ARG_POINTER_REGNUM, argno, ptr_type_node,
                            true);

  if (DECL_STATIC_CHAIN (decl) || cfun->machine->has_chain)
    write_arg_type (s, STATIC_CHAIN_REGNUM,
                    DECL_STATIC_CHAIN (decl) ? argno : -1, ptr_type_node,
                    true);

  fprintf (file, "%s", s.str().c_str());

  /* Declare a local var for outgoing varargs.  */
  if (cfun->machine->has_varadic)
    init_frame (file, STACK_POINTER_REGNUM,
                UNITS_PER_WORD, crtl->outgoing_args_size);

  /* Declare a local variable for the frame.  */
  HOST_WIDE_INT sz = get_frame_size ();
  if (sz || cfun->machine->has_chain)
    init_frame (file, FRAME_POINTER_REGNUM,
                crtl->stack_alignment_needed / BITS_PER_UNIT, sz);

  /* Declare the pseudos we have as ptx registers.  */
  int maxregs = max_reg_num ();
  for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
    {
      if (regno_reg_rtx[i] != const0_rtx)
        {
          machine_mode mode = PSEUDO_REGNO_MODE (i);
          machine_mode split = maybe_split_mode (mode);

          if (split != VOIDmode)
            mode = split;
          fprintf (file, "\t.reg%s ", nvptx_ptx_type_from_mode (mode, true));
          output_reg (file, i, split, -2);
          fprintf (file, ";\n");
        }
    }

  /* Emit axis predicates.  */
  if (cfun->machine->axis_predicate[0])
    nvptx_init_axis_predicate (file,
                               REGNO (cfun->machine->axis_predicate[0]), "y");
  if (cfun->machine->axis_predicate[1])
    nvptx_init_axis_predicate (file,
                               REGNO (cfun->machine->axis_predicate[1]), "x");
}

/* Output a return instruction.  Also copy the return value to its outgoing
   location.  */

const char *
nvptx_output_return (void)
{
  machine_mode mode = (machine_mode)cfun->machine->return_mode;

  if (mode != VOIDmode)
    fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n",
             nvptx_ptx_type_from_mode (mode, false),
             reg_names[NVPTX_RETURN_REGNUM],
             reg_names[NVPTX_RETURN_REGNUM]);

  return "ret;";
}

/* Terminate a function by writing a closing brace to FILE.  */

void
nvptx_function_end (FILE *file)
{
  fprintf (file, "}\n");
}

/* Decide whether we can make a sibling call to a function.  For ptx, we
   can't.  */

static bool
nvptx_function_ok_for_sibcall (tree, tree)
{
  return false;
}

/* Return Dynamic ReAlignment Pointer RTX.  For PTX there isn't any.  */

static rtx
nvptx_get_drap_rtx (void)
{
  return NULL_RTX;
}

/* Implement the TARGET_CALL_ARGS hook.  Record information about one
   argument to the next call.  */

static void
nvptx_call_args (rtx arg, tree fntype)
{
  if (!cfun->machine->doing_call)
    {
      cfun->machine->doing_call = true;
      cfun->machine->is_varadic = false;
      cfun->machine->num_args = 0;

      if (fntype && stdarg_p (fntype))
        {
          cfun->machine->is_varadic = true;
          cfun->machine->has_varadic = true;
          cfun->machine->num_args++;
        }
    }

  if (REG_P (arg) && arg != pc_rtx)
    {
      cfun->machine->num_args++;
      cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg,
                                                  cfun->machine->call_args);
    }
}

/* Implement the corresponding END_CALL_ARGS hook.  Clear and free the
   information we recorded.  */

static void
nvptx_end_call_args (void)
{
  cfun->machine->doing_call = false;
  free_EXPR_LIST_list (&cfun->machine->call_args);
}

/* Emit the sequence for a call to ADDRESS, setting RETVAL.  Keep
   track of whether calls involving static chains or varargs were seen
   in the current function.
   For libcalls, maintain a hash table of decls we have seen, and
   record a function decl for later when encountering a new one.  */

void
nvptx_expand_call (rtx retval, rtx address)
{
  rtx callee = XEXP (address, 0);
  rtx varargs = NULL_RTX;
  unsigned parallel = 0;

  if (!call_insn_operand (callee, Pmode))
    {
      callee = force_reg (Pmode, callee);
      address = change_address (address, QImode, callee);
    }

  if (GET_CODE (callee) == SYMBOL_REF)
    {
      tree decl = SYMBOL_REF_DECL (callee);
      if (decl != NULL_TREE)
        {
          if (DECL_STATIC_CHAIN (decl))
            cfun->machine->has_chain = true;

          tree attr = get_oacc_fn_attrib (decl);
          if (attr)
            {
              tree dims = TREE_VALUE (attr);

              parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
              for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
                {
                  if (TREE_PURPOSE (dims)
                      && !integer_zerop (TREE_PURPOSE (dims)))
                    break;
                  /* Not on this axis.  */
                  parallel ^= GOMP_DIM_MASK (ix);
                  dims = TREE_CHAIN (dims);
                }
            }
        }
    }

  unsigned nargs = cfun->machine->num_args;
  if (cfun->machine->is_varadic)
    {
      varargs = gen_reg_rtx (Pmode);
      emit_move_insn (varargs, stack_pointer_rtx);
    }

  rtvec vec = rtvec_alloc (nargs + 1);
  rtx pat = gen_rtx_PARALLEL (VOIDmode, vec);
  int vec_pos = 0;

  rtx call = gen_rtx_CALL (VOIDmode, address, const0_rtx);
  rtx tmp_retval = retval;
  if (retval)
    {
      if (!nvptx_register_operand (retval, GET_MODE (retval)))
        tmp_retval = gen_reg_rtx (GET_MODE (retval));
      call = gen_rtx_SET (tmp_retval, call);
    }
  XVECEXP (pat, 0, vec_pos++) = call;

  /* Construct the call insn, including a USE for each argument pseudo
     register.  These will be used when printing the insn.  */
  for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
    XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, XEXP (arg, 0));

  if (varargs)
    XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);

  gcc_assert (vec_pos == XVECLEN (pat, 0));

  nvptx_emit_forking (parallel, true);
  emit_call_insn (pat);
  nvptx_emit_joining (parallel, true);

  if (tmp_retval != retval)
    emit_move_insn (retval, tmp_retval);
}

/* Emit a comparison COMPARE, and return the new test to be used in the
   jump.  */

rtx
nvptx_expand_compare (rtx compare)
{
  rtx pred = gen_reg_rtx (BImode);
  rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
                            XEXP (compare, 0), XEXP (compare, 1));
  emit_insn (gen_rtx_SET (pred, cmp));
  return gen_rtx_NE (BImode, pred, const0_rtx);
}

/* Expand the oacc fork & join primitive into ptx-required unspecs.  */

void
nvptx_expand_oacc_fork (unsigned mode)
{
  nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
}

void
nvptx_expand_oacc_join (unsigned mode)
{
  nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
}

/* Generate instruction(s) to unpack a 64-bit object into two 32-bit
   objects.  */

static rtx
nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
{
  rtx res;

  switch (GET_MODE (src))
    {
    case DImode:
      res = gen_unpackdisi2 (dst0, dst1, src);
      break;
    case DFmode:
      res = gen_unpackdfsi2 (dst0, dst1, src);
      break;
    default: gcc_unreachable ();
    }
  return res;
}

/* Generate instruction(s) to pack two 32-bit objects into a 64-bit
   object.  */

static rtx
nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
{
  rtx res;

  switch (GET_MODE (dst))
    {
    case DImode:
      res = gen_packsidi2 (dst, src0, src1);
      break;
    case DFmode:
      res = gen_packsidf2 (dst, src0, src1);
      break;
    default: gcc_unreachable ();
    }
  return res;
}

/* Generate an instruction or sequence to broadcast register REG
   across the vectors of a single warp.  */

static rtx
nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
{
  rtx res;

  switch (GET_MODE (dst))
    {
    case SImode:
      res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
      break;
    case SFmode:
      res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
      break;
    case DImode:
    case DFmode:
      {
        rtx tmp0 = gen_reg_rtx (SImode);
        rtx tmp1 = gen_reg_rtx (SImode);

        start_sequence ();
        emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
        emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
        emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
        emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
        res = get_insns ();
        end_sequence ();
      }
      break;
    case BImode:
      {
        rtx tmp = gen_reg_rtx (SImode);

        start_sequence ();
        emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
        emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
        emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
        res = get_insns ();
        end_sequence ();
      }
      break;

    default:
      gcc_unreachable ();
    }
  return res;
}

/* Generate an instruction or sequence to broadcast register REG
   across the vectors of a single warp.  */

static rtx
nvptx_gen_vcast (rtx reg)
{
  return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
}

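/* For illustration (not in the original source): a vector-wide
   broadcast of a DImode value is a SHUFFLE_IDX shuffle from lane 0,
   implemented by unpacking into two SImode halves, doing one SImode
   shuffle per half, and repacking; BImode predicates take a detour
   through an SImode 0/1 value for the same reason.  */
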
/* Structure used when generating a worker-level spill or fill.  */

struct wcast_data_t
{
  rtx base;  /* Register holding base addr of buffer.  */
  rtx ptr;  /* Iteration var, if needed.  */
  unsigned offset;  /* Offset into worker buffer.  */
};

/* Direction of the spill/fill and looping setup/teardown indicator.  */

enum propagate_mask
{
  PM_read = 1 << 0,
  PM_write = 1 << 1,
  PM_loop_begin = 1 << 2,
  PM_loop_end = 1 << 3,

  PM_read_write = PM_read | PM_write
};

/* Generate instruction(s) to spill or fill register REG to/from the
   worker broadcast array.  PM indicates what is to be done, REP
   how many loop iterations will be executed (0 for not a loop).  */

static rtx
nvptx_gen_wcast (rtx reg, propagate_mask pm, unsigned rep, wcast_data_t *data)
{
  rtx res;
  machine_mode mode = GET_MODE (reg);

  switch (mode)
    {
    case BImode:
      {
        rtx tmp = gen_reg_rtx (SImode);

        start_sequence ();
        if (pm & PM_read)
          emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
        emit_insn (nvptx_gen_wcast (tmp, pm, rep, data));
        if (pm & PM_write)
          emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
        res = get_insns ();
        end_sequence ();
      }
      break;

    default:
      {
        rtx addr = data->ptr;

        if (!addr)
          {
            unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;

            if (align > worker_bcast_align)
              worker_bcast_align = align;
            data->offset = (data->offset + align - 1) & ~(align - 1);
            addr = data->base;
            if (data->offset)
              addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
          }

        addr = gen_rtx_MEM (mode, addr);
        if (pm == PM_read)
          res = gen_rtx_SET (addr, reg);
        else if (pm == PM_write)
          res = gen_rtx_SET (reg, addr);
        else
          gcc_unreachable ();

        if (data->ptr)
          {
            /* We're using a ptr, increment it.  */
            start_sequence ();

            emit_insn (res);
            emit_insn (gen_adddi3 (data->ptr, data->ptr,
                                   GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
            res = get_insns ();
            end_sequence ();
          }
        else
          rep = 1;
        data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
      }
      break;
    }
  return res;
}

/* Returns true if X is a valid address for use in a memory reference.  */

static bool
nvptx_legitimate_address_p (machine_mode, rtx x, bool)
{
  enum rtx_code code = GET_CODE (x);

  switch (code)
    {
    case REG:
      return true;

    case PLUS:
      if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1)))
        return true;
      return false;

    case CONST:
    case SYMBOL_REF:
    case LABEL_REF:
      return true;

    default:
      return false;
    }
}

/* Machinery to output constant initializers.  When beginning an
   initializer, we decide on a fragment size (which is visible in ptx
   in the type used), and then all initializer data is buffered until
   a fragment is filled and ready to be written out.  */

static struct
{
  unsigned HOST_WIDE_INT mask; /* Mask for storing fragment.  */
  unsigned HOST_WIDE_INT val; /* Current fragment value.  */
  unsigned HOST_WIDE_INT remaining; /* Remaining bytes to be written
                                       out.  */
  unsigned size; /* Fragment size to accumulate.  */
  unsigned offset; /* Offset within current fragment.  */
  bool started; /* Whether we've output any initializer.  */
} init_frag;

/* The current fragment is full, write it out.  SYM may provide a
   symbolic reference we should output, in which case the fragment
   value is the addend.  */

static void
output_init_frag (rtx sym)
{
  fprintf (asm_out_file, init_frag.started ? ", " : " = { ");
  unsigned HOST_WIDE_INT val = init_frag.val;

  init_frag.started = true;
  init_frag.val = 0;
  init_frag.offset = 0;
  init_frag.remaining--;

  if (sym)
    {
      fprintf (asm_out_file, "generic(");
      output_address (VOIDmode, sym);
      fprintf (asm_out_file, val ? ") + " : ")");
    }

  if (!sym || val)
    fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val);
}

/* Add value VAL of size SIZE to the data we're emitting, and keep
   writing out chunks as they fill up.  */

static void
nvptx_assemble_value (unsigned HOST_WIDE_INT val, unsigned size)
{
  val &= ((unsigned HOST_WIDE_INT)2 << (size * BITS_PER_UNIT - 1)) - 1;

  for (unsigned part = 0; size; size -= part)
    {
      val >>= part * BITS_PER_UNIT;
      part = init_frag.size - init_frag.offset;
      if (part > size)
        part = size;

      unsigned HOST_WIDE_INT partial
        = val << (init_frag.offset * BITS_PER_UNIT);
      init_frag.val |= partial & init_frag.mask;
      init_frag.offset += part;

      if (init_frag.offset == init_frag.size)
        output_init_frag (NULL);
    }
}

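/* Worked example (not in the original source): with a fragment size
   of 4, emitting the bytes 1, 2, 3, 4 one at a time accumulates
   0x04030201 in init_frag.val (little-endian packing via the shift by
   init_frag.offset * BITS_PER_UNIT), and the fourth byte triggers
   output_init_frag to print the completed word.  */
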
/* Target hook for assembling integer object X of size SIZE.  */

static bool
nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p))
{
  HOST_WIDE_INT val = 0;

  switch (GET_CODE (x))
    {
    default:
      /* Let the generic machinery figure it out, usually for a
         CONST_WIDE_INT.  */
      return false;

    case CONST_INT:
      nvptx_assemble_value (INTVAL (x), size);
      break;

    case CONST:
      x = XEXP (x, 0);
      gcc_assert (GET_CODE (x) == PLUS);
      val = INTVAL (XEXP (x, 1));
      x = XEXP (x, 0);
      gcc_assert (GET_CODE (x) == SYMBOL_REF);
      /* FALLTHROUGH */

    case SYMBOL_REF:
      gcc_assert (size == init_frag.size);
      if (init_frag.offset)
        sorry ("cannot emit unaligned pointers in ptx assembly");

      nvptx_maybe_record_fnsym (x);
      init_frag.val = val;
      output_init_frag (x);
      break;
    }

  return true;
}

/* Output SIZE zero bytes.  We ignore the FILE argument since the
   functions we're calling to perform the output just use
   asm_out_file.  */

void
nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
{
  /* Finish the current fragment, if it's started.  */
  if (init_frag.offset)
    {
      unsigned part = init_frag.size - init_frag.offset;
      if (part > size)
        part = (unsigned) size;
      size -= part;
      nvptx_assemble_value (0, part);
    }

  /* If this skip doesn't terminate the initializer, write as many
     remaining pieces as possible directly.  */
  if (size < init_frag.remaining * init_frag.size)
    {
      while (size >= init_frag.size)
        {
          size -= init_frag.size;
          output_init_frag (NULL_RTX);
        }
      if (size)
        nvptx_assemble_value (0, size);
    }
}

/* Output a string STR with length SIZE.  As in nvptx_output_skip we
   ignore the FILE arg.  */

void
nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
{
  for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
    nvptx_assemble_value (str[i], 1);
}

/* Emit a PTX variable decl and prepare for emission of its
   initializer.  NAME is the symbol name and SECTION the PTX data
   area.  The type is TYPE, object size SIZE and alignment is ALIGN.
   The caller has already emitted any indentation and linkage
   specifier.  It is responsible for any initializer, terminating ;
   and newline.  SIZE is in bytes, ALIGN is in bits -- confusingly
   this is the opposite way round that PTX wants them!  */

static void
nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section,
                           const_tree type, HOST_WIDE_INT size, unsigned align)
{
  while (TREE_CODE (type) == ARRAY_TYPE)
    type = TREE_TYPE (type);

  if (TREE_CODE (type) == VECTOR_TYPE
      || TREE_CODE (type) == COMPLEX_TYPE)
    /* Neither vector nor complex types can contain the other.  */
    type = TREE_TYPE (type);

  unsigned elt_size = int_size_in_bytes (type);

  /* Largest mode we're prepared to accept.  For BLKmode types we
     don't know if it'll contain pointer constants, so have to choose
     pointer size, otherwise we can choose DImode.  */
  machine_mode elt_mode = TYPE_MODE (type) == BLKmode ? Pmode : DImode;

  elt_size |= GET_MODE_SIZE (elt_mode);
  elt_size &= -elt_size; /* Extract LSB set.  */

  init_frag.size = elt_size;
  /* Avoid undefined shift behaviour by using '2'.  */
  init_frag.mask = ((unsigned HOST_WIDE_INT)2
                    << (elt_size * BITS_PER_UNIT - 1)) - 1;
  init_frag.val = 0;
  init_frag.offset = 0;
  init_frag.started = false;
  /* Size might not be a multiple of elt size, if there's an
     initialized trailing struct array with smaller type than
     elt_size.  */
  init_frag.remaining = (size + elt_size - 1) / elt_size;

  fprintf (file, "%s .align %d .u%d ",
           section, align / BITS_PER_UNIT,
           elt_size * BITS_PER_UNIT);
  assemble_name (file, name);

  if (size)
    /* We make everything an array, to simplify any initialization
       emission.  */
    fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]", init_frag.remaining);
}

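/* Illustrative output (not in the original source): a public global
   "int arr[2] = { 1, 2 };" would begin as something like

     .visible .global .align 4 .u32 arr[2]

   after which nvptx_assemble_integer appends " = { 1, 2" and
   nvptx_assemble_decl_end closes with " };".  */
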
/* Called when the initializer for a decl has been completely output through
   combinations of the three functions above.  */

static void
nvptx_assemble_decl_end (void)
{
  if (init_frag.offset)
    /* This can happen with a packed struct with trailing array member.  */
    nvptx_assemble_value (0, init_frag.size - init_frag.offset);
  fprintf (asm_out_file, init_frag.started ? " };\n" : ";\n");
}

/* Output an uninitialized common or file-scope variable.  */

void
nvptx_output_aligned_decl (FILE *file, const char *name,
                           const_tree decl, HOST_WIDE_INT size, unsigned align)
{
  write_var_marker (file, true, TREE_PUBLIC (decl), name);

  /* If this is public, it is common.  The nearest thing we have to
     common is weak.  */
  fprintf (file, "\t%s", TREE_PUBLIC (decl) ? ".weak " : "");

  nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
                             TREE_TYPE (decl), size, align);
  nvptx_assemble_decl_end ();
}

/* Implement TARGET_ASM_DECLARE_CONSTANT_NAME.  Begin the process of
   writing a constant variable EXP with NAME and SIZE and its
   initializer to FILE.  */

static void
nvptx_asm_declare_constant_name (FILE *file, const char *name,
                                 const_tree exp, HOST_WIDE_INT obj_size)
{
  write_var_marker (file, true, false, name);

  fprintf (file, "\t");

  tree type = TREE_TYPE (exp);
  nvptx_assemble_decl_begin (file, name, ".const", type, obj_size,
                             TYPE_ALIGN (type));
}

/* Implement the ASM_DECLARE_OBJECT_NAME macro.  Used to start writing
   a variable DECL with NAME to FILE.  */

void
nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
{
  write_var_marker (file, true, TREE_PUBLIC (decl), name);

  fprintf (file, "\t%s", (!TREE_PUBLIC (decl) ? ""
                          : DECL_WEAK (decl) ? ".weak " : ".visible "));

  tree type = TREE_TYPE (decl);
  HOST_WIDE_INT obj_size = tree_to_shwi (DECL_SIZE_UNIT (decl));
  nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
                             type, obj_size, DECL_ALIGN (decl));
}

/* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing.  */

static void
nvptx_globalize_label (FILE *, const char *)
{
}

/* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL.  Write an extern
   declaration only for variable DECL with NAME to FILE.  */

static void
nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
{
  /* The middle end can place constant pool decls into the varpool as
     undefined.  Until that is fixed, catch the problem here.  */
  if (DECL_IN_CONSTANT_POOL (decl))
    return;

  write_var_marker (file, false, TREE_PUBLIC (decl), name);

  fprintf (file, "\t.extern ");
  tree size = DECL_SIZE_UNIT (decl);
  nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
                             TREE_TYPE (decl), size ? tree_to_shwi (size) : 0,
                             DECL_ALIGN (decl));
  nvptx_assemble_decl_end ();
}

/* Output a pattern for a move instruction.  */

const char *
nvptx_output_mov_insn (rtx dst, rtx src)
{
  machine_mode dst_mode = GET_MODE (dst);
  machine_mode dst_inner = (GET_CODE (dst) == SUBREG
                            ? GET_MODE (XEXP (dst, 0)) : dst_mode);
  machine_mode src_inner = (GET_CODE (src) == SUBREG
                            ? GET_MODE (XEXP (src, 0)) : dst_mode);

  rtx sym = src;
  if (GET_CODE (sym) == CONST)
    sym = XEXP (XEXP (sym, 0), 0);
  if (SYMBOL_REF_P (sym))
    {
      if (SYMBOL_DATA_AREA (sym) != DATA_AREA_GENERIC)
        return "%.\tcvta%D1%t0\t%0, %1;";
      nvptx_maybe_record_fnsym (sym);
    }

  if (src_inner == dst_inner)
    return "%.\tmov%t0\t%0, %1;";

  if (CONSTANT_P (src))
    return (GET_MODE_CLASS (dst_inner) == MODE_INT
            && GET_MODE_CLASS (src_inner) != MODE_FLOAT
            ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");

  if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner))
    return "%.\tmov.b%T0\t%0, %1;";

  return "%.\tcvt%t0%t1\t%0, %1;";
}

/* Output INSN, which is a call to CALLEE with result RESULT.  For ptx, this
   involves writing .param declarations and in/out copies into them.  For
   indirect calls, also write the .callprototype.  */

const char *
nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
{
  char buf[16];
  static int labelno;
  bool needs_tgt = register_operand (callee, Pmode);
  rtx pat = PATTERN (insn);
  int arg_end = XVECLEN (pat, 0);
  tree decl = NULL_TREE;

  fprintf (asm_out_file, "\t{\n");
  if (result != NULL)
    fprintf (asm_out_file, "\t\t.param%s %s_in;\n",
             nvptx_ptx_type_from_mode (GET_MODE (result), false),
             reg_names[NVPTX_RETURN_REGNUM]);

  /* Ensure we have a ptx declaration in the output if necessary.  */
  if (GET_CODE (callee) == SYMBOL_REF)
    {
      decl = SYMBOL_REF_DECL (callee);
      if (!decl
          || (DECL_EXTERNAL (decl) && !TYPE_ARG_TYPES (TREE_TYPE (decl))))
        nvptx_record_libfunc (callee, result, pat);
      else if (DECL_EXTERNAL (decl))
        nvptx_record_fndecl (decl);
    }

  if (needs_tgt)
    {
      ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
      labelno++;
      ASM_OUTPUT_LABEL (asm_out_file, buf);
      std::stringstream s;
      write_fn_proto_from_insn (s, NULL, result, pat);
      fputs (s.str().c_str(), asm_out_file);
    }

  for (int argno = 1; argno < arg_end; argno++)
    {
      rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
      machine_mode mode = GET_MODE (t);
      const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);

      /* Mode splitting has already been done.  */
      fprintf (asm_out_file, "\t\t.param%s %%out_arg%d;\n"
               "\t\tst.param%s [%%out_arg%d], ",
               ptx_type, argno, ptx_type, argno);
      output_reg (asm_out_file, REGNO (t), VOIDmode);
      fprintf (asm_out_file, ";\n");
    }

  fprintf (asm_out_file, "\t\tcall ");
  if (result != NULL_RTX)
    fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]);

  if (decl)
    {
      const char *name = get_fnname_from_decl (decl);
      name = nvptx_name_replacement (name);
      assemble_name (asm_out_file, name);
    }
  else
    output_address (VOIDmode, callee);

  const char *open = "(";
  for (int argno = 1; argno < arg_end; argno++)
    {
      fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
      open = "";
    }
  if (decl && DECL_STATIC_CHAIN (decl))
    {
      fprintf (asm_out_file, ", %s%s", open, reg_names [STATIC_CHAIN_REGNUM]);
      open = "";
    }
  if (!open[0])
    fprintf (asm_out_file, ")");

  if (needs_tgt)
    {
      fprintf (asm_out_file, ", ");
      assemble_name (asm_out_file, buf);
    }
  fprintf (asm_out_file, ";\n");

  if (find_reg_note (insn, REG_NORETURN, NULL))
    /* Noreturn functions confuse the PTX JIT, as it doesn't realize
       the flow control barrier they imply.  It can seg fault if it
       encounters what looks like an unexitable loop.  Emit a trailing
       trap, which it does grok.  */
    fprintf (asm_out_file, "\t\ttrap; // (noreturn)\n");

  if (result)
    {
      static char rval[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];

      if (!rval[0])
        /* We must escape the '%' that starts RETURN_REGNUM.  */
        sprintf (rval, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
                 reg_names[NVPTX_RETURN_REGNUM]);
      return rval;
    }

  return "}";
}

/* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P.  */

static bool
nvptx_print_operand_punct_valid_p (unsigned char c)
{
  return c == '.' || c == '#';
}

static void nvptx_print_operand (FILE *, rtx, int);

/* Subroutine of nvptx_print_operand; used to print a memory reference
   X to FILE.  */

static void
nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
{
  rtx off;
  if (GET_CODE (x) == CONST)
    x = XEXP (x, 0);
  switch (GET_CODE (x))
    {
    case PLUS:
      off = XEXP (x, 1);
      output_address (VOIDmode, XEXP (x, 0));
      fprintf (file, "+");
      output_address (VOIDmode, off);
      break;

    case SYMBOL_REF:
    case LABEL_REF:
      output_addr_const (file, x);
      break;

    default:
      gcc_assert (GET_CODE (x) != MEM);
      nvptx_print_operand (file, x, 0);
      break;
    }
}

/* Write assembly language output for the address ADDR to FILE.  */

static void
nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
{
  nvptx_print_address_operand (file, addr, mode);
}

1928 /* Print an operand, X, to FILE, with an optional modifier in CODE.
1929
1930 Meaning of CODE:
1931 . -- print the predicate for the instruction or an emptry string for an
1932 unconditional one.
1933 # -- print a rounding mode for the instruction
1934
1935 A -- print a data area for a MEM
1936 c -- print an opcode suffix for a comparison operator, including a type code
1937 D -- print a data area for a MEM operand
1938 S -- print a shuffle kind specified by CONST_INT
1939 t -- print a type opcode suffix, promoting QImode to 32 bits
1940 T -- print a type size in bits
1941 u -- print a type opcode suffix without promotions. */
1942
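/* For instance, "%c" applied to a signed SImode (lt ...) comparison
   prints ".lt.s32", the same comparison in SFmode prints ".lt.f32",
   and an unsigned SImode (ltu ...) prints ".lo.u32" -- examples
   inferred from the cases below and from the type suffixes produced
   by nvptx_ptx_type_from_mode.  */
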
1943 static void
1944 nvptx_print_operand (FILE *file, rtx x, int code)
1945 {
1946 if (code == '.')
1947 {
1948 x = current_insn_predicate;
1949 if (x)
1950 {
1951 unsigned int regno = REGNO (XEXP (x, 0));
1952 fputs ("[", file);
1953 if (GET_CODE (x) == EQ)
1954 fputs ("!", file);
1955 fputs (reg_names [regno], file);
1956 fputs ("]", file);
1957 }
1958 return;
1959 }
1960 else if (code == '#')
1961 {
1962 fputs (".rn", file);
1963 return;
1964 }
1965
1966 enum rtx_code x_code = GET_CODE (x);
1967 machine_mode mode = GET_MODE (x);
1968
1969 switch (code)
1970 {
1971 case 'A':
1972 x = XEXP (x, 0);
1973 /* FALLTHROUGH. */
1974
1975 case 'D':
1976 if (GET_CODE (x) == CONST)
1977 x = XEXP (x, 0);
1978 if (GET_CODE (x) == PLUS)
1979 x = XEXP (x, 0);
1980
1981 if (GET_CODE (x) == SYMBOL_REF)
1982 fputs (section_for_sym (x), file);
1983 break;
1984
1985 case 't':
1986 case 'u':
1987 if (x_code == SUBREG)
1988 {
1989 mode = GET_MODE (SUBREG_REG (x));
1990 if (mode == TImode)
1991 mode = DImode;
1992 else if (COMPLEX_MODE_P (mode))
1993 mode = GET_MODE_INNER (mode);
1994 }
1995 fprintf (file, "%s", nvptx_ptx_type_from_mode (mode, code == 't'));
1996 break;
1997
1998 case 'S':
1999 {
2000 nvptx_shuffle_kind kind = (nvptx_shuffle_kind) UINTVAL (x);
2001 /* Same order as nvptx_shuffle_kind. */
2002 static const char *const kinds[] =
2003 {".up", ".down", ".bfly", ".idx"};
2004 fputs (kinds[kind], file);
2005 }
2006 break;
2007
2008 case 'T':
2009 fprintf (file, "%d", GET_MODE_BITSIZE (mode));
2010 break;
2011
2012 case 'j':
2013 fprintf (file, "@");
2014 goto common;
2015
2016 case 'J':
2017 fprintf (file, "@!");
2018 goto common;
2019
2020 case 'c':
2021 mode = GET_MODE (XEXP (x, 0));
2022 switch (x_code)
2023 {
2024 case EQ:
2025 fputs (".eq", file);
2026 break;
2027 case NE:
2028 if (FLOAT_MODE_P (mode))
2029 fputs (".neu", file);
2030 else
2031 fputs (".ne", file);
2032 break;
2033 case LE:
2034 fputs (".le", file);
2035 break;
2036 case GE:
2037 fputs (".ge", file);
2038 break;
2039 case LT:
2040 fputs (".lt", file);
2041 break;
2042 case GT:
2043 fputs (".gt", file);
2044 break;
2045 case LEU:
2046 fputs (".ls", file);
2047 break;
2048 case GEU:
2049 fputs (".hs", file);
2050 break;
2051 case LTU:
2052 fputs (".lo", file);
2053 break;
2054 case GTU:
2055 fputs (".hi", file);
2056 break;
2057 case LTGT:
2058 fputs (".ne", file);
2059 break;
2060 case UNEQ:
2061 fputs (".equ", file);
2062 break;
2063 case UNLE:
2064 fputs (".leu", file);
2065 break;
2066 case UNGE:
2067 fputs (".geu", file);
2068 break;
2069 case UNLT:
2070 fputs (".ltu", file);
2071 break;
2072 case UNGT:
2073 fputs (".gtu", file);
2074 break;
2075 case UNORDERED:
2076 fputs (".nan", file);
2077 break;
2078 case ORDERED:
2079 fputs (".num", file);
2080 break;
2081 default:
2082 gcc_unreachable ();
2083 }
2084 if (FLOAT_MODE_P (mode)
2085 || x_code == EQ || x_code == NE
2086 || x_code == GEU || x_code == GTU
2087 || x_code == LEU || x_code == LTU)
2088 fputs (nvptx_ptx_type_from_mode (mode, true), file);
2089 else
2090 fprintf (file, ".s%d", GET_MODE_BITSIZE (mode));
2091 break;
2092 default:
2093 common:
2094 switch (x_code)
2095 {
2096 case SUBREG:
2097 {
2098 rtx inner_x = SUBREG_REG (x);
2099 machine_mode inner_mode = GET_MODE (inner_x);
2100 machine_mode split = maybe_split_mode (inner_mode);
2101
2102 if (split != VOIDmode
2103 && (GET_MODE_SIZE (inner_mode) == GET_MODE_SIZE (mode)))
2104 output_reg (file, REGNO (inner_x), split);
2105 else
2106 output_reg (file, REGNO (inner_x), split, SUBREG_BYTE (x));
2107 }
2108 break;
2109
2110 case REG:
2111 output_reg (file, REGNO (x), maybe_split_mode (mode));
2112 break;
2113
2114 case MEM:
2115 fputc ('[', file);
2116 nvptx_print_address_operand (file, XEXP (x, 0), mode);
2117 fputc (']', file);
2118 break;
2119
2120 case CONST_INT:
2121 output_addr_const (file, x);
2122 break;
2123
2124 case CONST:
2125 case SYMBOL_REF:
2126 case LABEL_REF:
2127 /* We could use output_addr_const, but that can print things like
2128 "x-8", which breaks ptxas. Need to ensure it is output as
2129 "x+-8". */
2130 nvptx_print_address_operand (file, x, VOIDmode);
2131 break;
2132
2133 case CONST_DOUBLE:
2134 long vals[2];
2135 real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), mode);
2136 vals[0] &= 0xffffffff;
2137 vals[1] &= 0xffffffff;
2138 if (mode == SFmode)
2139 fprintf (file, "0f%08lx", vals[0]);
2140 else
2141 fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
2142 break;
2143
2144 default:
2145 output_addr_const (file, x);
2146 }
2147 }
2148 }
2149 \f
2150 /* Record replacement regs used to deal with subreg operands. */
2151 struct reg_replace
2152 {
2153 rtx replacement[MAX_RECOG_OPERANDS];
2154 machine_mode mode;
2155 int n_allocated;
2156 int n_in_use;
2157 };
2158
2159 /* Allocate or reuse a replacement in R and return the rtx. */
2160
2161 static rtx
2162 get_replacement (struct reg_replace *r)
2163 {
2164 if (r->n_allocated == r->n_in_use)
2165 r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
2166 return r->replacement[r->n_in_use++];
2167 }
2168
2169 /* Clean up subreg operands. In ptx assembly, everything is typed, and
2170 the presence of subregs would break the rules for most instructions.
2171 Replace them with a suitable new register of the right size, plus
2172 conversion copyin/copyout instructions. */
2173
2174 static void
2175 nvptx_reorg_subreg (void)
2176 {
2177 struct reg_replace qiregs, hiregs, siregs, diregs;
2178 rtx_insn *insn, *next;
2179
2180 qiregs.n_allocated = 0;
2181 hiregs.n_allocated = 0;
2182 siregs.n_allocated = 0;
2183 diregs.n_allocated = 0;
2184 qiregs.mode = QImode;
2185 hiregs.mode = HImode;
2186 siregs.mode = SImode;
2187 diregs.mode = DImode;
2188
2189 for (insn = get_insns (); insn; insn = next)
2190 {
2191 next = NEXT_INSN (insn);
2192 if (!NONDEBUG_INSN_P (insn)
2193 || asm_noperands (PATTERN (insn)) >= 0
2194 || GET_CODE (PATTERN (insn)) == USE
2195 || GET_CODE (PATTERN (insn)) == CLOBBER)
2196 continue;
2197
2198 qiregs.n_in_use = 0;
2199 hiregs.n_in_use = 0;
2200 siregs.n_in_use = 0;
2201 diregs.n_in_use = 0;
2202 extract_insn (insn);
2203 enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);
2204
2205 for (int i = 0; i < recog_data.n_operands; i++)
2206 {
2207 rtx op = recog_data.operand[i];
2208 if (GET_CODE (op) != SUBREG)
2209 continue;
2210
2211 rtx inner = SUBREG_REG (op);
2212
2213 machine_mode outer_mode = GET_MODE (op);
2214 machine_mode inner_mode = GET_MODE (inner);
2215 gcc_assert (s_ok);
2216 if (s_ok
2217 && (GET_MODE_PRECISION (inner_mode)
2218 >= GET_MODE_PRECISION (outer_mode)))
2219 continue;
2220 gcc_assert (SCALAR_INT_MODE_P (outer_mode));
2221 struct reg_replace *r = (outer_mode == QImode ? &qiregs
2222 : outer_mode == HImode ? &hiregs
2223 : outer_mode == SImode ? &siregs
2224 : &diregs);
2225 rtx new_reg = get_replacement (r);
2226
2227 if (recog_data.operand_type[i] != OP_OUT)
2228 {
2229 enum rtx_code code;
2230 if (GET_MODE_PRECISION (inner_mode)
2231 < GET_MODE_PRECISION (outer_mode))
2232 code = ZERO_EXTEND;
2233 else
2234 code = TRUNCATE;
2235
2236 rtx pat = gen_rtx_SET (new_reg,
2237 gen_rtx_fmt_e (code, outer_mode, inner));
2238 emit_insn_before (pat, insn);
2239 }
2240
2241 if (recog_data.operand_type[i] != OP_IN)
2242 {
2243 enum rtx_code code;
2244 if (GET_MODE_PRECISION (inner_mode)
2245 < GET_MODE_PRECISION (outer_mode))
2246 code = TRUNCATE;
2247 else
2248 code = ZERO_EXTEND;
2249
2250 rtx pat = gen_rtx_SET (inner,
2251 gen_rtx_fmt_e (code, inner_mode, new_reg));
2252 emit_insn_after (pat, insn);
2253 }
2254 validate_change (insn, recog_data.operand_loc[i], new_reg, false);
2255 }
2256 }
2257 }
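
/* For example, a paradoxical (subreg:SI (reg:QI 100) 0) used as an
   input operand is, in sketch, replaced by a fresh SImode register:

       (set (reg:SI 101) (zero_extend:SI (reg:QI 100)))   -- before the insn
       ... the insn itself now uses (reg:SI 101) ...

   For an output operand, a truncating copy back into register 100 is
   emitted after the insn instead (register numbers invented).  */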
2258
2259 /* Loop structure of the function. The entire function is described as
2260 a NULL loop. */
2261
2262 struct parallel
2263 {
2264 /* Parent parallel. */
2265 parallel *parent;
2266
2267 /* Next sibling parallel. */
2268 parallel *next;
2269
2270 /* First child parallel. */
2271 parallel *inner;
2272
2273 /* Partitioning mask of the parallel. */
2274 unsigned mask;
2275
2276 /* Partitioning used within inner parallels. */
2277 unsigned inner_mask;
2278
2279   /* Location of the parallel's forked and join blocks.  The forked
2280      block is the first block in the parallel and the join block is
2281      the first block after the partition.  */
2282 basic_block forked_block;
2283 basic_block join_block;
2284
2285 rtx_insn *forked_insn;
2286 rtx_insn *join_insn;
2287
2288 rtx_insn *fork_insn;
2289 rtx_insn *joining_insn;
2290
2291 /* Basic blocks in this parallel, but not in child parallels. The
2292 FORKED and JOINING blocks are in the partition. The FORK and JOIN
2293 blocks are not. */
2294 auto_vec<basic_block> blocks;
2295
2296 public:
2297 parallel (parallel *parent, unsigned mode);
2298 ~parallel ();
2299 };
2300
2301 /* Constructor links the new parallel into its parent's chain of
2302 children. */
2303
2304 parallel::parallel (parallel *parent_, unsigned mask_)
2305 :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
2306 {
2307 forked_block = join_block = 0;
2308 forked_insn = join_insn = 0;
2309 fork_insn = joining_insn = 0;
2310
2311 if (parent)
2312 {
2313 next = parent->inner;
2314 parent->inner = this;
2315 }
2316 }
2317
2318 parallel::~parallel ()
2319 {
2320 delete inner;
2321 delete next;
2322 }
2323
2324 /* Map of basic blocks to insns.  */
2325 typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;
2326
2327 /* A tuple of an insn of interest and the BB in which it resides. */
2328 typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
2329 typedef auto_vec<insn_bb_t> insn_bb_vec_t;
2330
2331 /* Split basic blocks such that the forked and join unspecs are at
2332 the start of their basic blocks. Thus afterwards each block will
2333 have a single partitioning mode. We also do the same for return
2334 insns, as they are executed by every thread. Return the
2335 partitioning mode of the function as a whole. Populate MAP with
2336 head and tail blocks. We also clear the BB visited flag, which is
2337 used when finding partitions. */
2338
2339 static void
2340 nvptx_split_blocks (bb_insn_map_t *map)
2341 {
2342 insn_bb_vec_t worklist;
2343 basic_block block;
2344 rtx_insn *insn;
2345
2346 /* Locate all the reorg instructions of interest. */
2347 FOR_ALL_BB_FN (block, cfun)
2348 {
2349 bool seen_insn = false;
2350
2351       /* Clear the visited flag, for use by the parallel locator.  */
2352 block->flags &= ~BB_VISITED;
2353
2354 FOR_BB_INSNS (block, insn)
2355 {
2356 if (!INSN_P (insn))
2357 continue;
2358 switch (recog_memoized (insn))
2359 {
2360 default:
2361 seen_insn = true;
2362 continue;
2363 case CODE_FOR_nvptx_forked:
2364 case CODE_FOR_nvptx_join:
2365 break;
2366
2367 case CODE_FOR_return:
2368 /* We also need to split just before return insns, as
2369 that insn needs executing by all threads, but the
2370 block it is in probably does not. */
2371 break;
2372 }
2373
2374 if (seen_insn)
2375 /* We've found an instruction that must be at the start of
2376 a block, but isn't. Add it to the worklist. */
2377 worklist.safe_push (insn_bb_t (insn, block));
2378 else
2379 /* It was already the first instruction. Just add it to
2380 the map. */
2381 map->get_or_insert (block) = insn;
2382 seen_insn = true;
2383 }
2384 }
2385
2386 /* Split blocks on the worklist. */
2387 unsigned ix;
2388 insn_bb_t *elt;
2389 basic_block remap = 0;
2390 for (ix = 0; worklist.iterate (ix, &elt); ix++)
2391 {
2392 if (remap != elt->second)
2393 {
2394 block = elt->second;
2395 remap = block;
2396 }
2397
2398       /* Split the block before the insn; the insn is in the new block.  */
2399 edge e = split_block (block, PREV_INSN (elt->first));
2400
2401 block = e->dest;
2402 map->get_or_insert (block) = elt->first;
2403 }
2404 }
2405
2406 /* BLOCK is a basic block containing a head or tail instruction.
2407 Locate the associated prehead or pretail instruction, which must be
2408 in the single predecessor block. */
2409
2410 static rtx_insn *
2411 nvptx_discover_pre (basic_block block, int expected)
2412 {
2413 gcc_assert (block->preds->length () == 1);
2414 basic_block pre_block = (*block->preds)[0]->src;
2415 rtx_insn *pre_insn;
2416
2417 for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
2418 pre_insn = PREV_INSN (pre_insn))
2419 gcc_assert (pre_insn != BB_HEAD (pre_block));
2420
2421 gcc_assert (recog_memoized (pre_insn) == expected);
2422 return pre_insn;
2423 }
2424
2425 /* Dump this parallel and all its inner parallels. */
2426
2427 static void
2428 nvptx_dump_pars (parallel *par, unsigned depth)
2429 {
2430 fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
2431 depth, par->mask,
2432 par->forked_block ? par->forked_block->index : -1,
2433 par->join_block ? par->join_block->index : -1);
2434
2435 fprintf (dump_file, " blocks:");
2436
2437 basic_block block;
2438 for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
2439 fprintf (dump_file, " %d", block->index);
2440 fprintf (dump_file, "\n");
2441 if (par->inner)
2442 nvptx_dump_pars (par->inner, depth + 1);
2443
2444 if (par->next)
2445 nvptx_dump_pars (par->next, depth);
2446 }
2447
2448 /* If BLOCK contains a fork/join marker, process it to create or
2449 terminate a loop structure. Add this block to the current loop,
2450 and then walk successor blocks. */
2451
2452 static parallel *
2453 nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
2454 {
2455 if (block->flags & BB_VISITED)
2456 return par;
2457 block->flags |= BB_VISITED;
2458
2459 if (rtx_insn **endp = map->get (block))
2460 {
2461 rtx_insn *end = *endp;
2462
2463 /* This is a block head or tail, or return instruction. */
2464 switch (recog_memoized (end))
2465 {
2466 case CODE_FOR_return:
2467 /* Return instructions are in their own block, and we
2468 don't need to do anything more. */
2469 return par;
2470
2471 case CODE_FOR_nvptx_forked:
2472 /* Loop head, create a new inner loop and add it into
2473 our parent's child list. */
2474 {
2475 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
2476
2477 gcc_assert (mask);
2478 par = new parallel (par, mask);
2479 par->forked_block = block;
2480 par->forked_insn = end;
2481 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
2482 && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
2483 par->fork_insn
2484 = nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
2485 }
2486 break;
2487
2488 case CODE_FOR_nvptx_join:
2489 /* A loop tail. Finish the current loop and return to
2490 parent. */
2491 {
2492 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
2493
2494 gcc_assert (par->mask == mask);
2495 par->join_block = block;
2496 par->join_insn = end;
2497 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
2498 && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
2499 par->joining_insn
2500 = nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
2501 par = par->parent;
2502 }
2503 break;
2504
2505 default:
2506 gcc_unreachable ();
2507 }
2508 }
2509
2510 if (par)
2511 /* Add this block onto the current loop's list of blocks. */
2512 par->blocks.safe_push (block);
2513 else
2514 /* This must be the entry block. Create a NULL parallel. */
2515 par = new parallel (0, 0);
2516
2517 /* Walk successor blocks. */
2518 edge e;
2519 edge_iterator ei;
2520
2521 FOR_EACH_EDGE (e, ei, block->succs)
2522 nvptx_find_par (map, par, e->dest);
2523
2524 return par;
2525 }
2526
2527 /* DFS walk the CFG looking for fork & join markers. Construct
2528 loop structures as we go. MAP is a mapping of basic blocks
2529 to head & tail markers, discovered when splitting blocks. This
2530 speeds up the discovery. We rely on the BB visited flag having
2531 been cleared when splitting blocks. */
2532
2533 static parallel *
2534 nvptx_discover_pars (bb_insn_map_t *map)
2535 {
2536 basic_block block;
2537
2538   /* Mark the exit block as visited.  */
2539 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
2540 block->flags |= BB_VISITED;
2541
2542   /* And the entry block as not visited.  */
2543 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
2544 block->flags &= ~BB_VISITED;
2545
2546 parallel *par = nvptx_find_par (map, 0, block);
2547
2548 if (dump_file)
2549 {
2550 fprintf (dump_file, "\nLoops\n");
2551 nvptx_dump_pars (par, 0);
2552 fprintf (dump_file, "\n");
2553 }
2554
2555 return par;
2556 }
2557
2558 /* Analyse a group of BBs within a partitioned region and create N
2559 Single-Entry-Single-Exit regions. Some of those regions will be
2560 trivial ones consisting of a single BB. The blocks of a
2561 partitioned region might form a set of disjoint graphs -- because
2562 the region encloses a differently partitoned sub region.
2563
2564 We use the linear time algorithm described in 'Finding Regions Fast:
2565    Single Entry Single Exit and Control Regions in Linear Time'
2566 Johnson, Pearson & Pingali. That algorithm deals with complete
2567 CFGs, where a back edge is inserted from END to START, and thus the
2568 problem becomes one of finding equivalent loops.
2569
2570 In this case we have a partial CFG. We complete it by redirecting
2571    every edge entering the graph to come from a single arbitrary
2572    external BB, and every edge leaving the graph to go to that BB.
2573 Thus we end up with a closed graph.
2574
2575 The algorithm works by building a spanning tree of an undirected
2576 graph and keeping track of back edges from nodes further from the
2577 root in the tree to nodes nearer to the root in the tree. In the
2578 description below, the root is up and the tree grows downwards.
2579
2580 We avoid having to deal with degenerate back-edges to the same
2581 block, by splitting each BB into 3 -- one for input edges, one for
2582 the node itself and one for the output edges. Such back edges are
2583 referred to as 'Brackets'. Cycle equivalent nodes will have the
2584 same set of brackets.
2585
2586 Determining bracket equivalency is done by maintaining a list of
2587 brackets in such a manner that the list length and final bracket
2588 uniquely identify the set.
2589
2590 We use coloring to mark all BBs with cycle equivalency with the
2591 same color. This is the output of the 'Finding Regions Fast'
2592 algorithm. Notice it doesn't actually find the set of nodes within
2593    a particular region, just unordered sets of nodes that are the
2594 entries and exits of SESE regions.
2595
2596 After determining cycle equivalency, we need to find the minimal
2597 set of SESE regions. Do this with a DFS coloring walk of the
2598 complete graph. We're either 'looking' or 'coloring'. When
2599 looking, and we're in the subgraph, we start coloring the color of
2600 the current node, and remember that node as the start of the
2601 current color's SESE region. Every time we go to a new node, we
2602    decrement the count of nodes with that color.  If it reaches zero,
2603 we remember that node as the end of the current color's SESE region
2604 and return to 'looking'. Otherwise we color the node the current
2605 color.
2606
2607 This way we end up with coloring the inside of non-trivial SESE
2608 regions with the color of that region. */
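
/* As a small illustration, consider a diamond: block A branches to B
   and C, which both rejoin at D.  Completing the graph with a virtual
   backedge D->A, every cycle entering the subgraph does so at A and
   leaves at D, so A and D end up cycle equivalent and get the same
   color, and the coloring walk below reports a single non-trivial
   SESE region spanning A..D.  (A sketch of the intent, not of the
   exact bracket bookkeeping.)  */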
2609
2610 /* A pair of BBs. We use this to represent SESE regions. */
2611 typedef std::pair<basic_block, basic_block> bb_pair_t;
2612 typedef auto_vec<bb_pair_t> bb_pair_vec_t;
2613
2614 /* A node in the undirected CFG. The discriminator SECOND indicates just
2615    above or just below the BB indicated by FIRST.  */
2616 typedef std::pair<basic_block, int> pseudo_node_t;
2617
2618 /* A bracket indicates an edge towards the root of the spanning tree of the
2619 undirected graph. Each bracket has a color, determined
2620    from the current set of brackets.  */
2621 struct bracket
2622 {
2623 pseudo_node_t back; /* Back target */
2624
2625 /* Current color and size of set. */
2626 unsigned color;
2627 unsigned size;
2628
2629 bracket (pseudo_node_t back_)
2630 : back (back_), color (~0u), size (~0u)
2631 {
2632 }
2633
2634 unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
2635 {
2636 if (length != size)
2637 {
2638 size = length;
2639 color = color_counts.length ();
2640 color_counts.quick_push (0);
2641 }
2642 color_counts[color]++;
2643 return color;
2644 }
2645 };
2646
2647 typedef auto_vec<bracket> bracket_vec_t;
2648
2649 /* Basic block info for finding SESE regions. */
2650
2651 struct bb_sese
2652 {
2653 int node; /* Node number in spanning tree. */
2654 int parent; /* Parent node number. */
2655
2656 /* The algorithm splits each node A into Ai, A', Ao. The incoming
2657 edges arrive at pseudo-node Ai and the outgoing edges leave at
2658 pseudo-node Ao. We have to remember which way we arrived at a
2659 particular node when generating the spanning tree. dir > 0 means
2660 we arrived at Ai, dir < 0 means we arrived at Ao. */
2661 int dir;
2662
2663   /* Lowest numbered pseudo-node reached via a backedge from this
2664 node, or any descendant. */
2665 pseudo_node_t high;
2666
2667 int color; /* Cycle-equivalence color */
2668
2669 /* Stack of brackets for this node. */
2670 bracket_vec_t brackets;
2671
2672 bb_sese (unsigned node_, unsigned p, int dir_)
2673 :node (node_), parent (p), dir (dir_)
2674 {
2675 }
2676 ~bb_sese ();
2677
2678 /* Push a bracket ending at BACK. */
2679 void push (const pseudo_node_t &back)
2680 {
2681 if (dump_file)
2682 fprintf (dump_file, "Pushing backedge %d:%+d\n",
2683 back.first ? back.first->index : 0, back.second);
2684 brackets.safe_push (bracket (back));
2685 }
2686
2687 void append (bb_sese *child);
2688 void remove (const pseudo_node_t &);
2689
2690 /* Set node's color. */
2691 void set_color (auto_vec<unsigned> &color_counts)
2692 {
2693 color = brackets.last ().get_color (color_counts, brackets.length ());
2694 }
2695 };
2696
2697 bb_sese::~bb_sese ()
2698 {
2699 }
2700
2701 /* Destructively append CHILD's brackets. */
2702
2703 void
2704 bb_sese::append (bb_sese *child)
2705 {
2706 if (int len = child->brackets.length ())
2707 {
2708 int ix;
2709
2710 if (dump_file)
2711 {
2712 for (ix = 0; ix < len; ix++)
2713 {
2714 const pseudo_node_t &pseudo = child->brackets[ix].back;
2715 fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
2716 child->node, pseudo.first ? pseudo.first->index : 0,
2717 pseudo.second);
2718 }
2719 }
2720 if (!brackets.length ())
2721 std::swap (brackets, child->brackets);
2722 else
2723 {
2724 brackets.reserve (len);
2725 for (ix = 0; ix < len; ix++)
2726 brackets.quick_push (child->brackets[ix]);
2727 }
2728 }
2729 }
2730
2731 /* Remove brackets that terminate at PSEUDO. */
2732
2733 void
2734 bb_sese::remove (const pseudo_node_t &pseudo)
2735 {
2736 unsigned removed = 0;
2737 int len = brackets.length ();
2738
2739 for (int ix = 0; ix < len; ix++)
2740 {
2741 if (brackets[ix].back == pseudo)
2742 {
2743 if (dump_file)
2744 fprintf (dump_file, "Removing backedge %d:%+d\n",
2745 pseudo.first ? pseudo.first->index : 0, pseudo.second);
2746 removed++;
2747 }
2748 else if (removed)
2749 brackets[ix-removed] = brackets[ix];
2750 }
2751 while (removed--)
2752 brackets.pop ();
2753 }
2754
2755 /* Accessors for BB's aux pointer. */
2756 #define BB_SET_SESE(B, S) ((B)->aux = (S))
2757 #define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
2758
2759 /* DFS walk creating SESE data structures. Only cover nodes with
2760 BB_VISITED set. Append discovered blocks to LIST. We number in
2761 increments of 3 so that the above and below pseudo nodes can be
2762 implicitly numbered too. */
2763
2764 static int
2765 nvptx_sese_number (int n, int p, int dir, basic_block b,
2766 auto_vec<basic_block> *list)
2767 {
2768 if (BB_GET_SESE (b))
2769 return n;
2770
2771 if (dump_file)
2772 fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
2773 b->index, n, p, dir);
2774
2775 BB_SET_SESE (b, new bb_sese (n, p, dir));
2776 p = n;
2777
2778 n += 3;
2779 list->quick_push (b);
2780
2781 /* First walk the nodes on the 'other side' of this node, then walk
2782 the nodes on the same side. */
2783 for (unsigned ix = 2; ix; ix--)
2784 {
2785 vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
2786 size_t offset = (dir > 0 ? offsetof (edge_def, dest)
2787 : offsetof (edge_def, src));
2788 edge e;
2789       edge_iterator ei;
2790
2791 FOR_EACH_EDGE (e, ei, edges)
2792 {
2793 basic_block target = *(basic_block *)((char *)e + offset);
2794
2795 if (target->flags & BB_VISITED)
2796 n = nvptx_sese_number (n, p, dir, target, list);
2797 }
2798 dir = -dir;
2799 }
2800 return n;
2801 }
2802
2803 /* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
2804 EDGES are the outgoing edges and OFFSET is the offset to the src
2805 or dst block on the edges. */
2806
2807 static void
2808 nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
2809 vec<edge, va_gc> *edges, size_t offset)
2810 {
2811 edge e;
2812   edge_iterator ei;
2813 int hi_back = depth;
2814 pseudo_node_t node_back (0, depth);
2815 int hi_child = depth;
2816 pseudo_node_t node_child (0, depth);
2817 basic_block child = NULL;
2818 unsigned num_children = 0;
2819 int usd = -dir * sese->dir;
2820
2821 if (dump_file)
2822 fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
2823 me->index, sese->node, dir);
2824
2825 if (dir < 0)
2826 {
2827 /* This is the above pseudo-child. It has the BB itself as an
2828 additional child node. */
2829 node_child = sese->high;
2830 hi_child = node_child.second;
2831 if (node_child.first)
2832 hi_child += BB_GET_SESE (node_child.first)->node;
2833 num_children++;
2834 }
2835
2836 /* Examine each edge.
2837 - if it is a child (a) append its bracket list and (b) record
2838 whether it is the child with the highest reaching bracket.
2839 - if it is an edge to ancestor, record whether it's the highest
2840 reaching backlink. */
2841 FOR_EACH_EDGE (e, ei, edges)
2842 {
2843 basic_block target = *(basic_block *)((char *)e + offset);
2844
2845 if (bb_sese *t_sese = BB_GET_SESE (target))
2846 {
2847 if (t_sese->parent == sese->node && !(t_sese->dir + usd))
2848 {
2849 /* Child node. Append its bracket list. */
2850 num_children++;
2851 sese->append (t_sese);
2852
2853 	      /* Compare its hi value.  */
2854 int t_hi = t_sese->high.second;
2855
2856 if (basic_block child_hi_block = t_sese->high.first)
2857 t_hi += BB_GET_SESE (child_hi_block)->node;
2858
2859 if (hi_child > t_hi)
2860 {
2861 hi_child = t_hi;
2862 node_child = t_sese->high;
2863 child = target;
2864 }
2865 }
2866 else if (t_sese->node < sese->node + dir
2867 && !(dir < 0 && sese->parent == t_sese->node))
2868 {
2869 /* Non-parental ancestor node -- a backlink. */
2870 int d = usd * t_sese->dir;
2871 int back = t_sese->node + d;
2872
2873 if (hi_back > back)
2874 {
2875 hi_back = back;
2876 node_back = pseudo_node_t (target, d);
2877 }
2878 }
2879 }
2880 else
2881 { /* Fallen off graph, backlink to entry node. */
2882 hi_back = 0;
2883 node_back = pseudo_node_t (0, 0);
2884 }
2885 }
2886
2887 /* Remove any brackets that terminate at this pseudo node. */
2888 sese->remove (pseudo_node_t (me, dir));
2889
2890 /* Now push any backlinks from this pseudo node. */
2891 FOR_EACH_EDGE (e, ei, edges)
2892 {
2893 basic_block target = *(basic_block *)((char *)e + offset);
2894 if (bb_sese *t_sese = BB_GET_SESE (target))
2895 {
2896 if (t_sese->node < sese->node + dir
2897 && !(dir < 0 && sese->parent == t_sese->node))
2898 /* Non-parental ancestor node - backedge from me. */
2899 sese->push (pseudo_node_t (target, usd * t_sese->dir));
2900 }
2901 else
2902 {
2903 	  /* Back edge to the entry node.  */
2904 sese->push (pseudo_node_t (0, 0));
2905 }
2906 }
2907
2908 /* If this node leads directly or indirectly to a no-return region of
2909 the graph, then fake a backedge to entry node. */
2910 if (!sese->brackets.length () || !edges || !edges->length ())
2911 {
2912 hi_back = 0;
2913 node_back = pseudo_node_t (0, 0);
2914 sese->push (node_back);
2915 }
2916
2917 /* Record the highest reaching backedge from us or a descendant. */
2918 sese->high = hi_back < hi_child ? node_back : node_child;
2919
2920 if (num_children > 1)
2921 {
2922 /* There is more than one child -- this is a Y shaped piece of
2923 spanning tree. We have to insert a fake backedge from this
2924 node to the highest ancestor reached by not-the-highest
2925 reaching child. Note that there may be multiple children
2926 with backedges to the same highest node. That's ok and we
2927 insert the edge to that highest node. */
2928 hi_child = depth;
2929 if (dir < 0 && child)
2930 {
2931 node_child = sese->high;
2932 hi_child = node_child.second;
2933 if (node_child.first)
2934 hi_child += BB_GET_SESE (node_child.first)->node;
2935 }
2936
2937 FOR_EACH_EDGE (e, ei, edges)
2938 {
2939 basic_block target = *(basic_block *)((char *)e + offset);
2940
2941 if (target == child)
2942 /* Ignore the highest child. */
2943 continue;
2944
2945 bb_sese *t_sese = BB_GET_SESE (target);
2946 if (!t_sese)
2947 continue;
2948 if (t_sese->parent != sese->node)
2949 /* Not a child. */
2950 continue;
2951
2952 /* Compare its hi value. */
2953 int t_hi = t_sese->high.second;
2954
2955 if (basic_block child_hi_block = t_sese->high.first)
2956 t_hi += BB_GET_SESE (child_hi_block)->node;
2957
2958 if (hi_child > t_hi)
2959 {
2960 hi_child = t_hi;
2961 node_child = t_sese->high;
2962 }
2963 }
2964
2965 sese->push (node_child);
2966 }
2967 }
2968
2969
2970 /* DFS walk of BB graph. Color node BLOCK according to COLORING then
2971 proceed to successors. Set SESE entry and exit nodes of
2972 REGIONS. */
2973
2974 static void
2975 nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
2976 basic_block block, int coloring)
2977 {
2978 bb_sese *sese = BB_GET_SESE (block);
2979
2980 if (block->flags & BB_VISITED)
2981 {
2982 /* If we've already encountered this block, either we must not
2983 be coloring, or it must have been colored the current color. */
2984 gcc_assert (coloring < 0 || (sese && coloring == sese->color));
2985 return;
2986 }
2987
2988 block->flags |= BB_VISITED;
2989
2990 if (sese)
2991 {
2992 if (coloring < 0)
2993 {
2994 /* Start coloring a region. */
2995 regions[sese->color].first = block;
2996 coloring = sese->color;
2997 }
2998
2999 if (!--color_counts[sese->color] && sese->color == coloring)
3000 {
3001 /* Found final block of SESE region. */
3002 regions[sese->color].second = block;
3003 coloring = -1;
3004 }
3005 else
3006 /* Color the node, so we can assert on revisiting the node
3007 that the graph is indeed SESE. */
3008 sese->color = coloring;
3009 }
3010 else
3011 /* Fallen off the subgraph, we cannot be coloring. */
3012 gcc_assert (coloring < 0);
3013
3014 /* Walk each successor block. */
3015 if (block->succs && block->succs->length ())
3016 {
3017 edge e;
3018 edge_iterator ei;
3019
3020 FOR_EACH_EDGE (e, ei, block->succs)
3021 nvptx_sese_color (color_counts, regions, e->dest, coloring);
3022 }
3023 else
3024 gcc_assert (coloring < 0);
3025 }
3026
3027 /* Find minimal set of SESE regions covering BLOCKS. REGIONS might
3028 end up with NULL entries in it. */
3029
3030 static void
3031 nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
3032 {
3033 basic_block block;
3034 int ix;
3035
3036 /* First clear each BB of the whole function. */
3037 FOR_EACH_BB_FN (block, cfun)
3038 {
3039 block->flags &= ~BB_VISITED;
3040 BB_SET_SESE (block, 0);
3041 }
3042 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
3043 block->flags &= ~BB_VISITED;
3044 BB_SET_SESE (block, 0);
3045 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
3046 block->flags &= ~BB_VISITED;
3047 BB_SET_SESE (block, 0);
3048
3049 /* Mark blocks in the function that are in this graph. */
3050 for (ix = 0; blocks.iterate (ix, &block); ix++)
3051 block->flags |= BB_VISITED;
3052
3053 /* Counts of nodes assigned to each color. There cannot be more
3054 colors than blocks (and hopefully there will be fewer). */
3055 auto_vec<unsigned> color_counts;
3056 color_counts.reserve (blocks.length ());
3057
3058 /* Worklist of nodes in the spanning tree. Again, there cannot be
3059 more nodes in the tree than blocks (there will be fewer if the
3060 CFG of blocks is disjoint). */
3061 auto_vec<basic_block> spanlist;
3062 spanlist.reserve (blocks.length ());
3063
3064 /* Make sure every block has its cycle class determined. */
3065 for (ix = 0; blocks.iterate (ix, &block); ix++)
3066 {
3067 if (BB_GET_SESE (block))
3068 /* We already met this block in an earlier graph solve. */
3069 continue;
3070
3071 if (dump_file)
3072 fprintf (dump_file, "Searching graph starting at %d\n", block->index);
3073
3074       /* Number the nodes reachable from BLOCK in initial DFS order.  */
3075 int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);
3076
3077 /* Now walk in reverse DFS order to find cycle equivalents. */
3078 while (spanlist.length ())
3079 {
3080 block = spanlist.pop ();
3081 bb_sese *sese = BB_GET_SESE (block);
3082
3083 /* Do the pseudo node below. */
3084 nvptx_sese_pseudo (block, sese, depth, +1,
3085 sese->dir > 0 ? block->succs : block->preds,
3086 (sese->dir > 0 ? offsetof (edge_def, dest)
3087 : offsetof (edge_def, src)));
3088 sese->set_color (color_counts);
3089 /* Do the pseudo node above. */
3090 nvptx_sese_pseudo (block, sese, depth, -1,
3091 sese->dir < 0 ? block->succs : block->preds,
3092 (sese->dir < 0 ? offsetof (edge_def, dest)
3093 : offsetof (edge_def, src)));
3094 }
3095 if (dump_file)
3096 fprintf (dump_file, "\n");
3097 }
3098
3099 if (dump_file)
3100 {
3101 unsigned count;
3102 const char *comma = "";
3103
3104 fprintf (dump_file, "Found %d cycle equivalents\n",
3105 color_counts.length ());
3106 for (ix = 0; color_counts.iterate (ix, &count); ix++)
3107 {
3108 fprintf (dump_file, "%s%d[%d]={", comma, ix, count);
3109
3110 comma = "";
3111 for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
3112 if (BB_GET_SESE (block)->color == ix)
3113 {
3114 block->flags |= BB_VISITED;
3115 fprintf (dump_file, "%s%d", comma, block->index);
3116 	      comma = ",";
3117 }
3118 fprintf (dump_file, "}");
3119 comma = ", ";
3120 }
3121 fprintf (dump_file, "\n");
3122 }
3123
3124 /* Now we've colored every block in the subgraph. We now need to
3125 determine the minimal set of SESE regions that cover that
3126 subgraph. Do this with a DFS walk of the complete function.
3127 During the walk we're either 'looking' or 'coloring'. When we
3128 reach the last node of a particular color, we stop coloring and
3129 return to looking. */
3130
3131 /* There cannot be more SESE regions than colors. */
3132 regions.reserve (color_counts.length ());
3133 for (ix = color_counts.length (); ix--;)
3134 regions.quick_push (bb_pair_t (0, 0));
3135
3136 for (ix = 0; blocks.iterate (ix, &block); ix++)
3137 block->flags &= ~BB_VISITED;
3138
3139 nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);
3140
3141 if (dump_file)
3142 {
3143 const char *comma = "";
3144 int len = regions.length ();
3145
3146 fprintf (dump_file, "SESE regions:");
3147 for (ix = 0; ix != len; ix++)
3148 {
3149 basic_block from = regions[ix].first;
3150 basic_block to = regions[ix].second;
3151
3152 if (from)
3153 {
3154 fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
3155 if (to != from)
3156 fprintf (dump_file, "->%d", to->index);
3157
3158 int color = BB_GET_SESE (from)->color;
3159
3160 /* Print the blocks within the region (excluding ends). */
3161 FOR_EACH_BB_FN (block, cfun)
3162 {
3163 bb_sese *sese = BB_GET_SESE (block);
3164
3165 if (sese && sese->color == color
3166 && block != from && block != to)
3167 fprintf (dump_file, ".%d", block->index);
3168 }
3169 fprintf (dump_file, "}");
3170 }
3171 comma = ",";
3172 }
3173 fprintf (dump_file, "\n\n");
3174 }
3175
3176 for (ix = 0; blocks.iterate (ix, &block); ix++)
3177 delete BB_GET_SESE (block);
3178 }
3179
3180 #undef BB_SET_SESE
3181 #undef BB_GET_SESE
3182
3183 /* Propagate live state at the start of a partitioned region. BLOCK
3184 provides the live register information, and might not contain
3185 INSN. Propagation is inserted just after INSN. RW indicates whether
3186    we are reading and/or writing state.  This separation is needed
3187    for worker-level propagation, where we essentially do a spill &
3188    fill.  FN is the underlying worker function used to generate the
3189    propagation instructions for a single register.  DATA is user
3190    data.
3191
3192 We propagate the live register set and the entire frame. We could
3193 do better by (a) propagating just the live set that is used within
3194 the partitioned regions and (b) only propagating stack entries that
3195 are used. The latter might be quite hard to determine. */
3196
3197 typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *);
3198
3199 static void
3200 nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw,
3201 propagator_fn fn, void *data)
3202 {
3203 bitmap live = DF_LIVE_IN (block);
3204 bitmap_iterator iterator;
3205 unsigned ix;
3206
3207 /* Copy the frame array. */
3208 HOST_WIDE_INT fs = get_frame_size ();
3209 if (fs)
3210 {
3211 rtx tmp = gen_reg_rtx (DImode);
3212 rtx idx = NULL_RTX;
3213 rtx ptr = gen_reg_rtx (Pmode);
3214 rtx pred = NULL_RTX;
3215 rtx_code_label *label = NULL;
3216
3217 gcc_assert (!(fs & (GET_MODE_SIZE (DImode) - 1)));
3218 fs /= GET_MODE_SIZE (DImode);
3219       /* Detect a single-iteration copy, which needs no loop at all.  */
3220 if (fs == 1)
3221 fs = 0;
3222
3223 start_sequence ();
3224 emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
3225 if (fs)
3226 {
3227 idx = gen_reg_rtx (SImode);
3228 pred = gen_reg_rtx (BImode);
3229 label = gen_label_rtx ();
3230
3231 emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
3232 /* Allow worker function to initialize anything needed. */
3233 rtx init = fn (tmp, PM_loop_begin, fs, data);
3234 if (init)
3235 emit_insn (init);
3236 emit_label (label);
3237 LABEL_NUSES (label)++;
3238 emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
3239 }
3240 if (rw & PM_read)
3241 emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
3242 emit_insn (fn (tmp, rw, fs, data));
3243 if (rw & PM_write)
3244 emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
3245 if (fs)
3246 {
3247 emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
3248 emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
3249 emit_insn (gen_br_true_uni (pred, label));
3250 rtx fini = fn (tmp, PM_loop_end, fs, data);
3251 if (fini)
3252 emit_insn (fini);
3253 emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
3254 }
3255 emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
3256 emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
3257 rtx cpy = get_insns ();
3258 end_sequence ();
3259 insn = emit_insn_after (cpy, insn);
3260 }
3261
3262 /* Copy live registers. */
3263 EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
3264 {
3265 rtx reg = regno_reg_rtx[ix];
3266
3267 if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
3268 {
3269 rtx bcast = fn (reg, rw, 0, data);
3270
3271 insn = emit_insn_after (bcast, insn);
3272 }
3273 }
3274 }
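
/* In sketch, for a 32-byte frame the sequence built above amounts to
   (names invented; FN supplies the propagation itself):

       ptr = frame;
       idx = 4;                  -- 32 bytes / 8-byte DImode words
       <FN's loop-begin code>
     loop:
       idx -= 1;
       tmp = [ptr];              -- when RW includes PM_read
       <FN propagates tmp>
       [ptr] = tmp;              -- when RW includes PM_write
       pred = idx != 0;
       ptr += 8;
       @pred bra.uni loop;
       <FN's loop-end code>

   followed by one propagation per live pseudo register.  */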
3275
3276 /* Worker for nvptx_vpropagate. */
3277
3278 static rtx
3279 vprop_gen (rtx reg, propagate_mask pm,
3280 unsigned ARG_UNUSED (count), void *ARG_UNUSED (data))
3281 {
3282 if (!(pm & PM_read_write))
3283 return 0;
3284
3285 return nvptx_gen_vcast (reg);
3286 }
3287
3288 /* Propagate state that is live at start of BLOCK across the vectors
3289 of a single warp. Propagation is inserted just after INSN. */
3290
3291 static void
3292 nvptx_vpropagate (basic_block block, rtx_insn *insn)
3293 {
3294 nvptx_propagate (block, insn, PM_read_write, vprop_gen, 0);
3295 }
3296
3297 /* Worker for nvptx_wpropagate. */
3298
3299 static rtx
3300 wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
3301 {
3302 wcast_data_t *data = (wcast_data_t *)data_;
3303
3304 if (pm & PM_loop_begin)
3305 {
3306 /* Starting a loop, initialize pointer. */
3307 unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
3308
3309 if (align > worker_bcast_align)
3310 worker_bcast_align = align;
3311 data->offset = (data->offset + align - 1) & ~(align - 1);
3312
3313 data->ptr = gen_reg_rtx (Pmode);
3314
3315 return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
3316 }
3317 else if (pm & PM_loop_end)
3318 {
3319 rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
3320 data->ptr = NULL_RTX;
3321 return clobber;
3322 }
3323 else
3324 return nvptx_gen_wcast (reg, pm, rep, data);
3325 }
3326
3327 /* Spill or fill live state that is live at start of BLOCK. PRE_P
3328 indicates if this is just before partitioned mode (do spill), or
3329 just after it starts (do fill). Sequence is inserted just after
3330 INSN. */
3331
3332 static void
3333 nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn)
3334 {
3335 wcast_data_t data;
3336
3337 data.base = gen_reg_rtx (Pmode);
3338 data.offset = 0;
3339 data.ptr = NULL_RTX;
3340
3341 nvptx_propagate (block, insn, pre_p ? PM_read : PM_write, wprop_gen, &data);
3342 if (data.offset)
3343 {
3344 /* Stuff was emitted, initialize the base pointer now. */
3345 rtx init = gen_rtx_SET (data.base, worker_bcast_sym);
3346 emit_insn_after (init, insn);
3347
3348 if (worker_bcast_size < data.offset)
3349 worker_bcast_size = data.offset;
3350 }
3351 }
3352
3353 /* Emit a worker-level synchronization barrier. We use different
3354 markers for before and after synchronizations. */
3355
3356 static rtx
3357 nvptx_wsync (bool after)
3358 {
3359 return gen_nvptx_barsync (GEN_INT (after));
3360 }
3361
3362 /* Single neutering according to MASK. FROM is the incoming block and
3363 TO is the outgoing block. These may be the same block. Insert at
3364 start of FROM:
3365
3366 if (tid.<axis>) goto end.
3367
3368 and insert before ending branch of TO (if there is such an insn):
3369
3370 end:
3371 <possibly-broadcast-cond>
3372 <branch>
3373
3374    We currently only use different FROM and TO when skipping an entire
3375 loop. We could do more if we detected superblocks. */
3376
3377 static void
3378 nvptx_single (unsigned mask, basic_block from, basic_block to)
3379 {
3380 rtx_insn *head = BB_HEAD (from);
3381 rtx_insn *tail = BB_END (to);
3382 unsigned skip_mask = mask;
3383
3384   /* Find the first insn of the FROM block.  */
3385 while (head != BB_END (from) && !INSN_P (head))
3386 head = NEXT_INSN (head);
3387
3388   /* Find the last insn of the TO block.  */
3389 rtx_insn *limit = from == to ? head : BB_HEAD (to);
3390 while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
3391 tail = PREV_INSN (tail);
3392
3393 /* Detect if tail is a branch. */
3394 rtx tail_branch = NULL_RTX;
3395 rtx cond_branch = NULL_RTX;
3396 if (tail && INSN_P (tail))
3397 {
3398 tail_branch = PATTERN (tail);
3399 if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
3400 tail_branch = NULL_RTX;
3401 else
3402 {
3403 cond_branch = SET_SRC (tail_branch);
3404 if (GET_CODE (cond_branch) != IF_THEN_ELSE)
3405 cond_branch = NULL_RTX;
3406 }
3407 }
3408
3409 if (tail == head)
3410 {
3411 /* If this is empty, do nothing. */
3412 if (!head || !INSN_P (head))
3413 return;
3414
3415 /* If this is a dummy insn, do nothing. */
3416 switch (recog_memoized (head))
3417 {
3418 default:
3419 break;
3420 case CODE_FOR_nvptx_fork:
3421 case CODE_FOR_nvptx_forked:
3422 case CODE_FOR_nvptx_joining:
3423 case CODE_FOR_nvptx_join:
3424 return;
3425 }
3426
3427 if (cond_branch)
3428 {
3429 /* If we're only doing vector single, there's no need to
3430 emit skip code because we'll not insert anything. */
3431 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
3432 skip_mask = 0;
3433 }
3434 else if (tail_branch)
3435 /* Block with only unconditional branch. Nothing to do. */
3436 return;
3437 }
3438
3439 /* Insert the vector test inside the worker test. */
3440 unsigned mode;
3441 rtx_insn *before = tail;
3442 for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
3443 if (GOMP_DIM_MASK (mode) & skip_mask)
3444 {
3445 rtx_code_label *label = gen_label_rtx ();
3446 rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
3447
3448 if (!pred)
3449 {
3450 pred = gen_reg_rtx (BImode);
3451 cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
3452 }
3453
3454 rtx br;
3455 if (mode == GOMP_DIM_VECTOR)
3456 br = gen_br_true (pred, label);
3457 else
3458 br = gen_br_true_uni (pred, label);
3459 emit_insn_before (br, head);
3460
3461 LABEL_NUSES (label)++;
3462 if (tail_branch)
3463 before = emit_label_before (label, before);
3464 else
3465 emit_label_after (label, tail);
3466 }
3467
3468 /* Now deal with propagating the branch condition. */
3469 if (cond_branch)
3470 {
3471 rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
3472
3473 if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask)
3474 {
3475 /* Vector mode only, do a shuffle. */
3476 emit_insn_before (nvptx_gen_vcast (pvar), tail);
3477 }
3478 else
3479 {
3480 /* Includes worker mode, do spill & fill. By construction
3481 we should never have worker mode only. */
3482 wcast_data_t data;
3483
3484 data.base = worker_bcast_sym;
3485 data.ptr = 0;
3486
3487 if (worker_bcast_size < GET_MODE_SIZE (SImode))
3488 worker_bcast_size = GET_MODE_SIZE (SImode);
3489
3490 data.offset = 0;
3491 emit_insn_before (nvptx_gen_wcast (pvar, PM_read, 0, &data),
3492 before);
3493 /* Barrier so other workers can see the write. */
3494 emit_insn_before (nvptx_wsync (false), tail);
3495 data.offset = 0;
3496 emit_insn_before (nvptx_gen_wcast (pvar, PM_write, 0, &data), tail);
3497 /* This barrier is needed to avoid worker zero clobbering
3498 the broadcast buffer before all the other workers have
3499 had a chance to read this instance of it. */
3500 emit_insn_before (nvptx_wsync (true), tail);
3501 }
3502
3503 extract_insn (tail);
3504 rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
3505 UNSPEC_BR_UNIFIED);
3506 validate_change (tail, recog_data.operand_loc[0], unsp, false);
3507 }
3508 }
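
/* In sketch, a region neutered for both worker and vector modes is
   bracketed as (predicate and label names invented; the axis
   predicates are initialized elsewhere):

       @ %wpred bra.uni Lskip;   -- worker test, uniform branch
       @ %vpred bra     Lskip;   -- vector test
       ... code now executed by a single thread ...
     Lskip:

   with the branch condition, if any, broadcast at the end as above.  */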
3509
3510 /* PAR is a parallel that is being skipped in its entirety according to
3511 MASK. Treat this as skipping a superblock starting at forked
3512 and ending at joining. */
3513
3514 static void
3515 nvptx_skip_par (unsigned mask, parallel *par)
3516 {
3517 basic_block tail = par->join_block;
3518 gcc_assert (tail->preds->length () == 1);
3519
3520 basic_block pre_tail = (*tail->preds)[0]->src;
3521 gcc_assert (pre_tail->succs->length () == 1);
3522
3523 nvptx_single (mask, par->forked_block, pre_tail);
3524 }
3525
3526 /* If PAR has a single inner parallel and PAR itself only contains
3527 empty entry and exit blocks, swallow the inner PAR. */
3528
3529 static void
3530 nvptx_optimize_inner (parallel *par)
3531 {
3532 parallel *inner = par->inner;
3533
3534 /* We mustn't be the outer dummy par. */
3535 if (!par->mask)
3536 return;
3537
3538 /* We must have a single inner par. */
3539 if (!inner || inner->next)
3540 return;
3541
3542 /* We must only contain 2 blocks ourselves -- the head and tail of
3543 the inner par. */
3544 if (par->blocks.length () != 2)
3545 return;
3546
3547   /* The two partitionings must be disjoint.  As we only have vector
3548      and worker partitioning, this is sufficient to guarantee the
3549      pars have adjacent partitioning.  */
3550 if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
3551 /* This indicates malformed code generation. */
3552 return;
3553
3554 /* The outer forked insn should be immediately followed by the inner
3555 fork insn. */
3556 rtx_insn *forked = par->forked_insn;
3557 rtx_insn *fork = BB_END (par->forked_block);
3558
3559 if (NEXT_INSN (forked) != fork)
3560 return;
3561 gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);
3562
3563 /* The outer joining insn must immediately follow the inner join
3564 insn. */
3565 rtx_insn *joining = par->joining_insn;
3566 rtx_insn *join = inner->join_insn;
3567 if (NEXT_INSN (join) != joining)
3568 return;
3569
3570 /* Preconditions met. Swallow the inner par. */
3571 if (dump_file)
3572 fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
3573 inner->mask, inner->forked_block->index,
3574 inner->join_block->index,
3575 par->mask, par->forked_block->index, par->join_block->index);
3576
3577 par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);
3578
3579 par->blocks.reserve (inner->blocks.length ());
3580 while (inner->blocks.length ())
3581 par->blocks.quick_push (inner->blocks.pop ());
3582
3583 par->inner = inner->inner;
3584 inner->inner = NULL;
3585
3586 delete inner;
3587 }
3588
3589 /* Process the parallel PAR and all its contained
3590 parallels. We do everything but the neutering. Return mask of
3591 partitioned modes used within this parallel. */
3592
3593 static unsigned
3594 nvptx_process_pars (parallel *par)
3595 {
3596 if (nvptx_optimize)
3597 nvptx_optimize_inner (par);
3598
3599 unsigned inner_mask = par->mask;
3600
3601 /* Do the inner parallels first. */
3602 if (par->inner)
3603 {
3604 par->inner_mask = nvptx_process_pars (par->inner);
3605 inner_mask |= par->inner_mask;
3606 }
3607
3608 if (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
3609 /* No propagation needed for a call. */;
3610 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
3611 {
3612 nvptx_wpropagate (false, par->forked_block, par->forked_insn);
3613 nvptx_wpropagate (true, par->forked_block, par->fork_insn);
3614 /* Insert begin and end synchronizations. */
3615 emit_insn_after (nvptx_wsync (false), par->forked_insn);
3616 emit_insn_before (nvptx_wsync (true), par->joining_insn);
3617 }
3618 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
3619 nvptx_vpropagate (par->forked_block, par->forked_insn);
3620
3621 /* Now do siblings. */
3622 if (par->next)
3623 inner_mask |= nvptx_process_pars (par->next);
3624 return inner_mask;
3625 }
3626
3627 /* Neuter the parallel described by PAR. We recurse in depth-first
3628 order. MODES are the partitioning of the execution and OUTER is
3629 the partitioning of the parallels we are contained in. */
3630
3631 static void
3632 nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
3633 {
3634 unsigned me = (par->mask
3635 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
3636 | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
3637 unsigned skip_mask = 0, neuter_mask = 0;
3638
3639 if (par->inner)
3640 nvptx_neuter_pars (par->inner, modes, outer | me);
3641
3642 for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
3643 {
3644 if ((outer | me) & GOMP_DIM_MASK (mode))
3645 {} /* Mode is partitioned: no neutering. */
3646 else if (!(modes & GOMP_DIM_MASK (mode)))
3647 {} /* Mode is not used: nothing to do. */
3648 else if (par->inner_mask & GOMP_DIM_MASK (mode)
3649 || !par->forked_insn)
3650 	/* Partitioned in inner parallels, or we're not partitioned
3651 	   at all: neuter individual blocks.  */
3652 neuter_mask |= GOMP_DIM_MASK (mode);
3653 else if (!par->parent || !par->parent->forked_insn
3654 || par->parent->inner_mask & GOMP_DIM_MASK (mode))
3655 	/* Parent isn't a parallel, or already contains this partitioning:
3656 	   skip the parallel at this level.  */
3657 skip_mask |= GOMP_DIM_MASK (mode);
3658 else
3659 {} /* Parent will skip this parallel itself. */
3660 }
3661
3662 if (neuter_mask)
3663 {
3664 int ix, len;
3665
3666 if (nvptx_optimize)
3667 {
3668 /* Neuter whole SESE regions. */
3669 bb_pair_vec_t regions;
3670
3671 nvptx_find_sese (par->blocks, regions);
3672 len = regions.length ();
3673 for (ix = 0; ix != len; ix++)
3674 {
3675 basic_block from = regions[ix].first;
3676 basic_block to = regions[ix].second;
3677
3678 if (from)
3679 nvptx_single (neuter_mask, from, to);
3680 else
3681 gcc_assert (!to);
3682 }
3683 }
3684 else
3685 {
3686 /* Neuter each BB individually. */
3687 len = par->blocks.length ();
3688 for (ix = 0; ix != len; ix++)
3689 {
3690 basic_block block = par->blocks[ix];
3691
3692 nvptx_single (neuter_mask, block, block);
3693 }
3694 }
3695 }
3696
3697 if (skip_mask)
3698 nvptx_skip_par (skip_mask, par);
3699
3700 if (par->next)
3701 nvptx_neuter_pars (par->next, modes, outer);
3702 }
3703
3704 /* PTX-specific reorganization
3705 - Split blocks at fork and join instructions
3706 - Compute live registers
3707 - Mark now-unused registers, so function begin doesn't declare
3708 unused registers.
3709 - Insert state propagation when entering partitioned mode
3710 - Insert neutering instructions when in single mode
3711 - Replace subregs with suitable sequences.
3712 */
3713
3714 static void
3715 nvptx_reorg (void)
3716 {
3717 /* We are freeing block_for_insn in the toplev to keep compatibility
3718 with old MDEP_REORGS that are not CFG based. Recompute it now. */
3719 compute_bb_for_insn ();
3720
3721 thread_prologue_and_epilogue_insns ();
3722
3723 /* Split blocks and record interesting unspecs. */
3724 bb_insn_map_t bb_insn_map;
3725
3726 nvptx_split_blocks (&bb_insn_map);
3727
3728 /* Compute live regs */
3729 df_clear_flags (DF_LR_RUN_DCE);
3730 df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
3731 df_live_add_problem ();
3732 df_live_set_all_dirty ();
3733 df_analyze ();
3734 regstat_init_n_sets_and_refs ();
3735
3736 if (dump_file)
3737 df_dump (dump_file);
3738
3739 /* Mark unused regs as unused. */
3740 int max_regs = max_reg_num ();
3741 for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
3742 if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
3743 regno_reg_rtx[i] = const0_rtx;
3744
3745 /* Determine launch dimensions of the function. If it is not an
3746      offloaded function (i.e. this is a regular compilation), the
3747      function needs no neutering.  */
3748 tree attr = get_oacc_fn_attrib (current_function_decl);
3749 if (attr)
3750 {
3751 /* If we determined this mask before RTL expansion, we could
3752 elide emission of some levels of forks and joins. */
3753 unsigned mask = 0;
3754 tree dims = TREE_VALUE (attr);
3755 unsigned ix;
3756
3757 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
3758 {
3759 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
3760 tree allowed = TREE_PURPOSE (dims);
3761
3762 if (size != 1 && !(allowed && integer_zerop (allowed)))
3763 mask |= GOMP_DIM_MASK (ix);
3764 }
3765 /* If there is worker neutering, there must be vector
3766 neutering. Otherwise the hardware will fail. */
3767 gcc_assert (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
3768 || (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
3769
3770 /* Discover & process partitioned regions. */
3771 parallel *pars = nvptx_discover_pars (&bb_insn_map);
3772 nvptx_process_pars (pars);
3773 nvptx_neuter_pars (pars, mask, 0);
3774 delete pars;
3775 }
3776
3777 /* Replace subregs. */
3778 nvptx_reorg_subreg ();
3779
3780 regstat_free_n_sets_and_refs ();
3781
3782 df_finish_pass (true);
3783 }
3784 \f
3785 /* Handle a "kernel" attribute; arguments as in
3786 struct attribute_spec.handler. */
3787
3788 static tree
3789 nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
3790 int ARG_UNUSED (flags), bool *no_add_attrs)
3791 {
3792 tree decl = *node;
3793
3794 if (TREE_CODE (decl) != FUNCTION_DECL)
3795 {
3796 error ("%qE attribute only applies to functions", name);
3797 *no_add_attrs = true;
3798 }
3799 else if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl))))
3800 {
3801 error ("%qE attribute requires a void return type", name);
3802 *no_add_attrs = true;
3803 }
3804
3805 return NULL_TREE;
3806 }
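
/* For instance, in user source (a sketch; names invented):

       void my_kernel (int *) __attribute__((kernel));   -- accepted
       int broken (void) __attribute__((kernel));        -- rejected,
                                                            non-void return
   */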
3807
3808 /* Table of valid machine attributes. */
3809 static const struct attribute_spec nvptx_attribute_table[] =
3810 {
3811 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
3812 affects_type_identity } */
3813 { "kernel", 0, 0, true, false, false, nvptx_handle_kernel_attribute, false },
3814 { NULL, 0, 0, false, false, false, NULL, false }
3815 };
3816 \f
3817 /* Limit vector alignments to BIGGEST_ALIGNMENT. */
3818
3819 static HOST_WIDE_INT
3820 nvptx_vector_alignment (const_tree type)
3821 {
3822 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
3823
3824 return MIN (align, BIGGEST_ALIGNMENT);
3825 }
3826
3827 /* Indicate that INSN cannot be duplicated. */
3828
3829 static bool
3830 nvptx_cannot_copy_insn_p (rtx_insn *insn)
3831 {
3832 switch (recog_memoized (insn))
3833 {
3834 case CODE_FOR_nvptx_shufflesi:
3835 case CODE_FOR_nvptx_shufflesf:
3836 case CODE_FOR_nvptx_barsync:
3837 case CODE_FOR_nvptx_fork:
3838 case CODE_FOR_nvptx_forked:
3839 case CODE_FOR_nvptx_joining:
3840 case CODE_FOR_nvptx_join:
3841 return true;
3842 default:
3843 return false;
3844 }
3845 }
3846
3847 /* Section anchors do not work. Initialization for flag_section_anchor
3848 probes the existence of the anchoring target hooks and prevents
3849    anchoring if they don't exist.  However, we may be used with
3850 a host-side compiler that does support anchoring, and hence see
3851 the anchor flag set (as it's not recalculated). So provide an
3852 implementation denying anchoring. */
3853
3854 static bool
3855 nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a))
3856 {
3857 return false;
3858 }
3859 \f
3860 /* Record a symbol for mkoffload to enter into the mapping table. */
3861
3862 static void
3863 nvptx_record_offload_symbol (tree decl)
3864 {
3865 switch (TREE_CODE (decl))
3866 {
3867 case VAR_DECL:
3868 fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n",
3869 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
3870 break;
3871
3872 case FUNCTION_DECL:
3873 {
3874 tree attr = get_oacc_fn_attrib (decl);
3875 tree dims = TREE_VALUE (attr);
3876 unsigned ix;
3877
3878 fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
3879 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
3880
3881 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
3882 {
3883 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
3884
3885 gcc_assert (!TREE_PURPOSE (dims));
3886 fprintf (asm_out_file, ", %#x", size);
3887 }
3888
3889 fprintf (asm_out_file, "\n");
3890 }
3891 break;
3892
3893 default:
3894 gcc_unreachable ();
3895 }
3896 }
3897
3898 /* Implement TARGET_ASM_FILE_START. Write the kinds of things ptxas expects
3899 at the start of a file. */
3900
3901 static void
3902 nvptx_file_start (void)
3903 {
3904 fputs ("// BEGIN PREAMBLE\n", asm_out_file);
3905 fputs ("\t.version\t3.1\n", asm_out_file);
3906 fputs ("\t.target\tsm_30\n", asm_out_file);
3907 fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
3908 fputs ("// END PREAMBLE\n", asm_out_file);
3909 }
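/* Under the usual 64-bit configuration (Pmode == DImode) the preamble
   emitted above reads:

   // BEGIN PREAMBLE
   	.version	3.1
   	.target	sm_30
   	.address_size 64
   // END PREAMBLE  */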
3910
3911 /* Emit a declaration for a worker-level buffer in .shared memory. */
3912
3913 static void
3914 write_worker_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
3915 {
3916 const char *name = XSTR (sym, 0);
3917
3918 write_var_marker (file, true, false, name);
3919 fprintf (file, ".shared .align %d .u8 %s[%d];\n",
3920 align, name, size);
3921 }
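/* For example, a 128-byte buffer with 8-byte alignment whose symbol
   (taken from SYM) happens to be named "__worker_bcast" would be
   declared as:

     .shared .align 8 .u8 __worker_bcast[128];  */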
3922
3923 /* Write out the function declarations we've collected and declare storage
3924 for the broadcast and worker-reduction buffers. */
3925
3926 static void
3927 nvptx_file_end (void)
3928 {
3929 hash_table<tree_hasher>::iterator iter;
3930 tree decl;
3931 FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
3932 nvptx_record_fndecl (decl);
3933 fputs (func_decls.str().c_str(), asm_out_file);
3934
3935 if (worker_bcast_size)
3936 write_worker_buffer (asm_out_file, worker_bcast_sym,
3937 worker_bcast_align, worker_bcast_size);
3938
3939 if (worker_red_size)
3940 write_worker_buffer (asm_out_file, worker_red_sym,
3941 worker_red_align, worker_red_size);
3942 }
3943
3944 /* Expander for the shuffle builtins. */
3945
3946 static rtx
3947 nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
3948 {
3949 if (ignore)
3950 return target;
3951
3952 rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
3953 NULL_RTX, mode, EXPAND_NORMAL);
3954 if (!REG_P (src))
3955 src = copy_to_mode_reg (mode, src);
3956
3957 rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
3958 NULL_RTX, SImode, EXPAND_NORMAL);
3959 rtx op = expand_expr (CALL_EXPR_ARG (exp, 2),
3960 NULL_RTX, SImode, EXPAND_NORMAL);
3961
3962 if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
3963 idx = copy_to_mode_reg (SImode, idx);
3964
3965 rtx pat = nvptx_gen_shuffle (target, src, idx,
3966 (nvptx_shuffle_kind) INTVAL (op));
3967 if (pat)
3968 emit_insn (pat);
3969
3970 return target;
3971 }
3972
3973 /* Worker reduction address expander. */
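/* Besides yielding the address, expanding this builtin records the
   alignment and total size the worker reduction buffer must provide,
   which nvptx_file_end later uses to declare a large-enough buffer.  */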
3974
3975 static rtx
3976 nvptx_expand_worker_addr (tree exp, rtx target,
3977 machine_mode ARG_UNUSED (mode), int ignore)
3978 {
3979 if (ignore)
3980 return target;
3981
3982 unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
3983 if (align > worker_red_align)
3984 worker_red_align = align;
3985
3986 unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
3987 unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
3988 if (size + offset > worker_red_size)
3989 worker_red_size = size + offset;
3990
3991 rtx addr = worker_red_sym;
3992 if (offset)
3993 {
3994 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset));
3995 addr = gen_rtx_CONST (Pmode, addr);
3996 }
3997
3998 emit_move_insn (target, addr);
3999
4000 return target;
4001 }
4002
4003 /* Expand the CMP_SWAP PTX builtins. We have our own versions that do
4004 not require taking the address of any object, other than the memory
4005 cell being operated on. */
4006
4007 static rtx
4008 nvptx_expand_cmp_swap (tree exp, rtx target,
4009 machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
4010 {
4011 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
4012
4013 if (!target)
4014 target = gen_reg_rtx (mode);
4015
4016 rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
4017 NULL_RTX, Pmode, EXPAND_NORMAL);
4018 rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
4019 NULL_RTX, mode, EXPAND_NORMAL);
4020 rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
4021 NULL_RTX, mode, EXPAND_NORMAL);
4022 rtx pat;
4023
4024 mem = gen_rtx_MEM (mode, mem);
4025 if (!REG_P (cmp))
4026 cmp = copy_to_mode_reg (mode, cmp);
4027 if (!REG_P (src))
4028 src = copy_to_mode_reg (mode, src);
4029
4030 if (mode == SImode)
4031 pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
4032 else
4033 pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);
4034
4035 emit_insn (pat);
4036
4037 return target;
4038 }
4039
4041 /* Codes for all the NVPTX builtins. */
4042 enum nvptx_builtins
4043 {
4044 NVPTX_BUILTIN_SHUFFLE,
4045 NVPTX_BUILTIN_SHUFFLELL,
4046 NVPTX_BUILTIN_WORKER_ADDR,
4047 NVPTX_BUILTIN_CMP_SWAP,
4048 NVPTX_BUILTIN_CMP_SWAPLL,
4049 NVPTX_BUILTIN_MAX
4050 };
4051
4052 static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];
4053
4054 /* Return the NVPTX builtin for CODE. */
4055
4056 static tree
4057 nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
4058 {
4059 if (code >= NVPTX_BUILTIN_MAX)
4060 return error_mark_node;
4061
4062 return nvptx_builtin_decls[code];
4063 }
4064
4065 /* Set up all builtin functions for this target. */
4066
4067 static void
4068 nvptx_init_builtins (void)
4069 {
4070 #define DEF(ID, NAME, T) \
4071 (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID] \
4072 = add_builtin_function ("__builtin_nvptx_" NAME, \
4073 build_function_type_list T, \
4074 NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
4075 #define ST sizetype
4076 #define UINT unsigned_type_node
4077 #define LLUINT long_long_unsigned_type_node
4078 #define PTRVOID ptr_type_node
4079
4080 DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
4081 DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
4082 DEF (WORKER_ADDR, "worker_addr",
4083 (PTRVOID, ST, UINT, UINT, NULL_TREE));
4084 DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
4085 DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));
4086
4087 #undef DEF
4088 #undef ST
4089 #undef UINT
4090 #undef LLUINT
4091 #undef PTRVOID
4092 }
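/* As an illustration, the vector reduction code further down emits
   calls equivalent to

     __builtin_nvptx_shuffle (val, 16, SHUFFLE_DOWN);

   to fetch the value held by the thread 16 vector lanes away.  */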
4093
4094 /* Expand an expression EXP that calls a built-in function,
4095 with result going to TARGET if that's convenient
4096 (and in mode MODE if that's convenient).
4097 SUBTARGET may be used as the target for computing one of EXP's operands.
4098 IGNORE is nonzero if the value is to be ignored. */
4099
4100 static rtx
4101 nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
4102 machine_mode mode, int ignore)
4103 {
4104 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
4105 switch (DECL_FUNCTION_CODE (fndecl))
4106 {
4107 case NVPTX_BUILTIN_SHUFFLE:
4108 case NVPTX_BUILTIN_SHUFFLELL:
4109 return nvptx_expand_shuffle (exp, target, mode, ignore);
4110
4111 case NVPTX_BUILTIN_WORKER_ADDR:
4112 return nvptx_expand_worker_addr (exp, target, mode, ignore);
4113
4114 case NVPTX_BUILTIN_CMP_SWAP:
4115 case NVPTX_BUILTIN_CMP_SWAPLL:
4116 return nvptx_expand_cmp_swap (exp, target, mode, ignore);
4117
4118 default: gcc_unreachable ();
4119 }
4120 }
4121 \f
4122 /* Define dimension sizes for known hardware. */
4123 #define PTX_VECTOR_LENGTH 32
4124 #define PTX_WORKER_LENGTH 32
4125 #define PTX_GANG_DEFAULT 32
4126
4127 /* Validate the compute dimensions of an OpenACC offloaded region or
4128 routine, and fill in non-unity defaults. FN_LEVEL indicates the
4129 level at which a routine might spawn a loop; it is negative for
4130 non-routines. If DECL is null, we are validating the default dimensions. */
4131
4132 static bool
4133 nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
4134 {
4135 bool changed = false;
4136
4137 /* The vector size must be 32, unless this is a SEQ routine. */
4138 if (fn_level <= GOMP_DIM_VECTOR && fn_level >= -1
4139 && dims[GOMP_DIM_VECTOR] >= 0
4140 && dims[GOMP_DIM_VECTOR] != PTX_VECTOR_LENGTH)
4141 {
4142 if (fn_level < 0 && dims[GOMP_DIM_VECTOR] >= 0)
4143 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
4144 dims[GOMP_DIM_VECTOR]
4145 ? "using vector_length (%d), ignoring %d"
4146 : "using vector_length (%d), ignoring runtime setting",
4147 PTX_VECTOR_LENGTH, dims[GOMP_DIM_VECTOR]);
4148 dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
4149 changed = true;
4150 }
4151
4152 /* Check that the number of workers is not too large. */
4153 if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
4154 {
4155 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
4156 "using num_workers (%d), ignoring %d",
4157 PTX_WORKER_LENGTH, dims[GOMP_DIM_WORKER]);
4158 dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
4159 changed = true;
4160 }
4161
4162 if (!decl)
4163 {
4164 dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
4165 if (dims[GOMP_DIM_WORKER] < 0)
4166 dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
4167 if (dims[GOMP_DIM_GANG] < 0)
4168 dims[GOMP_DIM_GANG] = PTX_GANG_DEFAULT;
4169 changed = true;
4170 }
4171
4172 return changed;
4173 }
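/* Thus validating the default dimensions (DECL is null) with no
   user-supplied sizes yields {32, 32, 32}: PTX_GANG_DEFAULT gangs of
   PTX_WORKER_LENGTH workers, each PTX_VECTOR_LENGTH (one warp) wide.  */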
4174
4175 /* Return maximum dimension size, or zero for unbounded. */
4176
4177 static int
4178 nvptx_dim_limit (int axis)
4179 {
4180 switch (axis)
4181 {
4182 case GOMP_DIM_WORKER:
4183 return PTX_WORKER_LENGTH;
4184
4185 case GOMP_DIM_VECTOR:
4186 return PTX_VECTOR_LENGTH;
4187
4188 default:
4189 break;
4190 }
4191 return 0;
4192 }
4193
4194 /* Determine whether fork & joins are needed. */
4195
4196 static bool
4197 nvptx_goacc_fork_join (gcall *call, const int dims[],
4198 bool ARG_UNUSED (is_fork))
4199 {
4200 tree arg = gimple_call_arg (call, 2);
4201 unsigned axis = TREE_INT_CST_LOW (arg);
4202
4203 /* We only care about worker and vector partitioning. */
4204 if (axis < GOMP_DIM_WORKER)
4205 return false;
4206
4207 /* If the size is 1, there's no partitioning. */
4208 if (dims[axis] == 1)
4209 return false;
4210
4211 return true;
4212 }
4213
4214 /* Generate a PTX builtin function call that returns the address in
4215 the worker reduction buffer at OFFSET. TYPE is the type of the
4216 data at that location. */
4217
4218 static tree
4219 nvptx_get_worker_red_addr (tree type, tree offset)
4220 {
4221 machine_mode mode = TYPE_MODE (type);
4222 tree fndecl = nvptx_builtin_decl (NVPTX_BUILTIN_WORKER_ADDR, true);
4223 tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
4224 tree align = build_int_cst (unsigned_type_node,
4225 GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
4226 tree call = build_call_expr (fndecl, 3, offset, size, align);
4227
4228 return fold_convert (build_pointer_type (type), call);
4229 }
4230
4231 /* Emit a SHFL.DOWN of VAR into DEST_VAR using shift amount SHIFT. This
4232 function will cast the variable if necessary. */
4233
4234 static void
4235 nvptx_generate_vector_shuffle (location_t loc,
4236 tree dest_var, tree var, unsigned shift,
4237 gimple_seq *seq)
4238 {
4239 unsigned fn = NVPTX_BUILTIN_SHUFFLE;
4240 tree_code code = NOP_EXPR;
4241 tree arg_type = unsigned_type_node;
4242 tree var_type = TREE_TYPE (var);
4243 tree dest_type = var_type;
4244
4245 if (TREE_CODE (var_type) == COMPLEX_TYPE)
4246 var_type = TREE_TYPE (var_type);
4247
4248 if (TREE_CODE (var_type) == REAL_TYPE)
4249 code = VIEW_CONVERT_EXPR;
4250
4251 if (TYPE_SIZE (var_type)
4252 == TYPE_SIZE (long_long_unsigned_type_node))
4253 {
4254 fn = NVPTX_BUILTIN_SHUFFLELL;
4255 arg_type = long_long_unsigned_type_node;
4256 }
4257
4258 tree call = nvptx_builtin_decl (fn, true);
4259 tree bits = build_int_cst (unsigned_type_node, shift);
4260 tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
4261 tree expr;
4262
4263 if (var_type != dest_type)
4264 {
4265 /* Do real and imaginary parts separately. */
4266 tree real = fold_build1 (REALPART_EXPR, var_type, var);
4267 real = fold_build1 (code, arg_type, real);
4268 real = build_call_expr_loc (loc, call, 3, real, bits, kind);
4269 real = fold_build1 (code, var_type, real);
4270
4271 tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
4272 imag = fold_build1 (code, arg_type, imag);
4273 imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
4274 imag = fold_build1 (code, var_type, imag);
4275
4276 expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
4277 }
4278 else
4279 {
4280 expr = fold_build1 (code, arg_type, var);
4281 expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
4282 expr = fold_build1 (code, dest_type, expr);
4283 }
4284
4285 gimplify_assign (dest_var, expr, seq);
4286 }
4287
4288 /* Lazily generate the global lock var decl and return its address. */
4289
4290 static tree
4291 nvptx_global_lock_addr ()
4292 {
4293 tree v = global_lock_var;
4294
4295 if (!v)
4296 {
4297 tree name = get_identifier ("__reduction_lock");
4298 tree type = build_qualified_type (unsigned_type_node,
4299 TYPE_QUAL_VOLATILE);
4300 v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
4301 global_lock_var = v;
4302 DECL_ARTIFICIAL (v) = 1;
4303 DECL_EXTERNAL (v) = 1;
4304 TREE_STATIC (v) = 1;
4305 TREE_PUBLIC (v) = 1;
4306 TREE_USED (v) = 1;
4307 mark_addressable (v);
4308 mark_decl_referenced (v);
4309 }
4310
4311 return build_fold_addr_expr (v);
4312 }
4313
4314 /* Insert code to locklessly update *PTR with *PTR OP VAR just before
4315 GSI. We use a lockless scheme for nearly all cases, which looks
4316 like:
4317 actual = initval(OP);
4318 do {
4319 guess = actual;
4320 write = guess OP myval;
4321 actual = cmp&swap (ptr, guess, write)
4322 } while (actual bit-different-to guess);
4323 return write;
4324
4325 This relies on a cmp&swap instruction, which is available for 32-
4326 and 64-bit types. Larger types must use a locking scheme. */
4327
4328 static tree
4329 nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
4330 tree ptr, tree var, tree_code op)
4331 {
4332 unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
4333 tree_code code = NOP_EXPR;
4334 tree arg_type = unsigned_type_node;
4335 tree var_type = TREE_TYPE (var);
4336
4337 if (TREE_CODE (var_type) == COMPLEX_TYPE
4338 || TREE_CODE (var_type) == REAL_TYPE)
4339 code = VIEW_CONVERT_EXPR;
4340
4341 if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
4342 {
4343 arg_type = long_long_unsigned_type_node;
4344 fn = NVPTX_BUILTIN_CMP_SWAPLL;
4345 }
4346
4347 tree swap_fn = nvptx_builtin_decl (fn, true);
4348
4349 gimple_seq init_seq = NULL;
4350 tree init_var = make_ssa_name (arg_type);
4351 tree init_expr = omp_reduction_init_op (loc, op, var_type);
4352 init_expr = fold_build1 (code, arg_type, init_expr);
4353 gimplify_assign (init_var, init_expr, &init_seq);
4354 gimple *init_end = gimple_seq_last (init_seq);
4355
4356 gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);
4357
4358 /* Split the block just after the init stmts. */
4359 basic_block pre_bb = gsi_bb (*gsi);
4360 edge pre_edge = split_block (pre_bb, init_end);
4361 basic_block loop_bb = pre_edge->dest;
4362 pre_bb = pre_edge->src;
4363 /* Reset the iterator. */
4364 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
4365
4366 tree expect_var = make_ssa_name (arg_type);
4367 tree actual_var = make_ssa_name (arg_type);
4368 tree write_var = make_ssa_name (arg_type);
4369
4370 /* Build and insert the reduction calculation. */
4371 gimple_seq red_seq = NULL;
4372 tree write_expr = fold_build1 (code, var_type, expect_var);
4373 write_expr = fold_build2 (op, var_type, write_expr, var);
4374 write_expr = fold_build1 (code, arg_type, write_expr);
4375 gimplify_assign (write_var, write_expr, &red_seq);
4376
4377 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
4378
4379 /* Build & insert the cmp&swap sequence. */
4380 gimple_seq latch_seq = NULL;
4381 tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
4382 ptr, expect_var, write_var);
4383 gimplify_assign (actual_var, swap_expr, &latch_seq);
4384
4385 gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
4386 NULL_TREE, NULL_TREE);
4387 gimple_seq_add_stmt (&latch_seq, cond);
4388
4389 gimple *latch_end = gimple_seq_last (latch_seq);
4390 gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);
4391
4392 /* Split the block just after the latch stmts. */
4393 edge post_edge = split_block (loop_bb, latch_end);
4394 basic_block post_bb = post_edge->dest;
4395 loop_bb = post_edge->src;
4396 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
4397
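/* Turn the fallthrough edge out of the latch into the (true) loop
   exit, and add the false back edge, forming the do-while loop
   sketched in the function comment.  */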
4398 post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
4399 edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
4400 set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
4401 set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);
4402
4403 gphi *phi = create_phi_node (expect_var, loop_bb);
4404 add_phi_arg (phi, init_var, pre_edge, loc);
4405 add_phi_arg (phi, actual_var, loop_edge, loc);
4406
4407 loop *loop = alloc_loop ();
4408 loop->header = loop_bb;
4409 loop->latch = loop_bb;
4410 add_loop (loop, loop_bb->loop_father);
4411
4412 return fold_build1 (code, var_type, write_var);
4413 }
4414
4415 /* Insert code to lockfully update *PTR with *PTR OP VAR just before
4416 GSI. This is necessary for types larger than 64 bits, where there
4417 is no cmp&swap instruction to implement a lockless scheme. We use
4418 a lock variable in global memory.
4419
4420 while (cmp&swap (&lock_var, 0, 1))
4421 continue;
4422 T accum = *ptr;
4423 accum = accum OP var;
4424 *ptr = accum;
4425 cmp&swap (&lock_var, 1, 0);
4426 return accum;
4427
4428 A lock in global memory is necessary to force execution engine
4429 descheduling and avoid resource starvation that can occur if the
4430 lock is in .shared memory. */
4431
4432 static tree
4433 nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
4434 tree ptr, tree var, tree_code op)
4435 {
4436 tree var_type = TREE_TYPE (var);
4437 tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
4438 tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
4439 tree uns_locked = build_int_cst (unsigned_type_node, 1);
4440
4441 /* Split the block just before the gsi. Insert a gimple nop to make
4442 this easier. */
4443 gimple *nop = gimple_build_nop ();
4444 gsi_insert_before (gsi, nop, GSI_SAME_STMT);
4445 basic_block entry_bb = gsi_bb (*gsi);
4446 edge entry_edge = split_block (entry_bb, nop);
4447 basic_block lock_bb = entry_edge->dest;
4448 /* Reset the iterator. */
4449 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
4450
4451 /* Build and insert the locking sequence. */
4452 gimple_seq lock_seq = NULL;
4453 tree lock_var = make_ssa_name (unsigned_type_node);
4454 tree lock_expr = nvptx_global_lock_addr ();
4455 lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
4456 uns_unlocked, uns_locked);
4457 gimplify_assign (lock_var, lock_expr, &lock_seq);
4458 gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
4459 NULL_TREE, NULL_TREE);
4460 gimple_seq_add_stmt (&lock_seq, cond);
4461 gimple *lock_end = gimple_seq_last (lock_seq);
4462 gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);
4463
4464 /* Split the block just after the lock sequence. */
4465 edge locked_edge = split_block (lock_bb, lock_end);
4466 basic_block update_bb = locked_edge->dest;
4467 lock_bb = locked_edge->src;
4468 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
4469
4470 /* Create the lock loop ... */
4471 locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
4472 make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
4473 set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
4474 set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);
4475
4476 /* ... and the loop structure. */
4477 loop *lock_loop = alloc_loop ();
4478 lock_loop->header = lock_bb;
4479 lock_loop->latch = lock_bb;
4480 lock_loop->nb_iterations_estimate = 1;
4481 lock_loop->any_estimate = true;
4482 add_loop (lock_loop, entry_bb->loop_father);
4483
4484 /* Build and insert the reduction calculation. */
4485 gimple_seq red_seq = NULL;
4486 tree acc_in = make_ssa_name (var_type);
4487 tree ref_in = build_simple_mem_ref (ptr);
4488 TREE_THIS_VOLATILE (ref_in) = 1;
4489 gimplify_assign (acc_in, ref_in, &red_seq);
4490
4491 tree acc_out = make_ssa_name (var_type);
4492 tree update_expr = fold_build2 (op, var_type, ref_in, var);
4493 gimplify_assign (acc_out, update_expr, &red_seq);
4494
4495 tree ref_out = build_simple_mem_ref (ptr);
4496 TREE_THIS_VOLATILE (ref_out) = 1;
4497 gimplify_assign (ref_out, acc_out, &red_seq);
4498
4499 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
4500
4501 /* Build & insert the unlock sequence. */
4502 gimple_seq unlock_seq = NULL;
4503 tree unlock_expr = nvptx_global_lock_addr ();
4504 unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
4505 uns_locked, uns_unlocked);
4506 gimplify_and_add (unlock_expr, &unlock_seq);
4507 gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);
4508
4509 return acc_out;
4510 }
4511
4512 /* Emit a sequence to update a reduction accumulator at *PTR with the
4513 value held in VAR using operator OP. Return the updated value.
4514
4515 TODO: optimize for atomic ops and independent complex ops. */
4516
4517 static tree
4518 nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
4519 tree ptr, tree var, tree_code op)
4520 {
4521 tree type = TREE_TYPE (var);
4522 tree size = TYPE_SIZE (type);
4523
4524 if (size == TYPE_SIZE (unsigned_type_node)
4525 || size == TYPE_SIZE (long_long_unsigned_type_node))
4526 return nvptx_lockless_update (loc, gsi, ptr, var, op);
4527 else
4528 return nvptx_lockfull_update (loc, gsi, ptr, var, op);
4529 }
4530
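/* The four functions below implement the internal GOACC_REDUCTION
   function for its SETUP, INIT, FINI and TEARDOWN variants; one such
   call is emitted at each end of a partitioned region that carries a
   reduction.  */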
4531 /* NVPTX implementation of GOACC_REDUCTION_SETUP. */
4532
4533 static void
4534 nvptx_goacc_reduction_setup (gcall *call)
4535 {
4536 gimple_stmt_iterator gsi = gsi_for_stmt (call);
4537 tree lhs = gimple_call_lhs (call);
4538 tree var = gimple_call_arg (call, 2);
4539 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
4540 gimple_seq seq = NULL;
4541
4542 push_gimplify_context (true);
4543
4544 if (level != GOMP_DIM_GANG)
4545 {
4546 /* Copy the receiver object. */
4547 tree ref_to_res = gimple_call_arg (call, 1);
4548
4549 if (!integer_zerop (ref_to_res))
4550 var = build_simple_mem_ref (ref_to_res);
4551 }
4552
4553 if (level == GOMP_DIM_WORKER)
4554 {
4555 /* Store incoming value to worker reduction buffer. */
4556 tree offset = gimple_call_arg (call, 5);
4557 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
4558 tree ptr = make_ssa_name (TREE_TYPE (call));
4559
4560 gimplify_assign (ptr, call, &seq);
4561 tree ref = build_simple_mem_ref (ptr);
4562 TREE_THIS_VOLATILE (ref) = 1;
4563 gimplify_assign (ref, var, &seq);
4564 }
4565
4566 if (lhs)
4567 gimplify_assign (lhs, var, &seq);
4568
4569 pop_gimplify_context (NULL);
4570 gsi_replace_with_seq (&gsi, seq, true);
4571 }
4572
4573 /* NVPTX implementation of GOACC_REDUCTION_INIT. */
4574
4575 static void
4576 nvptx_goacc_reduction_init (gcall *call)
4577 {
4578 gimple_stmt_iterator gsi = gsi_for_stmt (call);
4579 tree lhs = gimple_call_lhs (call);
4580 tree var = gimple_call_arg (call, 2);
4581 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
4582 enum tree_code rcode
4583 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
4584 tree init = omp_reduction_init_op (gimple_location (call), rcode,
4585 TREE_TYPE (var));
4586 gimple_seq seq = NULL;
4587
4588 push_gimplify_context (true);
4589
4590 if (level == GOMP_DIM_VECTOR)
4591 {
4592 /* Initialize all lanes except vector lane zero to INIT_VAL (OP). */
4593 tree tid = make_ssa_name (integer_type_node);
4594 tree dim_vector = gimple_call_arg (call, 3);
4595 gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
4596 dim_vector);
4597 gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
4598 NULL_TREE, NULL_TREE);
4599
4600 gimple_call_set_lhs (tid_call, tid);
4601 gimple_seq_add_stmt (&seq, tid_call);
4602 gimple_seq_add_stmt (&seq, cond_stmt);
4603
4604 /* Split the block just after the call. */
4605 edge init_edge = split_block (gsi_bb (gsi), call);
4606 basic_block init_bb = init_edge->dest;
4607 basic_block call_bb = init_edge->src;
4608
4609 /* Fixup flags from call_bb to init_bb. */
4610 init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
4611
4612 /* Set the initialization stmts. */
4613 gimple_seq init_seq = NULL;
4614 tree init_var = make_ssa_name (TREE_TYPE (var));
4615 gimplify_assign (init_var, init, &init_seq);
4616 gsi = gsi_start_bb (init_bb);
4617 gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);
4618
4619 /* Split block just after the init stmt. */
4620 gsi_prev (&gsi);
4621 edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
4622 basic_block dst_bb = inited_edge->dest;
4623
4624 /* Create false edge from call_bb to dst_bb. */
4625 edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);
4626
4627 /* Create phi node in dst block. */
4628 gphi *phi = create_phi_node (lhs, dst_bb);
4629 add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
4630 add_phi_arg (phi, var, nop_edge, gimple_location (call));
4631
4632 /* Reset dominator of dst bb. */
4633 set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);
4634
4635 /* Reset the gsi. */
4636 gsi = gsi_for_stmt (call);
4637 }
4638 else
4639 {
4640 if (level == GOMP_DIM_GANG)
4641 {
4642 /* If there's no receiver object, propagate the incoming VAR. */
4643 tree ref_to_res = gimple_call_arg (call, 1);
4644 if (integer_zerop (ref_to_res))
4645 init = var;
4646 }
4647
4648 gimplify_assign (lhs, init, &seq);
4649 }
4650
4651 pop_gimplify_context (NULL);
4652 gsi_replace_with_seq (&gsi, seq, true);
4653 }
4654
4655 /* NVPTX implementation of GOACC_REDUCTION_FINI. */
4656
4657 static void
4658 nvptx_goacc_reduction_fini (gcall *call)
4659 {
4660 gimple_stmt_iterator gsi = gsi_for_stmt (call);
4661 tree lhs = gimple_call_lhs (call);
4662 tree ref_to_res = gimple_call_arg (call, 1);
4663 tree var = gimple_call_arg (call, 2);
4664 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
4665 enum tree_code op
4666 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
4667 gimple_seq seq = NULL;
4668 tree r = NULL_TREE;
4669
4670 push_gimplify_context (true);
4671
4672 if (level == GOMP_DIM_VECTOR)
4673 {
4674 /* Emit binary shuffle tree. TODO: emit this as an actual loop,
4675 but that requires a method of emitting a unified jump at the
4676 gimple level. */
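/* With PTX_VECTOR_LENGTH == 32 this shuffles at lane distances 16, 8,
   4, 2 and 1, after which vector lane 0 holds the fully reduced
   value.  */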
4677 for (int shfl = PTX_VECTOR_LENGTH / 2; shfl > 0; shfl = shfl >> 1)
4678 {
4679 tree other_var = make_ssa_name (TREE_TYPE (var));
4680 nvptx_generate_vector_shuffle (gimple_location (call),
4681 other_var, var, shfl, &seq);
4682
4683 r = make_ssa_name (TREE_TYPE (var));
4684 gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
4685 var, other_var), &seq);
4686 var = r;
4687 }
4688 }
4689 else
4690 {
4691 tree accum = NULL_TREE;
4692
4693 if (level == GOMP_DIM_WORKER)
4694 {
4695 /* Get reduction buffer address. */
4696 tree offset = gimple_call_arg (call, 5);
4697 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
4698 tree ptr = make_ssa_name (TREE_TYPE (call));
4699
4700 gimplify_assign (ptr, call, &seq);
4701 accum = ptr;
4702 }
4703 else if (integer_zerop (ref_to_res))
4704 r = var;
4705 else
4706 accum = ref_to_res;
4707
4708 if (accum)
4709 {
4710 /* Update the accumulator. */
4711 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
4712 seq = NULL;
4713 r = nvptx_reduction_update (gimple_location (call), &gsi,
4714 accum, var, op);
4715 }
4716 }
4717
4718 if (lhs)
4719 gimplify_assign (lhs, r, &seq);
4720 pop_gimplify_context (NULL);
4721
4722 gsi_replace_with_seq (&gsi, seq, true);
4723 }
4724
4725 /* NVPTX implementation of GOACC_REDUCTION_TEARDOWN. */
4726
4727 static void
4728 nvptx_goacc_reduction_teardown (gcall *call)
4729 {
4730 gimple_stmt_iterator gsi = gsi_for_stmt (call);
4731 tree lhs = gimple_call_lhs (call);
4732 tree var = gimple_call_arg (call, 2);
4733 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
4734 gimple_seq seq = NULL;
4735
4736 push_gimplify_context (true);
4737 if (level == GOMP_DIM_WORKER)
4738 {
4739 /* Read the worker reduction buffer. */
4740 tree offset = gimple_call_arg (call, 5);
4741 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
4742 tree ptr = make_ssa_name (TREE_TYPE (call));
4743
4744 gimplify_assign (ptr, call, &seq);
4745 var = build_simple_mem_ref (ptr);
4746 TREE_THIS_VOLATILE (var) = 1;
4747 }
4748
4749 if (level != GOMP_DIM_GANG)
4750 {
4751 /* Write to the receiver object. */
4752 tree ref_to_res = gimple_call_arg (call, 1);
4753
4754 if (!integer_zerop (ref_to_res))
4755 gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
4756 }
4757
4758 if (lhs)
4759 gimplify_assign (lhs, var, &seq);
4760
4761 pop_gimplify_context (NULL);
4762
4763 gsi_replace_with_seq (&gsi, seq, true);
4764 }
4765
4766 /* NVPTX reduction expander. */
4767
4768 static void
4769 nvptx_goacc_reduction (gcall *call)
4770 {
4771 unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
4772
4773 switch (code)
4774 {
4775 case IFN_GOACC_REDUCTION_SETUP:
4776 nvptx_goacc_reduction_setup (call);
4777 break;
4778
4779 case IFN_GOACC_REDUCTION_INIT:
4780 nvptx_goacc_reduction_init (call);
4781 break;
4782
4783 case IFN_GOACC_REDUCTION_FINI:
4784 nvptx_goacc_reduction_fini (call);
4785 break;
4786
4787 case IFN_GOACC_REDUCTION_TEARDOWN:
4788 nvptx_goacc_reduction_teardown (call);
4789 break;
4790
4791 default:
4792 gcc_unreachable ();
4793 }
4794 }
4795
4796 #undef TARGET_OPTION_OVERRIDE
4797 #define TARGET_OPTION_OVERRIDE nvptx_option_override
4798
4799 #undef TARGET_ATTRIBUTE_TABLE
4800 #define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table
4801
4802 #undef TARGET_LEGITIMATE_ADDRESS_P
4803 #define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p
4804
4805 #undef TARGET_PROMOTE_FUNCTION_MODE
4806 #define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode
4807
4808 #undef TARGET_FUNCTION_ARG
4809 #define TARGET_FUNCTION_ARG nvptx_function_arg
4810 #undef TARGET_FUNCTION_INCOMING_ARG
4811 #define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
4812 #undef TARGET_FUNCTION_ARG_ADVANCE
4813 #define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
4814 #undef TARGET_PASS_BY_REFERENCE
4815 #define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
4816 #undef TARGET_FUNCTION_VALUE_REGNO_P
4817 #define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
4818 #undef TARGET_FUNCTION_VALUE
4819 #define TARGET_FUNCTION_VALUE nvptx_function_value
4820 #undef TARGET_LIBCALL_VALUE
4821 #define TARGET_LIBCALL_VALUE nvptx_libcall_value
4822 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
4823 #define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
4824 #undef TARGET_GET_DRAP_RTX
4825 #define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
4826 #undef TARGET_SPLIT_COMPLEX_ARG
4827 #define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
4828 #undef TARGET_RETURN_IN_MEMORY
4829 #define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
4830 #undef TARGET_OMIT_STRUCT_RETURN_REG
4831 #define TARGET_OMIT_STRUCT_RETURN_REG true
4832 #undef TARGET_STRICT_ARGUMENT_NAMING
4833 #define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
4834 #undef TARGET_CALL_ARGS
4835 #define TARGET_CALL_ARGS nvptx_call_args
4836 #undef TARGET_END_CALL_ARGS
4837 #define TARGET_END_CALL_ARGS nvptx_end_call_args
4838
4839 #undef TARGET_ASM_FILE_START
4840 #define TARGET_ASM_FILE_START nvptx_file_start
4841 #undef TARGET_ASM_FILE_END
4842 #define TARGET_ASM_FILE_END nvptx_file_end
4843 #undef TARGET_ASM_GLOBALIZE_LABEL
4844 #define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
4845 #undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
4846 #define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
4847 #undef TARGET_PRINT_OPERAND
4848 #define TARGET_PRINT_OPERAND nvptx_print_operand
4849 #undef TARGET_PRINT_OPERAND_ADDRESS
4850 #define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
4851 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
4852 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
4853 #undef TARGET_ASM_INTEGER
4854 #define TARGET_ASM_INTEGER nvptx_assemble_integer
4855 #undef TARGET_ASM_DECL_END
4856 #define TARGET_ASM_DECL_END nvptx_assemble_decl_end
4857 #undef TARGET_ASM_DECLARE_CONSTANT_NAME
4858 #define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
4859 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
4860 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
4861 #undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
4862 #define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true
4863
4864 #undef TARGET_MACHINE_DEPENDENT_REORG
4865 #define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
4866 #undef TARGET_NO_REGISTER_ALLOCATION
4867 #define TARGET_NO_REGISTER_ALLOCATION true
4868
4869 #undef TARGET_ENCODE_SECTION_INFO
4870 #define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info
4871 #undef TARGET_RECORD_OFFLOAD_SYMBOL
4872 #define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol
4873
4874 #undef TARGET_VECTOR_ALIGNMENT
4875 #define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment
4876
4877 #undef TARGET_CANNOT_COPY_INSN_P
4878 #define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p
4879
4880 #undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
4881 #define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p
4882
4883 #undef TARGET_INIT_BUILTINS
4884 #define TARGET_INIT_BUILTINS nvptx_init_builtins
4885 #undef TARGET_EXPAND_BUILTIN
4886 #define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
4887 #undef TARGET_BUILTIN_DECL
4888 #define TARGET_BUILTIN_DECL nvptx_builtin_decl
4889
4890 #undef TARGET_GOACC_VALIDATE_DIMS
4891 #define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims
4892
4893 #undef TARGET_GOACC_DIM_LIMIT
4894 #define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit
4895
4896 #undef TARGET_GOACC_FORK_JOIN
4897 #define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join
4898
4899 #undef TARGET_GOACC_REDUCTION
4900 #define TARGET_GOACC_REDUCTION nvptx_goacc_reduction
4901
4902 struct gcc_target targetm = TARGET_INITIALIZER;
4903
4904 #include "gt-nvptx.h"