1 /* Target code for NVPTX.
2 Copyright (C) 2014-2015 Free Software Foundation, Inc.
3 Contributed by Bernd Schmidt <bernds@codesourcery.com>
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published
9 by the Free Software Foundation; either version 3, or (at your
10 option) any later version.
12 GCC is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
15 License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
24 #include "coretypes.h"
37 #include "diagnostic.h"
39 #include "insn-flags.h"
41 #include "insn-attr.h"
50 #include "tm-constrs.h"
51 #include "langhooks.h"
55 #include "stor-layout.h"
58 #include "gomp-constants.h"
60 #include "internal-fn.h"
61 #include "gimple-iterator.h"
62 #include "stringpool.h"
63 #include "tree-ssa-operands.h"
64 #include "tree-ssanames.h"
66 #include "tree-phinodes.h"
68 #include "fold-const.h"
70 /* This file should be included last. */
71 #include "target-def.h"
74 #define SHUFFLE_DOWN 1
75 #define SHUFFLE_BFLY 2
78 /* Record the function decls we've written, and the libfuncs and function
79 decls corresponding to them. */
80 static std::stringstream func_decls
;
82 struct declared_libfunc_hasher
: ggc_cache_ptr_hash
<rtx_def
>
84 static hashval_t
hash (rtx x
) { return htab_hash_pointer (x
); }
85 static bool equal (rtx a
, rtx b
) { return a
== b
; }
89 hash_table
<declared_libfunc_hasher
> *declared_libfuncs_htab
;
91 struct tree_hasher
: ggc_cache_ptr_hash
<tree_node
>
93 static hashval_t
hash (tree t
) { return htab_hash_pointer (t
); }
94 static bool equal (tree a
, tree b
) { return a
== b
; }
97 static GTY((cache
)) hash_table
<tree_hasher
> *declared_fndecls_htab
;
98 static GTY((cache
)) hash_table
<tree_hasher
> *needed_fndecls_htab
;
100 /* Buffer needed to broadcast across workers. This is used for both
101 worker-neutering and worker broadcasting. It is shared by all
102 functions emitted. The buffer is placed in shared memory. It'd be
103 nice if PTX supported common blocks, because then this could be
104 shared across TUs (taking the largest size). */
105 static unsigned worker_bcast_size
;
106 static unsigned worker_bcast_align
;
107 #define worker_bcast_name "__worker_bcast"
108 static GTY(()) rtx worker_bcast_sym
;
110 /* Buffer needed for worker reductions. This has to be distinct from
111 the worker broadcast array, as both may be live concurrently. */
112 static unsigned worker_red_size
;
113 static unsigned worker_red_align
;
114 #define worker_red_name "__worker_red"
115 static GTY(()) rtx worker_red_sym
;
117 /* Global lock variable, needed for 128bit worker & gang reductions. */
118 static GTY(()) tree global_lock_var
;
120 /* Allocate a new, cleared machine_function structure. */
122 static struct machine_function
*
123 nvptx_init_machine_status (void)
125 struct machine_function
*p
= ggc_cleared_alloc
<machine_function
> ();
126 p
->ret_reg_mode
= VOIDmode
;
130 /* Implement TARGET_OPTION_OVERRIDE. */
133 nvptx_option_override (void)
135 init_machine_status
= nvptx_init_machine_status
;
136 /* Gives us a predictable order, which we need especially for variables. */
137 flag_toplevel_reorder
= 1;
138 /* Assumes that it will see only hard registers. */
139 flag_var_tracking
= 0;
140 write_symbols
= NO_DEBUG
;
141 debug_info_level
= DINFO_LEVEL_NONE
;
143 if (nvptx_optimize
< 0)
144 nvptx_optimize
= optimize
> 0;
146 declared_fndecls_htab
= hash_table
<tree_hasher
>::create_ggc (17);
147 needed_fndecls_htab
= hash_table
<tree_hasher
>::create_ggc (17);
148 declared_libfuncs_htab
149 = hash_table
<declared_libfunc_hasher
>::create_ggc (17);
151 worker_bcast_sym
= gen_rtx_SYMBOL_REF (Pmode
, worker_bcast_name
);
152 worker_bcast_align
= GET_MODE_ALIGNMENT (SImode
) / BITS_PER_UNIT
;
154 worker_red_sym
= gen_rtx_SYMBOL_REF (Pmode
, worker_red_name
);
155 worker_red_align
= GET_MODE_ALIGNMENT (SImode
) / BITS_PER_UNIT
;
158 /* Return the mode to be used when declaring a ptx object for OBJ.
159 For objects with subparts such as complex modes this is the mode
163 nvptx_underlying_object_mode (rtx obj
)
165 if (GET_CODE (obj
) == SUBREG
)
166 obj
= SUBREG_REG (obj
);
167 machine_mode mode
= GET_MODE (obj
);
170 if (COMPLEX_MODE_P (mode
))
171 return GET_MODE_INNER (mode
);
175 /* Return a ptx type for MODE. If PROMOTE, then use .u32 for QImode to
176 deal with ptx idiosyncrasies.  */
179 nvptx_ptx_type_from_mode (machine_mode mode
, bool promote
)
209 /* Determine the address space to use for SYMBOL_REF SYM. */
212 nvptx_addr_space_from_sym (rtx sym
)
214 tree decl
= SYMBOL_REF_DECL (sym
);
215 if (decl
== NULL_TREE
|| TREE_CODE (decl
) == FUNCTION_DECL
)
216 return ADDR_SPACE_GENERIC
;
218 bool is_const
= (CONSTANT_CLASS_P (decl
)
219 || TREE_CODE (decl
) == CONST_DECL
220 || TREE_READONLY (decl
));
222 return ADDR_SPACE_CONST
;
224 return ADDR_SPACE_GLOBAL
;
/* Check NAME for special function names and redirect them by returning a
   replacement.  This applies to malloc, free and realloc, for which we
   want to use libgcc wrappers, and call, which triggers a bug in
   ptxas.  We can't use TARGET_MANGLE_DECL_ASSEMBLER_NAME, as that's
   not active in an offload compiler -- the names are all set by the
   host-side compiler.  Returns NAME unchanged if no replacement
   applies.  */

static const char *
nvptx_name_replacement (const char *name)
{
  if (strcmp (name, "call") == 0)
    return "__nvptx_call";
  if (strcmp (name, "malloc") == 0)
    return "__nvptx_malloc";
  if (strcmp (name, "free") == 0)
    return "__nvptx_free";
  if (strcmp (name, "realloc") == 0)
    return "__nvptx_realloc";
  /* Anything else passes through untouched.  */
  return name;
}
245 /* If MODE should be treated as two registers of an inner mode, return
246 that inner mode. Otherwise return VOIDmode. */
249 maybe_split_mode (machine_mode mode
)
251 if (COMPLEX_MODE_P (mode
))
252 return GET_MODE_INNER (mode
);
260 /* Emit forking instructions for MASK. */
263 nvptx_emit_forking (unsigned mask
, bool is_call
)
265 mask
&= (GOMP_DIM_MASK (GOMP_DIM_WORKER
)
266 | GOMP_DIM_MASK (GOMP_DIM_VECTOR
));
269 rtx op
= GEN_INT (mask
| (is_call
<< GOMP_DIM_MAX
));
271 /* Emit fork at all levels. This helps form SESE regions, as
272 it creates a block with a single successor before entering a
273 partitooned region. That is a good candidate for the end of
276 emit_insn (gen_nvptx_fork (op
));
277 emit_insn (gen_nvptx_forked (op
));
281 /* Emit joining instructions for MASK. */
284 nvptx_emit_joining (unsigned mask
, bool is_call
)
286 mask
&= (GOMP_DIM_MASK (GOMP_DIM_WORKER
)
287 | GOMP_DIM_MASK (GOMP_DIM_VECTOR
));
290 rtx op
= GEN_INT (mask
| (is_call
<< GOMP_DIM_MAX
));
292 /* Emit joining for all non-call pars to ensure there's a single
293 predecessor for the block the join insn ends up in. This is
294 needed for skipping entire loops. */
296 emit_insn (gen_nvptx_joining (op
));
297 emit_insn (gen_nvptx_join (op
));
301 #define PASS_IN_REG_P(MODE, TYPE) \
302 ((GET_MODE_CLASS (MODE) == MODE_INT \
303 || GET_MODE_CLASS (MODE) == MODE_FLOAT \
304 || ((GET_MODE_CLASS (MODE) == MODE_COMPLEX_INT \
305 || GET_MODE_CLASS (MODE) == MODE_COMPLEX_FLOAT) \
306 && !AGGREGATE_TYPE_P (TYPE))) \
309 #define RETURN_IN_REG_P(MODE) \
310 ((GET_MODE_CLASS (MODE) == MODE_INT \
311 || GET_MODE_CLASS (MODE) == MODE_FLOAT) \
312 && GET_MODE_SIZE (MODE) <= 8)
314 /* Perform a mode promotion for a function argument with MODE. Return
315 the promoted mode. */
318 arg_promotion (machine_mode mode
)
320 if (mode
== QImode
|| mode
== HImode
)
325 /* Write the declaration of a function arg of TYPE to S. I is the index
326 of the argument, MODE its mode. NO_ARG_TYPES is true if this is for
327 a decl with zero TYPE_ARG_TYPES, i.e. an old-style C decl. */
330 write_one_arg (std::stringstream
&s
, const char *sep
, int i
,
331 tree type
, machine_mode mode
, bool no_arg_types
)
333 if (!PASS_IN_REG_P (mode
, type
))
336 machine_mode split
= maybe_split_mode (mode
);
337 if (split
!= VOIDmode
)
339 i
= write_one_arg (s
, sep
, i
, TREE_TYPE (type
), split
, false);
344 if (no_arg_types
&& !AGGREGATE_TYPE_P (type
))
348 mode
= arg_promotion (mode
);
352 s
<< ".param" << nvptx_ptx_type_from_mode (mode
, false) << " %in_ar"
353 << i
<< (mode
== QImode
|| mode
== HImode
? "[1]" : "");
355 s
<< "[" << int_size_in_bytes (type
) << "]";
359 /* Look for attributes in ATTRS that would indicate we must write a function
360 as a .entry kernel rather than a .func. Return true if one is found. */
363 write_as_kernel (tree attrs
)
365 return (lookup_attribute ("kernel", attrs
) != NULL_TREE
366 || lookup_attribute ("omp target entrypoint", attrs
) != NULL_TREE
);
369 /* Write a .func or .kernel declaration or definition along with
370 a helper comment for use by ld. S is the stream to write to, DECL
371 the decl for the function with name NAME. For definitions, emit
372 a declaration too. */
375 write_fn_proto (std::stringstream
&s
, bool is_defn
,
376 const char *name
, const_tree decl
)
379 /* Emit a declaration. The PTX assembler gets upset without it. */
380 name
= write_fn_proto (s
, false, name
, decl
);
383 /* Avoid repeating the name replacement. */
384 name
= nvptx_name_replacement (name
);
389 /* Emit the linker marker. */
391 if (TREE_PUBLIC (decl
))
393 s
<< " FUNCTION " << (is_defn
? "DEF" : "DECL") << ": " << name
<< "\n";
395 /* PTX declaration. */
396 if (DECL_EXTERNAL (decl
))
398 else if (TREE_PUBLIC (decl
))
399 s
<< (DECL_WEAK (decl
) ? ".weak " : ".visible ");
400 s
<< (write_as_kernel (DECL_ATTRIBUTES (decl
)) ? ".entry " : ".func ");
402 tree fntype
= TREE_TYPE (decl
);
403 tree result_type
= TREE_TYPE (fntype
);
405 /* Declare the result. */
406 bool return_in_mem
= false;
407 if (TYPE_MODE (result_type
) != VOIDmode
)
409 machine_mode mode
= TYPE_MODE (result_type
);
410 if (!RETURN_IN_REG_P (mode
))
411 return_in_mem
= true;
414 mode
= arg_promotion (mode
);
415 s
<< "(.param" << nvptx_ptx_type_from_mode (mode
, false)
422 const char *sep
= " (";
425 /* Emit argument list. */
428 s
<< sep
<< ".param.u" << GET_MODE_BITSIZE (Pmode
) << " %in_ar0";
434 NULL in TYPE_ARG_TYPES, for old-style functions
435 NULL in DECL_ARGUMENTS, for builtin functions without another
437 So we have to pick the best one we have. */
438 tree args
= TYPE_ARG_TYPES (fntype
);
439 bool null_type_args
= !args
;
441 args
= DECL_ARGUMENTS (decl
);
443 for (; args
; args
= TREE_CHAIN (args
))
445 tree type
= null_type_args
? TREE_TYPE (args
) : TREE_VALUE (args
);
446 machine_mode mode
= TYPE_MODE (type
);
448 if (mode
== VOIDmode
)
450 i
= write_one_arg (s
, sep
, i
, type
, mode
, null_type_args
);
454 if (stdarg_p (fntype
))
456 s
<< sep
<< ".param.u" << GET_MODE_BITSIZE (Pmode
) << " %in_argp";
461 if (DECL_STATIC_CHAIN (decl
))
463 s
<< sep
<< ".reg.u" << GET_MODE_BITSIZE (Pmode
)
464 << reg_names
[STATIC_CHAIN_REGNUM
];
469 if (!i
&& strcmp (name
, "main") == 0)
472 << ".param.u32 %argc, .param.u" << GET_MODE_BITSIZE (Pmode
)
481 s
<< (is_defn
? "\n" : ";\n");
486 /* Construct a function declaration from a call insn. This can be
487 necessary for two reasons - either we have an indirect call which
488 requires a .callprototype declaration, or we have a libcall
489 generated by emit_library_call for which no decl exists. */
492 write_fn_proto_from_insn (std::stringstream
&s
, const char *name
,
497 s
<< "\t.callprototype ";
502 name
= nvptx_name_replacement (name
);
503 s
<< "\n// BEGIN GLOBAL FUNCTION DECL: " << name
<< "\n";
504 s
<< "\t.extern .func ";
507 if (result
!= NULL_RTX
)
509 << nvptx_ptx_type_from_mode (arg_promotion (GET_MODE (result
)), false)
514 const char *sep
= " (";
515 int arg_end
= XVECLEN (pat
, 0);
516 for (int i
= 1; i
< arg_end
; i
++)
518 /* We don't have to deal with mode splitting here, as that was
519 already done when generating the call sequence. */
520 machine_mode mode
= GET_MODE (XEXP (XVECEXP (pat
, 0, i
), 0));
524 << nvptx_ptx_type_from_mode (mode
, false)
527 if (mode
== QImode
|| mode
== HImode
)
536 /* DECL is an external FUNCTION_DECL, make sure it's in the fndecl hash
537 table and write a ptx prototype.  These are emitted at end of
541 nvptx_record_fndecl (tree decl
)
543 tree
*slot
= declared_fndecls_htab
->find_slot (decl
, INSERT
);
547 const char *name
= get_fnname_from_decl (decl
);
548 write_fn_proto (func_decls
, false, name
, decl
);
552 /* Record a libcall or unprototyped external function. CALLEE is the
553 SYMBOL_REF. Insert into the libfunc hash table and emit a ptx
554 declaration for it. */
557 nvptx_record_libfunc (rtx callee
, rtx retval
, rtx pat
)
559 rtx
*slot
= declared_libfuncs_htab
->find_slot (callee
, INSERT
);
564 const char *name
= XSTR (callee
, 0);
565 write_fn_proto_from_insn (func_decls
, name
, retval
, pat
);
569 /* DECL is an external FUNCTION_DECL, that we're referencing. If it
570 is prototyped, record it now. Otherwise record it as needed at end
571 of compilation, when we might have more information about it. */
574 nvptx_record_needed_fndecl (tree decl
)
576 if (TYPE_ARG_TYPES (TREE_TYPE (decl
)) == NULL_TREE
)
578 tree
*slot
= needed_fndecls_htab
->find_slot (decl
, INSERT
);
583 nvptx_record_fndecl (decl
);
586 /* SYM is a SYMBOL_REF. If it refers to an external function, record
590 nvptx_maybe_record_fnsym (rtx sym
)
592 tree decl
= SYMBOL_REF_DECL (sym
);
594 if (decl
&& TREE_CODE (decl
) == FUNCTION_DECL
&& DECL_EXTERNAL (decl
))
595 nvptx_record_needed_fndecl (decl
);
/* Emit code to initialize the REGNO predicate register to indicate
   whether we are not lane zero on the NAME axis.  The generated PTX
   reads the %tid.NAME special register into a scratch register and
   sets the predicate to "tid != 0".  The scratch register is scoped
   inside a { } block so its name cannot clash.  */

static void
nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
{
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
  fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
  fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
  fprintf (file, "\t}\n");
}
611 /* Implement ASM_DECLARE_FUNCTION_NAME. Writes the start of a ptx
612 function, including local var decls and copies from the arguments to
616 nvptx_declare_function_name (FILE *file
, const char *name
, const_tree decl
)
618 tree fntype
= TREE_TYPE (decl
);
619 tree result_type
= TREE_TYPE (fntype
);
623 write_fn_proto (s
, true, name
, decl
);
624 fprintf (file
, "%s", s
.str().c_str());
625 fprintf (file
, "{\n");
627 bool return_in_mem
= (TYPE_MODE (result_type
) != VOIDmode
628 && !RETURN_IN_REG_P (TYPE_MODE (result_type
)));
631 fprintf (file
, "\t.reg.u%d %%ar%d;\n", GET_MODE_BITSIZE (Pmode
), argno
);
632 fprintf (file
, "\tld.param.u%d %%ar%d, [%%in_ar%d];\n",
633 GET_MODE_BITSIZE (Pmode
), argno
, argno
);
637 /* Declare and initialize incoming arguments. */
638 tree args
= DECL_ARGUMENTS (decl
);
639 bool prototyped
= false;
640 if (TYPE_ARG_TYPES (fntype
))
642 args
= TYPE_ARG_TYPES (fntype
);
646 for (; args
!= NULL_TREE
; args
= TREE_CHAIN (args
))
648 tree type
= prototyped
? TREE_VALUE (args
) : TREE_TYPE (args
);
649 machine_mode mode
= TYPE_MODE (type
);
652 if (mode
== VOIDmode
)
655 if (!PASS_IN_REG_P (mode
, type
))
658 machine_mode split
= maybe_split_mode (mode
);
659 if (split
!= VOIDmode
)
664 else if (!prototyped
&& !AGGREGATE_TYPE_P (type
) && mode
== SFmode
)
667 mode
= arg_promotion (mode
);
670 fprintf (file
, "\t.reg%s %%ar%d;\n",
671 nvptx_ptx_type_from_mode (mode
, false), argno
);
672 fprintf (file
, "\tld.param%s %%ar%d, [%%in_ar%d];\n",
673 nvptx_ptx_type_from_mode (mode
, false), argno
, argno
);
678 /* C++11 ABI causes us to return a reference to the passed in
679 pointer for return_in_mem. */
680 if (cfun
->machine
->ret_reg_mode
!= VOIDmode
)
682 machine_mode mode
= arg_promotion
683 ((machine_mode
)cfun
->machine
->ret_reg_mode
);
684 fprintf (file
, "\t.reg%s %%retval;\n",
685 nvptx_ptx_type_from_mode (mode
, false));
688 if (stdarg_p (fntype
))
690 fprintf (file
, "\t.reg.u%d %%argp;\n", GET_MODE_BITSIZE (Pmode
));
691 fprintf (file
, "\tld.param.u%d %%argp, [%%in_argp];\n",
692 GET_MODE_BITSIZE (Pmode
));
695 fprintf (file
, "\t.reg.u%d %s;\n", GET_MODE_BITSIZE (Pmode
),
696 reg_names
[OUTGOING_STATIC_CHAIN_REGNUM
]);
698 /* Declare the pseudos we have as ptx registers. */
699 int maxregs
= max_reg_num ();
700 for (int i
= LAST_VIRTUAL_REGISTER
+ 1; i
< maxregs
; i
++)
702 if (regno_reg_rtx
[i
] != const0_rtx
)
704 machine_mode mode
= PSEUDO_REGNO_MODE (i
);
705 machine_mode split
= maybe_split_mode (mode
);
706 if (split
!= VOIDmode
)
708 fprintf (file
, "\t.reg%s %%r%d$%d;\n",
709 nvptx_ptx_type_from_mode (split
, true), i
, 0);
710 fprintf (file
, "\t.reg%s %%r%d$%d;\n",
711 nvptx_ptx_type_from_mode (split
, true), i
, 1);
714 fprintf (file
, "\t.reg%s %%r%d;\n",
715 nvptx_ptx_type_from_mode (mode
, true), i
);
719 /* The only reason we might be using outgoing args is if we call a stdargs
720 function. Allocate the space for this. If we called varargs functions
721 without passing any variadic arguments, we'll see a reference to outargs
722 even with a zero outgoing_args_size. */
723 HOST_WIDE_INT sz
= crtl
->outgoing_args_size
;
726 if (cfun
->machine
->has_call_with_varargs
)
728 fprintf (file
, "\t.reg.u%d %%outargs;\n"
729 "\t.local.align 8 .b8 %%outargs_ar["
730 HOST_WIDE_INT_PRINT_DEC
"];\n",
732 fprintf (file
, "\tcvta.local.u%d %%outargs, %%outargs_ar;\n",
736 if (cfun
->machine
->punning_buffer_size
> 0)
738 fprintf (file
, "\t.reg.u%d %%punbuffer;\n"
739 "\t.local.align 8 .b8 %%punbuffer_ar[%d];\n",
740 BITS_PER_WORD
, cfun
->machine
->punning_buffer_size
);
741 fprintf (file
, "\tcvta.local.u%d %%punbuffer, %%punbuffer_ar;\n",
745 /* Declare a local variable for the frame. */
746 sz
= get_frame_size ();
747 if (sz
> 0 || cfun
->machine
->has_call_with_sc
)
749 int alignment
= crtl
->stack_alignment_needed
/ BITS_PER_UNIT
;
751 fprintf (file
, "\t.reg.u%d %%frame;\n"
752 "\t.local.align %d .b8 %%farray[" HOST_WIDE_INT_PRINT_DEC
"];\n",
753 BITS_PER_WORD
, alignment
, sz
== 0 ? 1 : sz
);
754 fprintf (file
, "\tcvta.local.u%d %%frame, %%farray;\n",
758 /* Emit axis predicates. */
759 if (cfun
->machine
->axis_predicate
[0])
760 nvptx_init_axis_predicate (file
,
761 REGNO (cfun
->machine
->axis_predicate
[0]), "y");
762 if (cfun
->machine
->axis_predicate
[1])
763 nvptx_init_axis_predicate (file
,
764 REGNO (cfun
->machine
->axis_predicate
[1]), "x");
767 /* Output a return instruction. Also copy the return value to its outgoing
771 nvptx_output_return (void)
773 machine_mode mode
= (machine_mode
)cfun
->machine
->ret_reg_mode
;
775 if (mode
!= VOIDmode
)
777 mode
= arg_promotion (mode
);
778 fprintf (asm_out_file
, "\tst.param%s\t[%%out_retval], %%retval;\n",
779 nvptx_ptx_type_from_mode (mode
, false));
/* Terminate a function by writing a closing brace to FILE.  */

static void
nvptx_function_end (FILE *file)
{
  fprintf (file, "}\n");
}
793 /* Decide whether we can make a sibling call to a function. For ptx, we
797 nvptx_function_ok_for_sibcall (tree
, tree
)
802 /* Return Dynamic ReAlignment Pointer RTX. For PTX there isn't any. */
805 nvptx_get_drap_rtx (void)
810 /* Implement the TARGET_CALL_ARGS hook. Record information about one
811 argument to the next call. */
814 nvptx_call_args (rtx arg
, tree funtype
)
816 if (cfun
->machine
->start_call
== NULL_RTX
)
818 cfun
->machine
->call_args
= NULL
;
819 cfun
->machine
->funtype
= funtype
;
820 cfun
->machine
->start_call
= const0_rtx
;
825 rtx_expr_list
*args_so_far
= cfun
->machine
->call_args
;
827 cfun
->machine
->call_args
= alloc_EXPR_LIST (VOIDmode
, arg
, args_so_far
);
830 /* Implement the corresponding END_CALL_ARGS hook. Clear and free the
831 information we recorded. */
834 nvptx_end_call_args (void)
836 cfun
->machine
->start_call
= NULL_RTX
;
837 free_EXPR_LIST_list (&cfun
->machine
->call_args
);
840 /* Emit the sequence for a call to ADDRESS, setting RETVAL. Keep
841 track of whether calls involving static chains or varargs were seen
842 in the current function.
843 For libcalls, maintain a hash table of decls we have seen, and
844 record a function decl for later when encountering a new one. */
847 nvptx_expand_call (rtx retval
, rtx address
)
850 rtx callee
= XEXP (address
, 0);
853 rtx varargs
= NULL_RTX
;
854 unsigned parallel
= 0;
856 for (t
= cfun
->machine
->call_args
; t
; t
= XEXP (t
, 1))
859 if (!call_insn_operand (callee
, Pmode
))
861 callee
= force_reg (Pmode
, callee
);
862 address
= change_address (address
, QImode
, callee
);
865 if (GET_CODE (callee
) == SYMBOL_REF
)
867 tree decl
= SYMBOL_REF_DECL (callee
);
868 if (decl
!= NULL_TREE
)
870 if (DECL_STATIC_CHAIN (decl
))
871 cfun
->machine
->has_call_with_sc
= true;
873 tree attr
= get_oacc_fn_attrib (decl
);
876 tree dims
= TREE_VALUE (attr
);
878 parallel
= GOMP_DIM_MASK (GOMP_DIM_MAX
) - 1;
879 for (int ix
= 0; ix
!= GOMP_DIM_MAX
; ix
++)
881 if (TREE_PURPOSE (dims
)
882 && !integer_zerop (TREE_PURPOSE (dims
)))
884 /* Not on this axis. */
885 parallel
^= GOMP_DIM_MASK (ix
);
886 dims
= TREE_CHAIN (dims
);
892 if (cfun
->machine
->funtype
893 /* It's possible to construct testcases where we call a variable.
894 See compile/20020129-1.c. stdarg_p will crash so avoid calling it
896 && (TREE_CODE (cfun
->machine
->funtype
) == FUNCTION_TYPE
897 || TREE_CODE (cfun
->machine
->funtype
) == METHOD_TYPE
)
898 && stdarg_p (cfun
->machine
->funtype
))
900 varargs
= gen_reg_rtx (Pmode
);
901 emit_move_insn (varargs
, stack_pointer_rtx
);
902 cfun
->machine
->has_call_with_varargs
= true;
904 vec
= rtvec_alloc (nargs
+ 1 + (varargs
? 1 : 0));
905 pat
= gen_rtx_PARALLEL (VOIDmode
, vec
);
909 rtx tmp_retval
= retval
;
910 t
= gen_rtx_CALL (VOIDmode
, address
, const0_rtx
);
911 if (retval
!= NULL_RTX
)
913 if (!nvptx_register_operand (retval
, GET_MODE (retval
)))
914 tmp_retval
= gen_reg_rtx (GET_MODE (retval
));
915 t
= gen_rtx_SET (tmp_retval
, t
);
917 XVECEXP (pat
, 0, vec_pos
++) = t
;
919 /* Construct the call insn, including a USE for each argument pseudo
920 register. These will be used when printing the insn. */
921 for (rtx arg
= cfun
->machine
->call_args
; arg
; arg
= XEXP (arg
, 1))
923 rtx this_arg
= XEXP (arg
, 0);
924 XVECEXP (pat
, 0, vec_pos
++) = gen_rtx_USE (VOIDmode
, this_arg
);
928 XVECEXP (pat
, 0, vec_pos
++) = gen_rtx_USE (VOIDmode
, varargs
);
930 gcc_assert (vec_pos
= XVECLEN (pat
, 0));
932 nvptx_emit_forking (parallel
, true);
933 emit_call_insn (pat
);
934 nvptx_emit_joining (parallel
, true);
936 if (tmp_retval
!= retval
)
937 emit_move_insn (retval
, tmp_retval
);
940 /* Implement TARGET_FUNCTION_ARG. */
943 nvptx_function_arg (cumulative_args_t
, machine_mode mode
,
944 const_tree
, bool named
)
946 if (mode
== VOIDmode
)
950 return gen_reg_rtx (mode
);
954 /* Implement TARGET_FUNCTION_INCOMING_ARG. */
957 nvptx_function_incoming_arg (cumulative_args_t cum_v
, machine_mode mode
,
958 const_tree
, bool named
)
960 CUMULATIVE_ARGS
*cum
= get_cumulative_args (cum_v
);
961 if (mode
== VOIDmode
)
967 /* No need to deal with split modes here, the only case that can
968 happen is complex modes and those are dealt with by
969 TARGET_SPLIT_COMPLEX_ARG. */
970 return gen_rtx_UNSPEC (mode
,
971 gen_rtvec (1, GEN_INT (cum
->count
)),
975 /* Implement TARGET_FUNCTION_ARG_ADVANCE. */
978 nvptx_function_arg_advance (cumulative_args_t cum_v
,
979 machine_mode
ARG_UNUSED (mode
),
980 const_tree
ARG_UNUSED (type
),
981 bool ARG_UNUSED (named
))
983 CUMULATIVE_ARGS
*cum
= get_cumulative_args (cum_v
);
987 /* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.
989 For nvptx, we know how to handle functions declared as stdarg: by
990 passing an extra pointer to the unnamed arguments. However, the
991 Fortran frontend can produce a different situation, where a
992 function pointer is declared with no arguments, but the actual
993 function and calls to it take more arguments. In that case, we
994 want to ensure the call matches the definition of the function. */
997 nvptx_strict_argument_naming (cumulative_args_t cum_v
)
999 CUMULATIVE_ARGS
*cum
= get_cumulative_args (cum_v
);
1000 return cum
->fntype
== NULL_TREE
|| stdarg_p (cum
->fntype
);
1003 /* Implement TARGET_FUNCTION_ARG_BOUNDARY. */
1006 nvptx_function_arg_boundary (machine_mode mode
, const_tree type
)
1008 unsigned int boundary
= type
? TYPE_ALIGN (type
) : GET_MODE_BITSIZE (mode
);
1010 if (boundary
> BITS_PER_WORD
)
1011 return 2 * BITS_PER_WORD
;
1013 if (mode
== BLKmode
)
1015 HOST_WIDE_INT size
= int_size_in_bytes (type
);
1017 return 2 * BITS_PER_WORD
;
1018 if (boundary
< BITS_PER_WORD
)
1021 return BITS_PER_WORD
;
1023 return 2 * BITS_PER_UNIT
;
1029 /* TARGET_FUNCTION_VALUE implementation. Returns an RTX representing the place
1030 where function FUNC returns or receives a value of data type TYPE. */
1033 nvptx_function_value (const_tree type
, const_tree func ATTRIBUTE_UNUSED
,
1036 int unsignedp
= TYPE_UNSIGNED (type
);
1037 machine_mode orig_mode
= TYPE_MODE (type
);
1038 machine_mode mode
= promote_function_mode (type
, orig_mode
,
1039 &unsignedp
, NULL_TREE
, 1);
1041 return gen_rtx_REG (mode
, NVPTX_RETURN_REGNUM
);
1042 if (cfun
->machine
->start_call
== NULL_RTX
)
1043 /* Pretend to return in a hard reg for early uses before pseudos can be
1045 return gen_rtx_REG (mode
, NVPTX_RETURN_REGNUM
);
1046 return gen_reg_rtx (mode
);
1049 /* Implement TARGET_LIBCALL_VALUE. */
1052 nvptx_libcall_value (machine_mode mode
, const_rtx
)
1054 if (cfun
->machine
->start_call
== NULL_RTX
)
1055 /* Pretend to return in a hard reg for early uses before pseudos can be
1057 return gen_rtx_REG (mode
, NVPTX_RETURN_REGNUM
);
1058 return gen_reg_rtx (mode
);
1061 /* Implement TARGET_FUNCTION_VALUE_REGNO_P. */
1064 nvptx_function_value_regno_p (const unsigned int regno
)
1066 return regno
== NVPTX_RETURN_REGNUM
;
1069 /* Types with a mode other than those supported by the machine are passed by
1070 reference in memory. */
1073 nvptx_pass_by_reference (cumulative_args_t
, machine_mode mode
,
1074 const_tree type
, bool)
1076 return !PASS_IN_REG_P (mode
, type
);
1079 /* Implement TARGET_RETURN_IN_MEMORY. */
1082 nvptx_return_in_memory (const_tree type
, const_tree
)
1084 machine_mode mode
= TYPE_MODE (type
);
1085 if (!RETURN_IN_REG_P (mode
))
1090 /* Implement TARGET_PROMOTE_FUNCTION_MODE. */
1093 nvptx_promote_function_mode (const_tree type
, machine_mode mode
,
1095 const_tree funtype
, int for_return
)
1097 if (type
== NULL_TREE
)
1100 return promote_mode (type
, mode
, punsignedp
);
1101 /* For K&R-style functions, try to match the language promotion rules to
1102 minimize type mismatches at assembly time. */
1103 if (TYPE_ARG_TYPES (funtype
) == NULL_TREE
1104 && type
!= NULL_TREE
1105 && !AGGREGATE_TYPE_P (type
))
1109 mode
= arg_promotion (mode
);
1115 /* Implement TARGET_STATIC_CHAIN. */
1118 nvptx_static_chain (const_tree fndecl
, bool incoming_p
)
1120 if (!DECL_STATIC_CHAIN (fndecl
))
1124 return gen_rtx_REG (Pmode
, STATIC_CHAIN_REGNUM
);
1126 return gen_rtx_REG (Pmode
, OUTGOING_STATIC_CHAIN_REGNUM
);
1129 /* Emit a comparison COMPARE, and return the new test to be used in the
1133 nvptx_expand_compare (rtx compare
)
1135 rtx pred
= gen_reg_rtx (BImode
);
1136 rtx cmp
= gen_rtx_fmt_ee (GET_CODE (compare
), BImode
,
1137 XEXP (compare
, 0), XEXP (compare
, 1));
1138 emit_insn (gen_rtx_SET (pred
, cmp
));
1139 return gen_rtx_NE (BImode
, pred
, const0_rtx
);
1142 /* Expand the oacc fork & join primitive into ptx-required unspecs. */
1145 nvptx_expand_oacc_fork (unsigned mode
)
1147 nvptx_emit_forking (GOMP_DIM_MASK (mode
), false);
1151 nvptx_expand_oacc_join (unsigned mode
)
1153 nvptx_emit_joining (GOMP_DIM_MASK (mode
), false);
1156 /* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
1160 nvptx_gen_unpack (rtx dst0
, rtx dst1
, rtx src
)
1164 switch (GET_MODE (src
))
1167 res
= gen_unpackdisi2 (dst0
, dst1
, src
);
1170 res
= gen_unpackdfsi2 (dst0
, dst1
, src
);
1172 default: gcc_unreachable ();
1177 /* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
1181 nvptx_gen_pack (rtx dst
, rtx src0
, rtx src1
)
1185 switch (GET_MODE (dst
))
1188 res
= gen_packsidi2 (dst
, src0
, src1
);
1191 res
= gen_packsidf2 (dst
, src0
, src1
);
1193 default: gcc_unreachable ();
1198 /* Generate an instruction or sequence to broadcast register REG
1199 across the vectors of a single warp. */
1202 nvptx_gen_shuffle (rtx dst
, rtx src
, rtx idx
, unsigned kind
)
1206 switch (GET_MODE (dst
))
1209 res
= gen_nvptx_shufflesi (dst
, src
, idx
, GEN_INT (kind
));
1212 res
= gen_nvptx_shufflesf (dst
, src
, idx
, GEN_INT (kind
));
1217 rtx tmp0
= gen_reg_rtx (SImode
);
1218 rtx tmp1
= gen_reg_rtx (SImode
);
1221 emit_insn (nvptx_gen_unpack (tmp0
, tmp1
, src
));
1222 emit_insn (nvptx_gen_shuffle (tmp0
, tmp0
, idx
, kind
));
1223 emit_insn (nvptx_gen_shuffle (tmp1
, tmp1
, idx
, kind
));
1224 emit_insn (nvptx_gen_pack (dst
, tmp0
, tmp1
));
1231 rtx tmp
= gen_reg_rtx (SImode
);
1234 emit_insn (gen_sel_truesi (tmp
, src
, GEN_INT (1), const0_rtx
));
1235 emit_insn (nvptx_gen_shuffle (tmp
, tmp
, idx
, kind
));
1236 emit_insn (gen_rtx_SET (dst
, gen_rtx_NE (BImode
, tmp
, const0_rtx
)));
1248 /* Generate an instruction or sequence to broadcast register REG
1249 across the vectors of a single warp. */
1252 nvptx_gen_vcast (rtx reg
)
1254 return nvptx_gen_shuffle (reg
, reg
, const0_rtx
, SHUFFLE_IDX
);
1257 /* Structure used when generating a worker-level spill or fill. */
1261 rtx base
; /* Register holding base addr of buffer. */
1262 rtx ptr
; /* Iteration var, if needed. */
1263 unsigned offset
; /* Offset into worker buffer. */
1266 /* Direction of the spill/fill and looping setup/teardown indicator. */
1272 PM_loop_begin
= 1 << 2,
1273 PM_loop_end
= 1 << 3,
1275 PM_read_write
= PM_read
| PM_write
1278 /* Generate instruction(s) to spill or fill register REG to/from the
1279 worker broadcast array. PM indicates what is to be done, REP
1280 how many loop iterations will be executed (0 for not a loop). */
1283 nvptx_gen_wcast (rtx reg
, propagate_mask pm
, unsigned rep
, wcast_data_t
*data
)
1286 machine_mode mode
= GET_MODE (reg
);
1292 rtx tmp
= gen_reg_rtx (SImode
);
1296 emit_insn (gen_sel_truesi (tmp
, reg
, GEN_INT (1), const0_rtx
));
1297 emit_insn (nvptx_gen_wcast (tmp
, pm
, rep
, data
));
1299 emit_insn (gen_rtx_SET (reg
, gen_rtx_NE (BImode
, tmp
, const0_rtx
)));
1307 rtx addr
= data
->ptr
;
1311 unsigned align
= GET_MODE_ALIGNMENT (mode
) / BITS_PER_UNIT
;
1313 if (align
> worker_bcast_align
)
1314 worker_bcast_align
= align
;
1315 data
->offset
= (data
->offset
+ align
- 1) & ~(align
- 1);
1318 addr
= gen_rtx_PLUS (Pmode
, addr
, GEN_INT (data
->offset
));
1321 addr
= gen_rtx_MEM (mode
, addr
);
1322 addr
= gen_rtx_UNSPEC (mode
, gen_rtvec (1, addr
), UNSPEC_SHARED_DATA
);
1324 res
= gen_rtx_SET (addr
, reg
);
1325 else if (pm
== PM_write
)
1326 res
= gen_rtx_SET (reg
, addr
);
1332 /* We're using a ptr, increment it. */
1336 emit_insn (gen_adddi3 (data
->ptr
, data
->ptr
,
1337 GEN_INT (GET_MODE_SIZE (GET_MODE (reg
)))));
1343 data
->offset
+= rep
* GET_MODE_SIZE (GET_MODE (reg
));
1350 /* When loading an operand ORIG_OP, verify whether an address space
1351 conversion to generic is required, and if so, perform it. Check
1352 for SYMBOL_REFs and record them if needed. Return either the
1353 original operand, or the converted one. */
1356 nvptx_maybe_convert_symbolic_operand (rtx op
)
1358 if (GET_MODE (op
) != Pmode
)
1362 if (GET_CODE (sym
) == CONST
)
1363 sym
= XEXP (sym
, 0);
1364 if (GET_CODE (sym
) == PLUS
)
1365 sym
= XEXP (sym
, 0);
1367 if (GET_CODE (sym
) != SYMBOL_REF
)
1370 nvptx_maybe_record_fnsym (sym
);
1372 addr_space_t as
= nvptx_addr_space_from_sym (sym
);
1373 if (as
== ADDR_SPACE_GENERIC
)
1377 code
= (as
== ADDR_SPACE_GLOBAL
? UNSPEC_FROM_GLOBAL
1378 : as
== ADDR_SPACE_LOCAL
? UNSPEC_FROM_LOCAL
1379 : as
== ADDR_SPACE_SHARED
? UNSPEC_FROM_SHARED
1380 : as
== ADDR_SPACE_CONST
? UNSPEC_FROM_CONST
1381 : UNSPEC_FROM_PARAM
);
1383 rtx dest
= gen_reg_rtx (Pmode
);
1384 emit_insn (gen_rtx_SET (dest
,
1385 gen_rtx_UNSPEC (Pmode
, gen_rtvec (1, op
), code
)));
1389 /* Returns true if X is a valid address for use in a memory reference. */
1392 nvptx_legitimate_address_p (machine_mode
, rtx x
, bool)
1394 enum rtx_code code
= GET_CODE (x
);
1402 if (REG_P (XEXP (x
, 0)) && CONST_INT_P (XEXP (x
, 1)))
1416 /* Implement HARD_REGNO_MODE_OK. We barely use hard regs, but we want
1417 to ensure that the return register's mode isn't changed. */
1420 nvptx_hard_regno_mode_ok (int regno
, machine_mode mode
)
1422 if (regno
!= NVPTX_RETURN_REGNUM
1423 || cfun
== NULL
|| cfun
->machine
->ret_reg_mode
== VOIDmode
)
1425 return mode
== cfun
->machine
->ret_reg_mode
;
1428 /* Convert an address space AS to the corresponding ptx string. */
1431 nvptx_section_from_addr_space (addr_space_t as
)
1435 case ADDR_SPACE_CONST
:
1438 case ADDR_SPACE_GLOBAL
:
1441 case ADDR_SPACE_SHARED
:
1444 case ADDR_SPACE_GENERIC
:
1452 /* Determine whether DECL goes into .const or .global. */
1455 nvptx_section_for_decl (const_tree decl
)
1457 bool is_const
= (CONSTANT_CLASS_P (decl
)
1458 || TREE_CODE (decl
) == CONST_DECL
1459 || TREE_READONLY (decl
));
1467 /* Machinery to output constant initializers. When beginning an initializer,
1468 we decide on a chunk size (which is visible in ptx in the type used), and
1469 then all initializer data is buffered until a chunk is filled and ready to
1472 /* Used when assembling integers to ensure data is emitted in
1473 pieces whose size matches the declaration we printed. */
1474 static unsigned int decl_chunk_size
;
1475 static machine_mode decl_chunk_mode
;
1476 /* Used in the same situation, to keep track of the byte offset
1477 into the initializer. */
1478 static unsigned HOST_WIDE_INT decl_offset
;
1479 /* The initializer part we are currently processing. */
1480 static HOST_WIDE_INT init_part
;
1481 /* The total size of the object. */
1482 static unsigned HOST_WIDE_INT object_size
;
1483 /* True if we found a skip extending to the end of the object. Used to
1484 assert that no data follows. */
1485 static bool object_finished
;
1487 /* Write the necessary separator string to begin a new initializer value. */
1490 begin_decl_field (void)
1492 /* We never see decl_offset at zero by the time we get here. */
1493 if (decl_offset
== decl_chunk_size
)
1494 fprintf (asm_out_file
, " = { ");
1496 fprintf (asm_out_file
, ", ");
1499 /* Output the currently stored chunk as an initializer value. */
1502 output_decl_chunk (void)
1504 begin_decl_field ();
1505 output_address (VOIDmode
, gen_int_mode (init_part
, decl_chunk_mode
));
1509 /* Add value VAL sized SIZE to the data we're emitting, and keep writing
1510 out chunks as they fill up. */
1513 nvptx_assemble_value (HOST_WIDE_INT val
, unsigned int size
)
1515 unsigned HOST_WIDE_INT chunk_offset
= decl_offset
% decl_chunk_size
;
1516 gcc_assert (!object_finished
);
1519 int this_part
= size
;
1520 if (chunk_offset
+ this_part
> decl_chunk_size
)
1521 this_part
= decl_chunk_size
- chunk_offset
;
1522 HOST_WIDE_INT val_part
;
1523 HOST_WIDE_INT mask
= 2;
1524 mask
<<= this_part
* BITS_PER_UNIT
- 1;
1525 val_part
= val
& (mask
- 1);
1526 init_part
|= val_part
<< (BITS_PER_UNIT
* chunk_offset
);
1527 val
>>= BITS_PER_UNIT
* this_part
;
1529 decl_offset
+= this_part
;
1530 if (decl_offset
% decl_chunk_size
== 0)
1531 output_decl_chunk ();
1537 /* Target hook for assembling integer object X of size SIZE. */
1540 nvptx_assemble_integer (rtx x
, unsigned int size
, int ARG_UNUSED (aligned_p
))
1542 HOST_WIDE_INT val
= 0;
1544 switch (GET_CODE (x
))
1551 nvptx_assemble_value (val
, size
);
1556 gcc_assert (GET_CODE (x
) == PLUS
);
1557 val
= INTVAL (XEXP (x
, 1));
1559 gcc_assert (GET_CODE (x
) == SYMBOL_REF
);
1563 gcc_assert (size
= decl_chunk_size
);
1564 if (decl_offset
% decl_chunk_size
!= 0)
1565 sorry ("cannot emit unaligned pointers in ptx assembly");
1566 decl_offset
+= size
;
1567 begin_decl_field ();
1569 nvptx_maybe_record_fnsym (x
);
1570 fprintf (asm_out_file
, "generic(");
1571 output_address (VOIDmode
, x
);
1572 fprintf (asm_out_file
, ")");
1575 fprintf (asm_out_file
, " + " HOST_WIDE_INT_PRINT_DEC
, val
);
1582 /* Output SIZE zero bytes. We ignore the FILE argument since the
1583 functions we're calling to perform the output just use
1587 nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size
)
1589 if (decl_offset
+ size
>= object_size
)
1591 if (decl_offset
% decl_chunk_size
!= 0)
1592 nvptx_assemble_value (0, decl_chunk_size
);
1593 object_finished
= true;
1597 while (size
> decl_chunk_size
)
1599 nvptx_assemble_value (0, decl_chunk_size
);
1600 size
-= decl_chunk_size
;
1603 nvptx_assemble_value (0, 1);
1606 /* Output a string STR with length SIZE. As in nvptx_output_skip we
1607 ignore the FILE arg. */
1610 nvptx_output_ascii (FILE *, const char *str
, unsigned HOST_WIDE_INT size
)
1612 for (unsigned HOST_WIDE_INT i
= 0; i
< size
; i
++)
1613 nvptx_assemble_value (str
[i
], 1);
1616 /* Called when the initializer for a decl has been completely output through
1617 combinations of the three functions above. */
1620 nvptx_assemble_decl_end (void)
1622 if (decl_offset
!= 0)
1624 if (!object_finished
&& decl_offset
% decl_chunk_size
!= 0)
1625 nvptx_assemble_value (0, decl_chunk_size
);
1627 fprintf (asm_out_file
, " }");
1629 fprintf (asm_out_file
, ";\n");
1632 /* Start a declaration of a variable of TYPE with NAME to
1633 FILE. IS_PUBLIC says whether this will be externally visible.
1634 Here we just write the linker hint and decide on the chunk size
1638 init_output_initializer (FILE *file
, const char *name
, const_tree type
,
1641 fprintf (file
, "\n// BEGIN%s VAR DEF: ", is_public
? " GLOBAL" : "");
1642 assemble_name_raw (file
, name
);
1645 if (TREE_CODE (type
) == ARRAY_TYPE
)
1646 type
= TREE_TYPE (type
);
1647 int sz
= int_size_in_bytes (type
);
1648 if ((TREE_CODE (type
) != INTEGER_TYPE
1649 && TREE_CODE (type
) != ENUMERAL_TYPE
1650 && TREE_CODE (type
) != REAL_TYPE
)
1652 || sz
> HOST_BITS_PER_WIDE_INT
)
1653 type
= ptr_type_node
;
1654 decl_chunk_size
= int_size_in_bytes (type
);
1655 decl_chunk_mode
= int_mode_for_mode (TYPE_MODE (type
));
1658 object_finished
= false;
1661 /* Implement TARGET_ASM_DECLARE_CONSTANT_NAME. Begin the process of
1662 writing a constant variable EXP with NAME and SIZE and its
1663 initializer to FILE. */
1666 nvptx_asm_declare_constant_name (FILE *file
, const char *name
,
1667 const_tree exp
, HOST_WIDE_INT size
)
1669 tree type
= TREE_TYPE (exp
);
1670 init_output_initializer (file
, name
, type
, false);
1671 fprintf (file
, "\t.const .align %d .u%d ",
1672 TYPE_ALIGN (TREE_TYPE (exp
)) / BITS_PER_UNIT
,
1673 decl_chunk_size
* BITS_PER_UNIT
);
1674 assemble_name (file
, name
);
1675 fprintf (file
, "[" HOST_WIDE_INT_PRINT_DEC
"]",
1676 (size
+ decl_chunk_size
- 1) / decl_chunk_size
);
1680 /* Implement the ASM_DECLARE_OBJECT_NAME macro. Used to start writing
1681 a variable DECL with NAME to FILE. */
1684 nvptx_declare_object_name (FILE *file
, const char *name
, const_tree decl
)
1686 if (decl
&& DECL_SIZE (decl
))
1688 tree type
= TREE_TYPE (decl
);
1689 unsigned HOST_WIDE_INT size
;
1691 init_output_initializer (file
, name
, type
, TREE_PUBLIC (decl
));
1692 size
= tree_to_uhwi (DECL_SIZE_UNIT (decl
));
1693 const char *section
= nvptx_section_for_decl (decl
);
1694 fprintf (file
, "\t%s%s .align %d .u%d ",
1695 !TREE_PUBLIC (decl
) ? ""
1696 : DECL_WEAK (decl
) ? ".weak" : ".visible",
1697 section
, DECL_ALIGN (decl
) / BITS_PER_UNIT
,
1698 decl_chunk_size
* BITS_PER_UNIT
);
1699 assemble_name (file
, name
);
1701 fprintf (file
, "[" HOST_WIDE_INT_PRINT_DEC
"]",
1702 (size
+ decl_chunk_size
- 1) / decl_chunk_size
);
1704 object_finished
= true;
/* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing.  PTX has no
   separate globalization directive; visibility is emitted as part of
   the symbol's declaration instead.  */

static void
nvptx_globalize_label (FILE *, const char *)
{
}
1716 /* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL. Write an extern
1717 declaration only for variable DECL with NAME to FILE. */
1719 nvptx_assemble_undefined_decl (FILE *file
, const char *name
, const_tree decl
)
1721 if (TREE_CODE (decl
) != VAR_DECL
)
1723 const char *section
= nvptx_section_for_decl (decl
);
1724 fprintf (file
, "\n// BEGIN%s VAR DECL: ",
1725 TREE_PUBLIC (decl
) ? " GLOBAL" : "");
1726 assemble_name_raw (file
, name
);
1728 HOST_WIDE_INT size
= int_size_in_bytes (TREE_TYPE (decl
));
1729 fprintf (file
, ".extern %s .b8 ", section
);
1730 assemble_name_raw (file
, name
);
1732 fprintf (file
, "[" HOST_WIDE_INT_PRINT_DEC
"]", size
);
1733 fprintf (file
, ";\n\n");
1736 /* Output INSN, which is a call to CALLEE with result RESULT. For ptx, this
1737 involves writing .param declarations and in/out copies into them. For
1738 indirect calls, also write the .callprototype. */
1741 nvptx_output_call_insn (rtx_insn
*insn
, rtx result
, rtx callee
)
1745 bool needs_tgt
= register_operand (callee
, Pmode
);
1746 rtx pat
= PATTERN (insn
);
1747 int arg_end
= XVECLEN (pat
, 0);
1748 tree decl
= NULL_TREE
;
1750 fprintf (asm_out_file
, "\t{\n");
1752 fprintf (asm_out_file
, "\t\t.param%s %%retval_in;\n",
1753 nvptx_ptx_type_from_mode (arg_promotion (GET_MODE (result
)),
1756 /* Ensure we have a ptx declaration in the output if necessary. */
1757 if (GET_CODE (callee
) == SYMBOL_REF
)
1759 decl
= SYMBOL_REF_DECL (callee
);
1761 || (DECL_EXTERNAL (decl
) && !TYPE_ARG_TYPES (TREE_TYPE (decl
))))
1762 nvptx_record_libfunc (callee
, result
, pat
);
1763 else if (DECL_EXTERNAL (decl
))
1764 nvptx_record_fndecl (decl
);
1769 ASM_GENERATE_INTERNAL_LABEL (buf
, "LCT", labelno
);
1771 ASM_OUTPUT_LABEL (asm_out_file
, buf
);
1772 std::stringstream s
;
1773 write_fn_proto_from_insn (s
, NULL
, result
, pat
);
1774 fputs (s
.str().c_str(), asm_out_file
);
1777 for (int argno
= 1; argno
< arg_end
; argno
++)
1779 rtx t
= XEXP (XVECEXP (pat
, 0, argno
), 0);
1780 machine_mode mode
= GET_MODE (t
);
1782 /* Mode splitting has already been done. */
1783 fprintf (asm_out_file
, "\t\t.param%s %%out_arg%d%s;\n",
1784 nvptx_ptx_type_from_mode (mode
, false), argno
,
1785 mode
== QImode
|| mode
== HImode
? "[1]" : "");
1786 fprintf (asm_out_file
, "\t\tst.param%s [%%out_arg%d], %%r%d;\n",
1787 nvptx_ptx_type_from_mode (mode
, false), argno
,
1791 fprintf (asm_out_file
, "\t\tcall ");
1792 if (result
!= NULL_RTX
)
1793 fprintf (asm_out_file
, "(%%retval_in), ");
1797 const char *name
= get_fnname_from_decl (decl
);
1798 name
= nvptx_name_replacement (name
);
1799 assemble_name (asm_out_file
, name
);
1802 output_address (VOIDmode
, callee
);
1804 const char *open
= "(";
1805 for (int argno
= 1; argno
< arg_end
; argno
++)
1807 fprintf (asm_out_file
, ", %s%%out_arg%d", open
, argno
);
1810 if (decl
&& DECL_STATIC_CHAIN (decl
))
1812 fprintf (asm_out_file
, ", %s%s", open
,
1813 reg_names
[OUTGOING_STATIC_CHAIN_REGNUM
]);
1817 fprintf (asm_out_file
, ")");
1821 fprintf (asm_out_file
, ", ");
1822 assemble_name (asm_out_file
, buf
);
1824 fprintf (asm_out_file
, ";\n");
1826 return result
!= NULL_RTX
? "\tld.param%t0\t%0, [%%retval_in];\n\t}" : "}";
/* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P.  Only '.' (instruction
   predicate) and '#' (rounding mode) are handled as punctuation codes
   by nvptx_print_operand.  */

static bool
nvptx_print_operand_punct_valid_p (unsigned char c)
{
  return c == '.' || c == '#';
}
1837 static void nvptx_print_operand (FILE *, rtx
, int);
1839 /* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE. */
1842 nvptx_print_address_operand (FILE *file
, rtx x
, machine_mode
)
1845 if (GET_CODE (x
) == CONST
)
1847 switch (GET_CODE (x
))
1851 output_address (VOIDmode
, XEXP (x
, 0));
1852 fprintf (file
, "+");
1853 output_address (VOIDmode
, off
);
1858 output_addr_const (file
, x
);
1862 gcc_assert (GET_CODE (x
) != MEM
);
1863 nvptx_print_operand (file
, x
, 0);
1868 /* Write assembly language output for the address ADDR to FILE. */
1871 nvptx_print_operand_address (FILE *file
, machine_mode mode
, rtx addr
)
1873 nvptx_print_address_operand (file
, addr
, mode
);
1876 /* Print an operand, X, to FILE, with an optional modifier in CODE.
1879 . -- print the predicate for the instruction or an empty string for an
1881 # -- print a rounding mode for the instruction
1883 A -- print an address space identifier for a MEM
1884 c -- print an opcode suffix for a comparison operator, including a type code
1885 f -- print a full reg even for something that must always be split
1886 S -- print a shuffle kind specified by CONST_INT
1887 t -- print a type opcode suffix, promoting QImode to 32 bits
1888 T -- print a type size in bits
1889 u -- print a type opcode suffix without promotions. */
1892 nvptx_print_operand (FILE *file
, rtx x
, int code
)
1895 machine_mode op_mode
;
1899 x
= current_insn_predicate
;
1902 unsigned int regno
= REGNO (XEXP (x
, 0));
1904 if (GET_CODE (x
) == EQ
)
1906 fputs (reg_names
[regno
], file
);
1911 else if (code
== '#')
1913 fputs (".rn", file
);
1917 enum rtx_code x_code
= GET_CODE (x
);
1923 addr_space_t as
= ADDR_SPACE_GENERIC
;
1924 rtx sym
= XEXP (x
, 0);
1926 if (GET_CODE (sym
) == CONST
)
1927 sym
= XEXP (sym
, 0);
1928 if (GET_CODE (sym
) == PLUS
)
1929 sym
= XEXP (sym
, 0);
1931 if (GET_CODE (sym
) == SYMBOL_REF
)
1932 as
= nvptx_addr_space_from_sym (sym
);
1934 fputs (nvptx_section_from_addr_space (as
), file
);
1939 op_mode
= nvptx_underlying_object_mode (x
);
1940 fprintf (file
, "%s", nvptx_ptx_type_from_mode (op_mode
, true));
1944 op_mode
= nvptx_underlying_object_mode (x
);
1945 fprintf (file
, "%s", nvptx_ptx_type_from_mode (op_mode
, false));
1950 unsigned kind
= UINTVAL (x
);
1951 static const char *const kinds
[] =
1952 {"up", "down", "bfly", "idx"};
1953 fprintf (file
, ".%s", kinds
[kind
]);
1958 fprintf (file
, "%d", GET_MODE_BITSIZE (GET_MODE (x
)));
1962 fprintf (file
, "@");
1966 fprintf (file
, "@!");
1970 op_mode
= GET_MODE (XEXP (x
, 0));
1974 fputs (".eq", file
);
1977 if (FLOAT_MODE_P (op_mode
))
1978 fputs (".neu", file
);
1980 fputs (".ne", file
);
1983 fputs (".le", file
);
1986 fputs (".ge", file
);
1989 fputs (".lt", file
);
1992 fputs (".gt", file
);
1995 fputs (".ls", file
);
1998 fputs (".hs", file
);
2001 fputs (".lo", file
);
2004 fputs (".hi", file
);
2007 fputs (".ne", file
);
2010 fputs (".equ", file
);
2013 fputs (".leu", file
);
2016 fputs (".geu", file
);
2019 fputs (".ltu", file
);
2022 fputs (".gtu", file
);
2025 fputs (".nan", file
);
2028 fputs (".num", file
);
2033 if (FLOAT_MODE_P (op_mode
)
2034 || x_code
== EQ
|| x_code
== NE
2035 || x_code
== GEU
|| x_code
== GTU
2036 || x_code
== LEU
|| x_code
== LTU
)
2037 fputs (nvptx_ptx_type_from_mode (op_mode
, true), file
);
2039 fprintf (file
, ".s%d", GET_MODE_BITSIZE (op_mode
));
2050 if (HARD_REGISTER_P (x
))
2051 fprintf (file
, "%s", reg_names
[REGNO (x
)]);
2053 fprintf (file
, "%%r%d", REGNO (x
));
2054 if (code
!= 'f' && maybe_split_mode (GET_MODE (x
)) != VOIDmode
)
2056 gcc_assert (GET_CODE (orig_x
) == SUBREG
2057 && maybe_split_mode (GET_MODE (orig_x
)) == VOIDmode
);
2058 fprintf (file
, "$%d", SUBREG_BYTE (orig_x
) / UNITS_PER_WORD
);
2064 nvptx_print_address_operand (file
, XEXP (x
, 0), GET_MODE (x
));
2069 output_addr_const (file
, x
);
2075 /* We could use output_addr_const, but that can print things like
2076 "x-8", which breaks ptxas. Need to ensure it is output as
2078 nvptx_print_address_operand (file
, x
, VOIDmode
);
2083 real_to_target (vals
, CONST_DOUBLE_REAL_VALUE (x
), GET_MODE (x
));
2084 vals
[0] &= 0xffffffff;
2085 vals
[1] &= 0xffffffff;
2086 if (GET_MODE (x
) == SFmode
)
2087 fprintf (file
, "0f%08lx", vals
[0]);
2089 fprintf (file
, "0d%08lx%08lx", vals
[1], vals
[0]);
2093 output_addr_const (file
, x
);
2098 /* Record replacement regs used to deal with subreg operands. */
2101 rtx replacement
[MAX_RECOG_OPERANDS
];
2107 /* Allocate or reuse a replacement in R and return the rtx. */
2110 get_replacement (struct reg_replace
*r
)
2112 if (r
->n_allocated
== r
->n_in_use
)
2113 r
->replacement
[r
->n_allocated
++] = gen_reg_rtx (r
->mode
);
2114 return r
->replacement
[r
->n_in_use
++];
2117 /* Clean up subreg operands. In ptx assembly, everything is typed, and
2118 the presence of subregs would break the rules for most instructions.
2119 Replace them with a suitable new register of the right size, plus
2120 conversion copyin/copyout instructions. */
2123 nvptx_reorg_subreg (void)
2125 struct reg_replace qiregs
, hiregs
, siregs
, diregs
;
2126 rtx_insn
*insn
, *next
;
2128 qiregs
.n_allocated
= 0;
2129 hiregs
.n_allocated
= 0;
2130 siregs
.n_allocated
= 0;
2131 diregs
.n_allocated
= 0;
2132 qiregs
.mode
= QImode
;
2133 hiregs
.mode
= HImode
;
2134 siregs
.mode
= SImode
;
2135 diregs
.mode
= DImode
;
2137 for (insn
= get_insns (); insn
; insn
= next
)
2139 next
= NEXT_INSN (insn
);
2140 if (!NONDEBUG_INSN_P (insn
)
2141 || asm_noperands (PATTERN (insn
)) >= 0
2142 || GET_CODE (PATTERN (insn
)) == USE
2143 || GET_CODE (PATTERN (insn
)) == CLOBBER
)
2146 qiregs
.n_in_use
= 0;
2147 hiregs
.n_in_use
= 0;
2148 siregs
.n_in_use
= 0;
2149 diregs
.n_in_use
= 0;
2150 extract_insn (insn
);
2151 enum attr_subregs_ok s_ok
= get_attr_subregs_ok (insn
);
2153 for (int i
= 0; i
< recog_data
.n_operands
; i
++)
2155 rtx op
= recog_data
.operand
[i
];
2156 if (GET_CODE (op
) != SUBREG
)
2159 rtx inner
= SUBREG_REG (op
);
2161 machine_mode outer_mode
= GET_MODE (op
);
2162 machine_mode inner_mode
= GET_MODE (inner
);
2165 && (GET_MODE_PRECISION (inner_mode
)
2166 >= GET_MODE_PRECISION (outer_mode
)))
2168 gcc_assert (SCALAR_INT_MODE_P (outer_mode
));
2169 struct reg_replace
*r
= (outer_mode
== QImode
? &qiregs
2170 : outer_mode
== HImode
? &hiregs
2171 : outer_mode
== SImode
? &siregs
2173 rtx new_reg
= get_replacement (r
);
2175 if (recog_data
.operand_type
[i
] != OP_OUT
)
2178 if (GET_MODE_PRECISION (inner_mode
)
2179 < GET_MODE_PRECISION (outer_mode
))
2184 rtx pat
= gen_rtx_SET (new_reg
,
2185 gen_rtx_fmt_e (code
, outer_mode
, inner
));
2186 emit_insn_before (pat
, insn
);
2189 if (recog_data
.operand_type
[i
] != OP_IN
)
2192 if (GET_MODE_PRECISION (inner_mode
)
2193 < GET_MODE_PRECISION (outer_mode
))
2198 rtx pat
= gen_rtx_SET (inner
,
2199 gen_rtx_fmt_e (code
, inner_mode
, new_reg
));
2200 emit_insn_after (pat
, insn
);
2202 validate_change (insn
, recog_data
.operand_loc
[i
], new_reg
, false);
2207 /* Loop structure of the function. The entire function is described as
2212 /* Parent parallel. */
2215 /* Next sibling parallel. */
2218 /* First child parallel. */
2221 /* Partitioning mask of the parallel. */
2224 /* Partitioning used within inner parallels. */
2225 unsigned inner_mask
;
2227 /* Location of parallel forked and join. The forked is the first
2228 block in the parallel and the join is the first block after of
2230 basic_block forked_block
;
2231 basic_block join_block
;
2233 rtx_insn
*forked_insn
;
2234 rtx_insn
*join_insn
;
2236 rtx_insn
*fork_insn
;
2237 rtx_insn
*joining_insn
;
2239 /* Basic blocks in this parallel, but not in child parallels. The
2240 FORKED and JOINING blocks are in the partition. The FORK and JOIN
2242 auto_vec
<basic_block
> blocks
;
2245 parallel (parallel
*parent
, unsigned mode
);
2249 /* Constructor links the new parallel into it's parent's chain of
2252 parallel::parallel (parallel
*parent_
, unsigned mask_
)
2253 :parent (parent_
), next (0), inner (0), mask (mask_
), inner_mask (0)
2255 forked_block
= join_block
= 0;
2256 forked_insn
= join_insn
= 0;
2257 fork_insn
= joining_insn
= 0;
2261 next
= parent
->inner
;
2262 parent
->inner
= this;
2266 parallel::~parallel ()
2272 /* Map of basic blocks to insns */
2273 typedef hash_map
<basic_block
, rtx_insn
*> bb_insn_map_t
;
2275 /* A tuple of an insn of interest and the BB in which it resides. */
2276 typedef std::pair
<rtx_insn
*, basic_block
> insn_bb_t
;
2277 typedef auto_vec
<insn_bb_t
> insn_bb_vec_t
;
2279 /* Split basic blocks such that each forked and join unspecs are at
2280 the start of their basic blocks. Thus afterwards each block will
2281 have a single partitioning mode. We also do the same for return
2282 insns, as they are executed by every thread. Return the
2283 partitioning mode of the function as a whole. Populate MAP with
2284 head and tail blocks. We also clear the BB visited flag, which is
2285 used when finding partitions. */
2288 nvptx_split_blocks (bb_insn_map_t
*map
)
2290 insn_bb_vec_t worklist
;
2294 /* Locate all the reorg instructions of interest. */
2295 FOR_ALL_BB_FN (block
, cfun
)
2297 bool seen_insn
= false;
2299 /* Clear visited flag, for use by parallel locator */
2300 block
->flags
&= ~BB_VISITED
;
2302 FOR_BB_INSNS (block
, insn
)
2306 switch (recog_memoized (insn
))
2311 case CODE_FOR_nvptx_forked
:
2312 case CODE_FOR_nvptx_join
:
2315 case CODE_FOR_return
:
2316 /* We also need to split just before return insns, as
2317 that insn needs executing by all threads, but the
2318 block it is in probably does not. */
2323 /* We've found an instruction that must be at the start of
2324 a block, but isn't. Add it to the worklist. */
2325 worklist
.safe_push (insn_bb_t (insn
, block
));
2327 /* It was already the first instruction. Just add it to
2329 map
->get_or_insert (block
) = insn
;
2334 /* Split blocks on the worklist. */
2337 basic_block remap
= 0;
2338 for (ix
= 0; worklist
.iterate (ix
, &elt
); ix
++)
2340 if (remap
!= elt
->second
)
2342 block
= elt
->second
;
2346 /* Split block before insn. The insn is in the new block */
2347 edge e
= split_block (block
, PREV_INSN (elt
->first
));
2350 map
->get_or_insert (block
) = elt
->first
;
2354 /* BLOCK is a basic block containing a head or tail instruction.
2355 Locate the associated prehead or pretail instruction, which must be
2356 in the single predecessor block. */
2359 nvptx_discover_pre (basic_block block
, int expected
)
2361 gcc_assert (block
->preds
->length () == 1);
2362 basic_block pre_block
= (*block
->preds
)[0]->src
;
2365 for (pre_insn
= BB_END (pre_block
); !INSN_P (pre_insn
);
2366 pre_insn
= PREV_INSN (pre_insn
))
2367 gcc_assert (pre_insn
!= BB_HEAD (pre_block
));
2369 gcc_assert (recog_memoized (pre_insn
) == expected
);
2373 /* Dump this parallel and all its inner parallels. */
2376 nvptx_dump_pars (parallel
*par
, unsigned depth
)
2378 fprintf (dump_file
, "%u: mask %d head=%d, tail=%d\n",
2380 par
->forked_block
? par
->forked_block
->index
: -1,
2381 par
->join_block
? par
->join_block
->index
: -1);
2383 fprintf (dump_file
, " blocks:");
2386 for (unsigned ix
= 0; par
->blocks
.iterate (ix
, &block
); ix
++)
2387 fprintf (dump_file
, " %d", block
->index
);
2388 fprintf (dump_file
, "\n");
2390 nvptx_dump_pars (par
->inner
, depth
+ 1);
2393 nvptx_dump_pars (par
->next
, depth
);
2396 /* If BLOCK contains a fork/join marker, process it to create or
2397 terminate a loop structure. Add this block to the current loop,
2398 and then walk successor blocks. */
2401 nvptx_find_par (bb_insn_map_t
*map
, parallel
*par
, basic_block block
)
2403 if (block
->flags
& BB_VISITED
)
2405 block
->flags
|= BB_VISITED
;
2407 if (rtx_insn
**endp
= map
->get (block
))
2409 rtx_insn
*end
= *endp
;
2411 /* This is a block head or tail, or return instruction. */
2412 switch (recog_memoized (end
))
2414 case CODE_FOR_return
:
2415 /* Return instructions are in their own block, and we
2416 don't need to do anything more. */
2419 case CODE_FOR_nvptx_forked
:
2420 /* Loop head, create a new inner loop and add it into
2421 our parent's child list. */
2423 unsigned mask
= UINTVAL (XVECEXP (PATTERN (end
), 0, 0));
2426 par
= new parallel (par
, mask
);
2427 par
->forked_block
= block
;
2428 par
->forked_insn
= end
;
2429 if (!(mask
& GOMP_DIM_MASK (GOMP_DIM_MAX
))
2430 && (mask
& GOMP_DIM_MASK (GOMP_DIM_WORKER
)))
2432 = nvptx_discover_pre (block
, CODE_FOR_nvptx_fork
);
2436 case CODE_FOR_nvptx_join
:
2437 /* A loop tail. Finish the current loop and return to
2440 unsigned mask
= UINTVAL (XVECEXP (PATTERN (end
), 0, 0));
2442 gcc_assert (par
->mask
== mask
);
2443 par
->join_block
= block
;
2444 par
->join_insn
= end
;
2445 if (!(mask
& GOMP_DIM_MASK (GOMP_DIM_MAX
))
2446 && (mask
& GOMP_DIM_MASK (GOMP_DIM_WORKER
)))
2448 = nvptx_discover_pre (block
, CODE_FOR_nvptx_joining
);
2459 /* Add this block onto the current loop's list of blocks. */
2460 par
->blocks
.safe_push (block
);
2462 /* This must be the entry block. Create a NULL parallel. */
2463 par
= new parallel (0, 0);
2465 /* Walk successor blocks. */
2469 FOR_EACH_EDGE (e
, ei
, block
->succs
)
2470 nvptx_find_par (map
, par
, e
->dest
);
2475 /* DFS walk the CFG looking for fork & join markers. Construct
2476 loop structures as we go. MAP is a mapping of basic blocks
2477 to head & tail markers, discovered when splitting blocks. This
2478 speeds up the discovery. We rely on the BB visited flag having
2479 been cleared when splitting blocks. */
2482 nvptx_discover_pars (bb_insn_map_t
*map
)
2486 /* Mark exit blocks as visited. */
2487 block
= EXIT_BLOCK_PTR_FOR_FN (cfun
);
2488 block
->flags
|= BB_VISITED
;
2490 /* And entry block as not. */
2491 block
= ENTRY_BLOCK_PTR_FOR_FN (cfun
);
2492 block
->flags
&= ~BB_VISITED
;
2494 parallel
*par
= nvptx_find_par (map
, 0, block
);
2498 fprintf (dump_file
, "\nLoops\n");
2499 nvptx_dump_pars (par
, 0);
2500 fprintf (dump_file
, "\n");
2506 /* Analyse a group of BBs within a partitioned region and create N
2507 Single-Entry-Single-Exit regions. Some of those regions will be
2508 trivial ones consisting of a single BB. The blocks of a
2509 partitioned region might form a set of disjoint graphs -- because
2510 the region encloses a differently partitioned sub region.
2512 We use the linear time algorithm described in 'Finding Regions Fast:
2513 Single Entry Single Exit and control Regions in Linear Time'
2514 Johnson, Pearson & Pingali. That algorithm deals with complete
2515 CFGs, where a back edge is inserted from END to START, and thus the
2516 problem becomes one of finding equivalent loops.
2518 In this case we have a partial CFG. We complete it by redirecting
2519 any incoming edge to the graph to be from an arbitrary external BB,
2520 and similarly redirecting any outgoing edge to be to that BB.
2521 Thus we end up with a closed graph.
2523 The algorithm works by building a spanning tree of an undirected
2524 graph and keeping track of back edges from nodes further from the
2525 root in the tree to nodes nearer to the root in the tree. In the
2526 description below, the root is up and the tree grows downwards.
2528 We avoid having to deal with degenerate back-edges to the same
2529 block, by splitting each BB into 3 -- one for input edges, one for
2530 the node itself and one for the output edges. Such back edges are
2531 referred to as 'Brackets'. Cycle equivalent nodes will have the
2532 same set of brackets.
2534 Determining bracket equivalency is done by maintaining a list of
2535 brackets in such a manner that the list length and final bracket
2536 uniquely identify the set.
2538 We use coloring to mark all BBs with cycle equivalency with the
2539 same color. This is the output of the 'Finding Regions Fast'
2540 algorithm. Notice it doesn't actually find the set of nodes within
2541 a particular region, just unordered sets of nodes that are the
2542 entries and exits of SESE regions.
2544 After determining cycle equivalency, we need to find the minimal
2545 set of SESE regions. Do this with a DFS coloring walk of the
2546 complete graph. We're either 'looking' or 'coloring'. When
2547 looking, and we're in the subgraph, we start coloring the color of
2548 the current node, and remember that node as the start of the
2549 current color's SESE region. Every time we go to a new node, we
2550 decrement the count of nodes with that color. If it reaches zero,
2551 we remember that node as the end of the current color's SESE region
2552 and return to 'looking'. Otherwise we color the node the current
2555 This way we end up with coloring the inside of non-trivial SESE
2556 regions with the color of that region. */
2558 /* A pair of BBs. We use this to represent SESE regions. */
2559 typedef std::pair
<basic_block
, basic_block
> bb_pair_t
;
2560 typedef auto_vec
<bb_pair_t
> bb_pair_vec_t
;
2562 /* A node in the undirected CFG. The discriminator SECOND indicates just
2563 above or just below the BB indicated by FIRST. */
2564 typedef std::pair
<basic_block
, int> pseudo_node_t
;
2566 /* A bracket indicates an edge towards the root of the spanning tree of the
2567 undirected graph. Each bracket has a color, determined
2568 from the current set of brackets. */
2571 pseudo_node_t back
; /* Back target */
2573 /* Current color and size of set. */
2577 bracket (pseudo_node_t back_
)
2578 : back (back_
), color (~0u), size (~0u)
2582 unsigned get_color (auto_vec
<unsigned> &color_counts
, unsigned length
)
2587 color
= color_counts
.length ();
2588 color_counts
.quick_push (0);
2590 color_counts
[color
]++;
2595 typedef auto_vec
<bracket
> bracket_vec_t
;
2597 /* Basic block info for finding SESE regions. */
2601 int node
; /* Node number in spanning tree. */
2602 int parent
; /* Parent node number. */
2604 /* The algorithm splits each node A into Ai, A', Ao. The incoming
2605 edges arrive at pseudo-node Ai and the outgoing edges leave at
2606 pseudo-node Ao. We have to remember which way we arrived at a
2607 particular node when generating the spanning tree. dir > 0 means
2608 we arrived at Ai, dir < 0 means we arrived at Ao. */
2611 /* Lowest numbered pseudo-node reached via a backedge from this
2612 node, or any descendant. */
2615 int color
; /* Cycle-equivalence color */
2617 /* Stack of brackets for this node. */
2618 bracket_vec_t brackets
;
2620 bb_sese (unsigned node_
, unsigned p
, int dir_
)
2621 :node (node_
), parent (p
), dir (dir_
)
2626 /* Push a bracket ending at BACK. */
2627 void push (const pseudo_node_t
&back
)
2630 fprintf (dump_file
, "Pushing backedge %d:%+d\n",
2631 back
.first
? back
.first
->index
: 0, back
.second
);
2632 brackets
.safe_push (bracket (back
));
2635 void append (bb_sese
*child
);
2636 void remove (const pseudo_node_t
&);
2638 /* Set node's color. */
2639 void set_color (auto_vec
<unsigned> &color_counts
)
2641 color
= brackets
.last ().get_color (color_counts
, brackets
.length ());
2645 bb_sese::~bb_sese ()
2649 /* Destructively append CHILD's brackets. */
2652 bb_sese::append (bb_sese
*child
)
2654 if (int len
= child
->brackets
.length ())
2660 for (ix
= 0; ix
< len
; ix
++)
2662 const pseudo_node_t
&pseudo
= child
->brackets
[ix
].back
;
2663 fprintf (dump_file
, "Appending (%d)'s backedge %d:%+d\n",
2664 child
->node
, pseudo
.first
? pseudo
.first
->index
: 0,
2668 if (!brackets
.length ())
2669 std::swap (brackets
, child
->brackets
);
2672 brackets
.reserve (len
);
2673 for (ix
= 0; ix
< len
; ix
++)
2674 brackets
.quick_push (child
->brackets
[ix
]);
2679 /* Remove brackets that terminate at PSEUDO. */
2682 bb_sese::remove (const pseudo_node_t
&pseudo
)
2684 unsigned removed
= 0;
2685 int len
= brackets
.length ();
2687 for (int ix
= 0; ix
< len
; ix
++)
2689 if (brackets
[ix
].back
== pseudo
)
2692 fprintf (dump_file
, "Removing backedge %d:%+d\n",
2693 pseudo
.first
? pseudo
.first
->index
: 0, pseudo
.second
);
2697 brackets
[ix
-removed
] = brackets
[ix
];
2703 /* Accessors for BB's aux pointer. */
2704 #define BB_SET_SESE(B, S) ((B)->aux = (S))
2705 #define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
2707 /* DFS walk creating SESE data structures. Only cover nodes with
2708 BB_VISITED set. Append discovered blocks to LIST. We number in
2709 increments of 3 so that the above and below pseudo nodes can be
2710 implicitly numbered too. */
2713 nvptx_sese_number (int n
, int p
, int dir
, basic_block b
,
2714 auto_vec
<basic_block
> *list
)
2716 if (BB_GET_SESE (b
))
2720 fprintf (dump_file
, "Block %d(%d), parent (%d), orientation %+d\n",
2721 b
->index
, n
, p
, dir
);
2723 BB_SET_SESE (b
, new bb_sese (n
, p
, dir
));
2727 list
->quick_push (b
);
2729 /* First walk the nodes on the 'other side' of this node, then walk
2730 the nodes on the same side. */
2731 for (unsigned ix
= 2; ix
; ix
--)
2733 vec
<edge
, va_gc
> *edges
= dir
> 0 ? b
->succs
: b
->preds
;
2734 size_t offset
= (dir
> 0 ? offsetof (edge_def
, dest
)
2735 : offsetof (edge_def
, src
));
2739 FOR_EACH_EDGE (e
, ei
, edges
)
2741 basic_block target
= *(basic_block
*)((char *)e
+ offset
);
2743 if (target
->flags
& BB_VISITED
)
2744 n
= nvptx_sese_number (n
, p
, dir
, target
, list
);
2751 /* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
2752 EDGES are the outgoing edges and OFFSET is the offset to the src
2753 or dst block on the edges. */
2756 nvptx_sese_pseudo (basic_block me
, bb_sese
*sese
, int depth
, int dir
,
2757 vec
<edge
, va_gc
> *edges
, size_t offset
)
2761 int hi_back
= depth
;
2762 pseudo_node_t
node_back (0, depth
);
2763 int hi_child
= depth
;
2764 pseudo_node_t
node_child (0, depth
);
2765 basic_block child
= NULL
;
2766 unsigned num_children
= 0;
2767 int usd
= -dir
* sese
->dir
;
2770 fprintf (dump_file
, "\nProcessing %d(%d) %+d\n",
2771 me
->index
, sese
->node
, dir
);
2775 /* This is the above pseudo-child. It has the BB itself as an
2776 additional child node. */
2777 node_child
= sese
->high
;
2778 hi_child
= node_child
.second
;
2779 if (node_child
.first
)
2780 hi_child
+= BB_GET_SESE (node_child
.first
)->node
;
2784 /* Examine each edge.
2785 - if it is a child (a) append its bracket list and (b) record
2786 whether it is the child with the highest reaching bracket.
2787 - if it is an edge to ancestor, record whether it's the highest
2788 reaching backlink. */
2789 FOR_EACH_EDGE (e
, ei
, edges
)
2791 basic_block target
= *(basic_block
*)((char *)e
+ offset
);
2793 if (bb_sese
*t_sese
= BB_GET_SESE (target
))
2795 if (t_sese
->parent
== sese
->node
&& !(t_sese
->dir
+ usd
))
2797 /* Child node. Append its bracket list. */
2799 sese
->append (t_sese
);
2801 /* Compare it's hi value. */
2802 int t_hi
= t_sese
->high
.second
;
2804 if (basic_block child_hi_block
= t_sese
->high
.first
)
2805 t_hi
+= BB_GET_SESE (child_hi_block
)->node
;
2807 if (hi_child
> t_hi
)
2810 node_child
= t_sese
->high
;
2814 else if (t_sese
->node
< sese
->node
+ dir
2815 && !(dir
< 0 && sese
->parent
== t_sese
->node
))
2817 /* Non-parental ancestor node -- a backlink. */
2818 int d
= usd
* t_sese
->dir
;
2819 int back
= t_sese
->node
+ d
;
2824 node_back
= pseudo_node_t (target
, d
);
2829 { /* Fallen off graph, backlink to entry node. */
2831 node_back
= pseudo_node_t (0, 0);
2835 /* Remove any brackets that terminate at this pseudo node. */
2836 sese
->remove (pseudo_node_t (me
, dir
));
2838 /* Now push any backlinks from this pseudo node. */
2839 FOR_EACH_EDGE (e
, ei
, edges
)
2841 basic_block target
= *(basic_block
*)((char *)e
+ offset
);
2842 if (bb_sese
*t_sese
= BB_GET_SESE (target
))
2844 if (t_sese
->node
< sese
->node
+ dir
2845 && !(dir
< 0 && sese
->parent
== t_sese
->node
))
2846 /* Non-parental ancestor node - backedge from me. */
2847 sese
->push (pseudo_node_t (target
, usd
* t_sese
->dir
));
2851 /* back edge to entry node */
2852 sese
->push (pseudo_node_t (0, 0));
2856 /* If this node leads directly or indirectly to a no-return region of
2857 the graph, then fake a backedge to entry node. */
2858 if (!sese
->brackets
.length () || !edges
|| !edges
->length ())
2861 node_back
= pseudo_node_t (0, 0);
2862 sese
->push (node_back
);
2865 /* Record the highest reaching backedge from us or a descendant. */
2866 sese
->high
= hi_back
< hi_child
? node_back
: node_child
;
2868 if (num_children
> 1)
2870 /* There is more than one child -- this is a Y shaped piece of
2871 spanning tree. We have to insert a fake backedge from this
2872 node to the highest ancestor reached by not-the-highest
2873 reaching child. Note that there may be multiple children
2874 with backedges to the same highest node. That's ok and we
2875 insert the edge to that highest node. */
2877 if (dir
< 0 && child
)
2879 node_child
= sese
->high
;
2880 hi_child
= node_child
.second
;
2881 if (node_child
.first
)
2882 hi_child
+= BB_GET_SESE (node_child
.first
)->node
;
2885 FOR_EACH_EDGE (e
, ei
, edges
)
2887 basic_block target
= *(basic_block
*)((char *)e
+ offset
);
2889 if (target
== child
)
2890 /* Ignore the highest child. */
2893 bb_sese
*t_sese
= BB_GET_SESE (target
);
2896 if (t_sese
->parent
!= sese
->node
)
2900 /* Compare its hi value. */
2901 int t_hi
= t_sese
->high
.second
;
2903 if (basic_block child_hi_block
= t_sese
->high
.first
)
2904 t_hi
+= BB_GET_SESE (child_hi_block
)->node
;
2906 if (hi_child
> t_hi
)
2909 node_child
= t_sese
->high
;
2913 sese
->push (node_child
);
2918 /* DFS walk of BB graph. Color node BLOCK according to COLORING then
2919 proceed to successors. Set SESE entry and exit nodes of
2923 nvptx_sese_color (auto_vec
<unsigned> &color_counts
, bb_pair_vec_t
®ions
,
2924 basic_block block
, int coloring
)
2926 bb_sese
*sese
= BB_GET_SESE (block
);
2928 if (block
->flags
& BB_VISITED
)
2930 /* If we've already encountered this block, either we must not
2931 be coloring, or it must have been colored the current color. */
2932 gcc_assert (coloring
< 0 || (sese
&& coloring
== sese
->color
));
2936 block
->flags
|= BB_VISITED
;
2942 /* Start coloring a region. */
2943 regions
[sese
->color
].first
= block
;
2944 coloring
= sese
->color
;
2947 if (!--color_counts
[sese
->color
] && sese
->color
== coloring
)
2949 /* Found final block of SESE region. */
2950 regions
[sese
->color
].second
= block
;
2954 /* Color the node, so we can assert on revisiting the node
2955 that the graph is indeed SESE. */
2956 sese
->color
= coloring
;
2959 /* Fallen off the subgraph, we cannot be coloring. */
2960 gcc_assert (coloring
< 0);
2962 /* Walk each successor block. */
2963 if (block
->succs
&& block
->succs
->length ())
2968 FOR_EACH_EDGE (e
, ei
, block
->succs
)
2969 nvptx_sese_color (color_counts
, regions
, e
->dest
, coloring
);
2972 gcc_assert (coloring
< 0);
2975 /* Find minimal set of SESE regions covering BLOCKS. REGIONS might
2976 end up with NULL entries in it. */
2979 nvptx_find_sese (auto_vec
<basic_block
> &blocks
, bb_pair_vec_t
®ions
)
2984 /* First clear each BB of the whole function. */
2985 FOR_EACH_BB_FN (block
, cfun
)
2987 block
->flags
&= ~BB_VISITED
;
2988 BB_SET_SESE (block
, 0);
2990 block
= EXIT_BLOCK_PTR_FOR_FN (cfun
);
2991 block
->flags
&= ~BB_VISITED
;
2992 BB_SET_SESE (block
, 0);
2993 block
= ENTRY_BLOCK_PTR_FOR_FN (cfun
);
2994 block
->flags
&= ~BB_VISITED
;
2995 BB_SET_SESE (block
, 0);
2997 /* Mark blocks in the function that are in this graph. */
2998 for (ix
= 0; blocks
.iterate (ix
, &block
); ix
++)
2999 block
->flags
|= BB_VISITED
;
3001 /* Counts of nodes assigned to each color. There cannot be more
3002 colors than blocks (and hopefully there will be fewer). */
3003 auto_vec
<unsigned> color_counts
;
3004 color_counts
.reserve (blocks
.length ());
3006 /* Worklist of nodes in the spanning tree. Again, there cannot be
3007 more nodes in the tree than blocks (there will be fewer if the
3008 CFG of blocks is disjoint). */
3009 auto_vec
<basic_block
> spanlist
;
3010 spanlist
.reserve (blocks
.length ());
3012 /* Make sure every block has its cycle class determined. */
3013 for (ix
= 0; blocks
.iterate (ix
, &block
); ix
++)
3015 if (BB_GET_SESE (block
))
3016 /* We already met this block in an earlier graph solve. */
3020 fprintf (dump_file
, "Searching graph starting at %d\n", block
->index
);
3022 /* Number the nodes reachable from block initial DFS order. */
3023 int depth
= nvptx_sese_number (2, 0, +1, block
, &spanlist
);
3025 /* Now walk in reverse DFS order to find cycle equivalents. */
3026 while (spanlist
.length ())
3028 block
= spanlist
.pop ();
3029 bb_sese
*sese
= BB_GET_SESE (block
);
3031 /* Do the pseudo node below. */
3032 nvptx_sese_pseudo (block
, sese
, depth
, +1,
3033 sese
->dir
> 0 ? block
->succs
: block
->preds
,
3034 (sese
->dir
> 0 ? offsetof (edge_def
, dest
)
3035 : offsetof (edge_def
, src
)));
3036 sese
->set_color (color_counts
);
3037 /* Do the pseudo node above. */
3038 nvptx_sese_pseudo (block
, sese
, depth
, -1,
3039 sese
->dir
< 0 ? block
->succs
: block
->preds
,
3040 (sese
->dir
< 0 ? offsetof (edge_def
, dest
)
3041 : offsetof (edge_def
, src
)));
3044 fprintf (dump_file
, "\n");
3050 const char *comma
= "";
3052 fprintf (dump_file
, "Found %d cycle equivalents\n",
3053 color_counts
.length ());
3054 for (ix
= 0; color_counts
.iterate (ix
, &count
); ix
++)
3056 fprintf (dump_file
, "%s%d[%d]={", comma
, ix
, count
);
3059 for (unsigned jx
= 0; blocks
.iterate (jx
, &block
); jx
++)
3060 if (BB_GET_SESE (block
)->color
== ix
)
3062 block
->flags
|= BB_VISITED
;
3063 fprintf (dump_file
, "%s%d", comma
, block
->index
);
3066 fprintf (dump_file
, "}");
3069 fprintf (dump_file
, "\n");
3072 /* Now we've colored every block in the subgraph. We now need to
3073 determine the minimal set of SESE regions that cover that
3074 subgraph. Do this with a DFS walk of the complete function.
3075 During the walk we're either 'looking' or 'coloring'. When we
3076 reach the last node of a particular color, we stop coloring and
3077 return to looking. */
3079 /* There cannot be more SESE regions than colors. */
3080 regions
.reserve (color_counts
.length ());
3081 for (ix
= color_counts
.length (); ix
--;)
3082 regions
.quick_push (bb_pair_t (0, 0));
3084 for (ix
= 0; blocks
.iterate (ix
, &block
); ix
++)
3085 block
->flags
&= ~BB_VISITED
;
3087 nvptx_sese_color (color_counts
, regions
, ENTRY_BLOCK_PTR_FOR_FN (cfun
), -1);
3091 const char *comma
= "";
3092 int len
= regions
.length ();
3094 fprintf (dump_file
, "SESE regions:");
3095 for (ix
= 0; ix
!= len
; ix
++)
3097 basic_block from
= regions
[ix
].first
;
3098 basic_block to
= regions
[ix
].second
;
3102 fprintf (dump_file
, "%s %d{%d", comma
, ix
, from
->index
);
3104 fprintf (dump_file
, "->%d", to
->index
);
3106 int color
= BB_GET_SESE (from
)->color
;
3108 /* Print the blocks within the region (excluding ends). */
3109 FOR_EACH_BB_FN (block
, cfun
)
3111 bb_sese
*sese
= BB_GET_SESE (block
);
3113 if (sese
&& sese
->color
== color
3114 && block
!= from
&& block
!= to
)
3115 fprintf (dump_file
, ".%d", block
->index
);
3117 fprintf (dump_file
, "}");
3121 fprintf (dump_file
, "\n\n");
3124 for (ix
= 0; blocks
.iterate (ix
, &block
); ix
++)
3125 delete BB_GET_SESE (block
);
3131 /* Propagate live state at the start of a partitioned region. BLOCK
3132 provides the live register information, and might not contain
3133 INSN. Propagation is inserted just after INSN. RW indicates whether
3134 we are reading and/or writing state. This
3135 separation is needed for worker-level proppagation where we
3136 essentially do a spill & fill. FN is the underlying worker
3137 function to generate the propagation instructions for single
3138 register. DATA is user data.
3140 We propagate the live register set and the entire frame. We could
3141 do better by (a) propagating just the live set that is used within
3142 the partitioned regions and (b) only propagating stack entries that
3143 are used. The latter might be quite hard to determine. */
3145 typedef rtx (*propagator_fn
) (rtx
, propagate_mask
, unsigned, void *);
3148 nvptx_propagate (basic_block block
, rtx_insn
*insn
, propagate_mask rw
,
3149 propagator_fn fn
, void *data
)
3151 bitmap live
= DF_LIVE_IN (block
);
3152 bitmap_iterator iterator
;
3155 /* Copy the frame array. */
3156 HOST_WIDE_INT fs
= get_frame_size ();
3159 rtx tmp
= gen_reg_rtx (DImode
);
3161 rtx ptr
= gen_reg_rtx (Pmode
);
3162 rtx pred
= NULL_RTX
;
3163 rtx_code_label
*label
= NULL
;
3165 gcc_assert (!(fs
& (GET_MODE_SIZE (DImode
) - 1)));
3166 fs
/= GET_MODE_SIZE (DImode
);
3167 /* Detect single iteration loop. */
3172 emit_insn (gen_rtx_SET (ptr
, frame_pointer_rtx
));
3175 idx
= gen_reg_rtx (SImode
);
3176 pred
= gen_reg_rtx (BImode
);
3177 label
= gen_label_rtx ();
3179 emit_insn (gen_rtx_SET (idx
, GEN_INT (fs
)));
3180 /* Allow worker function to initialize anything needed. */
3181 rtx init
= fn (tmp
, PM_loop_begin
, fs
, data
);
3185 LABEL_NUSES (label
)++;
3186 emit_insn (gen_addsi3 (idx
, idx
, GEN_INT (-1)));
3189 emit_insn (gen_rtx_SET (tmp
, gen_rtx_MEM (DImode
, ptr
)));
3190 emit_insn (fn (tmp
, rw
, fs
, data
));
3192 emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode
, ptr
), tmp
));
3195 emit_insn (gen_rtx_SET (pred
, gen_rtx_NE (BImode
, idx
, const0_rtx
)));
3196 emit_insn (gen_adddi3 (ptr
, ptr
, GEN_INT (GET_MODE_SIZE (DImode
))));
3197 emit_insn (gen_br_true_uni (pred
, label
));
3198 rtx fini
= fn (tmp
, PM_loop_end
, fs
, data
);
3201 emit_insn (gen_rtx_CLOBBER (GET_MODE (idx
), idx
));
3203 emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp
), tmp
));
3204 emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr
), ptr
));
3205 rtx cpy
= get_insns ();
3207 insn
= emit_insn_after (cpy
, insn
);
3210 /* Copy live registers. */
3211 EXECUTE_IF_SET_IN_BITMAP (live
, 0, ix
, iterator
)
3213 rtx reg
= regno_reg_rtx
[ix
];
3215 if (REGNO (reg
) >= FIRST_PSEUDO_REGISTER
)
3217 rtx bcast
= fn (reg
, rw
, 0, data
);
3219 insn
= emit_insn_after (bcast
, insn
);
3224 /* Worker for nvptx_vpropagate. */
3227 vprop_gen (rtx reg
, propagate_mask pm
,
3228 unsigned ARG_UNUSED (count
), void *ARG_UNUSED (data
))
3230 if (!(pm
& PM_read_write
))
3233 return nvptx_gen_vcast (reg
);
3236 /* Propagate state that is live at start of BLOCK across the vectors
3237 of a single warp. Propagation is inserted just after INSN. */
3240 nvptx_vpropagate (basic_block block
, rtx_insn
*insn
)
3242 nvptx_propagate (block
, insn
, PM_read_write
, vprop_gen
, 0);
3245 /* Worker for nvptx_wpropagate. */
3248 wprop_gen (rtx reg
, propagate_mask pm
, unsigned rep
, void *data_
)
3250 wcast_data_t
*data
= (wcast_data_t
*)data_
;
3252 if (pm
& PM_loop_begin
)
3254 /* Starting a loop, initialize pointer. */
3255 unsigned align
= GET_MODE_ALIGNMENT (GET_MODE (reg
)) / BITS_PER_UNIT
;
3257 if (align
> worker_bcast_align
)
3258 worker_bcast_align
= align
;
3259 data
->offset
= (data
->offset
+ align
- 1) & ~(align
- 1);
3261 data
->ptr
= gen_reg_rtx (Pmode
);
3263 return gen_adddi3 (data
->ptr
, data
->base
, GEN_INT (data
->offset
));
3265 else if (pm
& PM_loop_end
)
3267 rtx clobber
= gen_rtx_CLOBBER (GET_MODE (data
->ptr
), data
->ptr
);
3268 data
->ptr
= NULL_RTX
;
3272 return nvptx_gen_wcast (reg
, pm
, rep
, data
);
3275 /* Spill or fill live state that is live at start of BLOCK. PRE_P
3276 indicates if this is just before partitioned mode (do spill), or
3277 just after it starts (do fill). Sequence is inserted just after
3281 nvptx_wpropagate (bool pre_p
, basic_block block
, rtx_insn
*insn
)
3285 data
.base
= gen_reg_rtx (Pmode
);
3287 data
.ptr
= NULL_RTX
;
3289 nvptx_propagate (block
, insn
, pre_p
? PM_read
: PM_write
, wprop_gen
, &data
);
3292 /* Stuff was emitted, initialize the base pointer now. */
3293 rtx init
= gen_rtx_SET (data
.base
, worker_bcast_sym
);
3294 emit_insn_after (init
, insn
);
3296 if (worker_bcast_size
< data
.offset
)
3297 worker_bcast_size
= data
.offset
;
3301 /* Emit a worker-level synchronization barrier. We use different
3302 markers for before and after synchronizations. */
3305 nvptx_wsync (bool after
)
3307 return gen_nvptx_barsync (GEN_INT (after
));
3310 /* Single neutering according to MASK. FROM is the incoming block and
3311 TO is the outgoing block. These may be the same block. Insert at
3314 if (tid.<axis>) goto end.
3316 and insert before ending branch of TO (if there is such an insn):
3319 <possibly-broadcast-cond>
3322 We currently only use differnt FROM and TO when skipping an entire
3323 loop. We could do more if we detected superblocks. */
3326 nvptx_single (unsigned mask
, basic_block from
, basic_block to
)
3328 rtx_insn
*head
= BB_HEAD (from
);
3329 rtx_insn
*tail
= BB_END (to
);
3330 unsigned skip_mask
= mask
;
3332 /* Find first insn of from block */
3333 while (head
!= BB_END (from
) && !INSN_P (head
))
3334 head
= NEXT_INSN (head
);
3336 /* Find last insn of to block */
3337 rtx_insn
*limit
= from
== to
? head
: BB_HEAD (to
);
3338 while (tail
!= limit
&& !INSN_P (tail
) && !LABEL_P (tail
))
3339 tail
= PREV_INSN (tail
);
3341 /* Detect if tail is a branch. */
3342 rtx tail_branch
= NULL_RTX
;
3343 rtx cond_branch
= NULL_RTX
;
3344 if (tail
&& INSN_P (tail
))
3346 tail_branch
= PATTERN (tail
);
3347 if (GET_CODE (tail_branch
) != SET
|| SET_DEST (tail_branch
) != pc_rtx
)
3348 tail_branch
= NULL_RTX
;
3351 cond_branch
= SET_SRC (tail_branch
);
3352 if (GET_CODE (cond_branch
) != IF_THEN_ELSE
)
3353 cond_branch
= NULL_RTX
;
3359 /* If this is empty, do nothing. */
3360 if (!head
|| !INSN_P (head
))
3363 /* If this is a dummy insn, do nothing. */
3364 switch (recog_memoized (head
))
3368 case CODE_FOR_nvptx_fork
:
3369 case CODE_FOR_nvptx_forked
:
3370 case CODE_FOR_nvptx_joining
:
3371 case CODE_FOR_nvptx_join
:
3377 /* If we're only doing vector single, there's no need to
3378 emit skip code because we'll not insert anything. */
3379 if (!(mask
& GOMP_DIM_MASK (GOMP_DIM_VECTOR
)))
3382 else if (tail_branch
)
3383 /* Block with only unconditional branch. Nothing to do. */
3387 /* Insert the vector test inside the worker test. */
3389 rtx_insn
*before
= tail
;
3390 for (mode
= GOMP_DIM_WORKER
; mode
<= GOMP_DIM_VECTOR
; mode
++)
3391 if (GOMP_DIM_MASK (mode
) & skip_mask
)
3393 rtx_code_label
*label
= gen_label_rtx ();
3394 rtx pred
= cfun
->machine
->axis_predicate
[mode
- GOMP_DIM_WORKER
];
3398 pred
= gen_reg_rtx (BImode
);
3399 cfun
->machine
->axis_predicate
[mode
- GOMP_DIM_WORKER
] = pred
;
3403 if (mode
== GOMP_DIM_VECTOR
)
3404 br
= gen_br_true (pred
, label
);
3406 br
= gen_br_true_uni (pred
, label
);
3407 emit_insn_before (br
, head
);
3409 LABEL_NUSES (label
)++;
3411 before
= emit_label_before (label
, before
);
3413 emit_label_after (label
, tail
);
3416 /* Now deal with propagating the branch condition. */
3419 rtx pvar
= XEXP (XEXP (cond_branch
, 0), 0);
3421 if (GOMP_DIM_MASK (GOMP_DIM_VECTOR
) == mask
)
3423 /* Vector mode only, do a shuffle. */
3424 emit_insn_before (nvptx_gen_vcast (pvar
), tail
);
3428 /* Includes worker mode, do spill & fill. By construction
3429 we should never have worker mode only. */
3432 data
.base
= worker_bcast_sym
;
3435 if (worker_bcast_size
< GET_MODE_SIZE (SImode
))
3436 worker_bcast_size
= GET_MODE_SIZE (SImode
);
3439 emit_insn_before (nvptx_gen_wcast (pvar
, PM_read
, 0, &data
),
3441 /* Barrier so other workers can see the write. */
3442 emit_insn_before (nvptx_wsync (false), tail
);
3444 emit_insn_before (nvptx_gen_wcast (pvar
, PM_write
, 0, &data
), tail
);
3445 /* This barrier is needed to avoid worker zero clobbering
3446 the broadcast buffer before all the other workers have
3447 had a chance to read this instance of it. */
3448 emit_insn_before (nvptx_wsync (true), tail
);
3451 extract_insn (tail
);
3452 rtx unsp
= gen_rtx_UNSPEC (BImode
, gen_rtvec (1, pvar
),
3454 validate_change (tail
, recog_data
.operand_loc
[0], unsp
, false);
3458 /* PAR is a parallel that is being skipped in its entirety according to
3459 MASK. Treat this as skipping a superblock starting at forked
3460 and ending at joining. */
3463 nvptx_skip_par (unsigned mask
, parallel
*par
)
3465 basic_block tail
= par
->join_block
;
3466 gcc_assert (tail
->preds
->length () == 1);
3468 basic_block pre_tail
= (*tail
->preds
)[0]->src
;
3469 gcc_assert (pre_tail
->succs
->length () == 1);
3471 nvptx_single (mask
, par
->forked_block
, pre_tail
);
3474 /* If PAR has a single inner parallel and PAR itself only contains
3475 empty entry and exit blocks, swallow the inner PAR. */
3478 nvptx_optimize_inner (parallel
*par
)
3480 parallel
*inner
= par
->inner
;
3482 /* We mustn't be the outer dummy par. */
3486 /* We must have a single inner par. */
3487 if (!inner
|| inner
->next
)
3490 /* We must only contain 2 blocks ourselves -- the head and tail of
3492 if (par
->blocks
.length () != 2)
3495 /* We must be disjoint partitioning. As we only have vector and
3496 worker partitioning, this is sufficient to guarantee the pars
3497 have adjacent partitioning. */
3498 if ((par
->mask
& inner
->mask
) & (GOMP_DIM_MASK (GOMP_DIM_MAX
) - 1))
3499 /* This indicates malformed code generation. */
3502 /* The outer forked insn should be immediately followed by the inner
3504 rtx_insn
*forked
= par
->forked_insn
;
3505 rtx_insn
*fork
= BB_END (par
->forked_block
);
3507 if (NEXT_INSN (forked
) != fork
)
3509 gcc_checking_assert (recog_memoized (fork
) == CODE_FOR_nvptx_fork
);
3511 /* The outer joining insn must immediately follow the inner join
3513 rtx_insn
*joining
= par
->joining_insn
;
3514 rtx_insn
*join
= inner
->join_insn
;
3515 if (NEXT_INSN (join
) != joining
)
3518 /* Preconditions met. Swallow the inner par. */
3520 fprintf (dump_file
, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
3521 inner
->mask
, inner
->forked_block
->index
,
3522 inner
->join_block
->index
,
3523 par
->mask
, par
->forked_block
->index
, par
->join_block
->index
);
3525 par
->mask
|= inner
->mask
& (GOMP_DIM_MASK (GOMP_DIM_MAX
) - 1);
3527 par
->blocks
.reserve (inner
->blocks
.length ());
3528 while (inner
->blocks
.length ())
3529 par
->blocks
.quick_push (inner
->blocks
.pop ());
3531 par
->inner
= inner
->inner
;
3532 inner
->inner
= NULL
;
3537 /* Process the parallel PAR and all its contained
3538 parallels. We do everything but the neutering. Return mask of
3539 partitioned modes used within this parallel. */
3542 nvptx_process_pars (parallel
*par
)
3545 nvptx_optimize_inner (par
);
3547 unsigned inner_mask
= par
->mask
;
3549 /* Do the inner parallels first. */
3552 par
->inner_mask
= nvptx_process_pars (par
->inner
);
3553 inner_mask
|= par
->inner_mask
;
3556 if (par
->mask
& GOMP_DIM_MASK (GOMP_DIM_MAX
))
3557 /* No propagation needed for a call. */;
3558 else if (par
->mask
& GOMP_DIM_MASK (GOMP_DIM_WORKER
))
3560 nvptx_wpropagate (false, par
->forked_block
, par
->forked_insn
);
3561 nvptx_wpropagate (true, par
->forked_block
, par
->fork_insn
);
3562 /* Insert begin and end synchronizations. */
3563 emit_insn_after (nvptx_wsync (false), par
->forked_insn
);
3564 emit_insn_before (nvptx_wsync (true), par
->joining_insn
);
3566 else if (par
->mask
& GOMP_DIM_MASK (GOMP_DIM_VECTOR
))
3567 nvptx_vpropagate (par
->forked_block
, par
->forked_insn
);
3569 /* Now do siblings. */
3571 inner_mask
|= nvptx_process_pars (par
->next
);
3575 /* Neuter the parallel described by PAR. We recurse in depth-first
3576 order. MODES are the partitioning of the execution and OUTER is
3577 the partitioning of the parallels we are contained in. */
3580 nvptx_neuter_pars (parallel
*par
, unsigned modes
, unsigned outer
)
3582 unsigned me
= (par
->mask
3583 & (GOMP_DIM_MASK (GOMP_DIM_WORKER
)
3584 | GOMP_DIM_MASK (GOMP_DIM_VECTOR
)));
3585 unsigned skip_mask
= 0, neuter_mask
= 0;
3588 nvptx_neuter_pars (par
->inner
, modes
, outer
| me
);
3590 for (unsigned mode
= GOMP_DIM_WORKER
; mode
<= GOMP_DIM_VECTOR
; mode
++)
3592 if ((outer
| me
) & GOMP_DIM_MASK (mode
))
3593 {} /* Mode is partitioned: no neutering. */
3594 else if (!(modes
& GOMP_DIM_MASK (mode
)))
3595 {} /* Mode is not used: nothing to do. */
3596 else if (par
->inner_mask
& GOMP_DIM_MASK (mode
)
3597 || !par
->forked_insn
)
3598 /* Partitioned in inner parallels, or we're not a partitioned
3599 at all: neuter individual blocks. */
3600 neuter_mask
|= GOMP_DIM_MASK (mode
);
3601 else if (!par
->parent
|| !par
->parent
->forked_insn
3602 || par
->parent
->inner_mask
& GOMP_DIM_MASK (mode
))
3603 /* Parent isn't a parallel or contains this paralleling: skip
3604 parallel at this level. */
3605 skip_mask
|= GOMP_DIM_MASK (mode
);
3607 {} /* Parent will skip this parallel itself. */
3616 /* Neuter whole SESE regions. */
3617 bb_pair_vec_t regions
;
3619 nvptx_find_sese (par
->blocks
, regions
);
3620 len
= regions
.length ();
3621 for (ix
= 0; ix
!= len
; ix
++)
3623 basic_block from
= regions
[ix
].first
;
3624 basic_block to
= regions
[ix
].second
;
3627 nvptx_single (neuter_mask
, from
, to
);
3634 /* Neuter each BB individually. */
3635 len
= par
->blocks
.length ();
3636 for (ix
= 0; ix
!= len
; ix
++)
3638 basic_block block
= par
->blocks
[ix
];
3640 nvptx_single (neuter_mask
, block
, block
);
3646 nvptx_skip_par (skip_mask
, par
);
3649 nvptx_neuter_pars (par
->next
, modes
, outer
);
3652 /* PTX-specific reorganization
3653 - Split blocks at fork and join instructions
3654 - Compute live registers
3655 - Mark now-unused registers, so function begin doesn't declare
3657 - Insert state propagation when entering partitioned mode
3658 - Insert neutering instructions when in single mode
3659 - Replace subregs with suitable sequences.
3665 /* We are freeing block_for_insn in the toplev to keep compatibility
3666 with old MDEP_REORGS that are not CFG based. Recompute it now. */
3667 compute_bb_for_insn ();
3669 thread_prologue_and_epilogue_insns ();
3671 /* Split blocks and record interesting unspecs. */
3672 bb_insn_map_t bb_insn_map
;
3674 nvptx_split_blocks (&bb_insn_map
);
3676 /* Compute live regs */
3677 df_clear_flags (DF_LR_RUN_DCE
);
3678 df_set_flags (DF_NO_INSN_RESCAN
| DF_NO_HARD_REGS
);
3679 df_live_add_problem ();
3680 df_live_set_all_dirty ();
3682 regstat_init_n_sets_and_refs ();
3685 df_dump (dump_file
);
3687 /* Mark unused regs as unused. */
3688 int max_regs
= max_reg_num ();
3689 for (int i
= LAST_VIRTUAL_REGISTER
+ 1; i
< max_regs
; i
++)
3690 if (REG_N_SETS (i
) == 0 && REG_N_REFS (i
) == 0)
3691 regno_reg_rtx
[i
] = const0_rtx
;
3693 /* Determine launch dimensions of the function. If it is not an
3694 offloaded function (i.e. this is a regular compiler), the
3695 function has no neutering. */
3696 tree attr
= get_oacc_fn_attrib (current_function_decl
);
3699 /* If we determined this mask before RTL expansion, we could
3700 elide emission of some levels of forks and joins. */
3702 tree dims
= TREE_VALUE (attr
);
3705 for (ix
= 0; ix
!= GOMP_DIM_MAX
; ix
++, dims
= TREE_CHAIN (dims
))
3707 int size
= TREE_INT_CST_LOW (TREE_VALUE (dims
));
3708 tree allowed
= TREE_PURPOSE (dims
);
3710 if (size
!= 1 && !(allowed
&& integer_zerop (allowed
)))
3711 mask
|= GOMP_DIM_MASK (ix
);
3713 /* If there is worker neutering, there must be vector
3714 neutering. Otherwise the hardware will fail. */
3715 gcc_assert (!(mask
& GOMP_DIM_MASK (GOMP_DIM_WORKER
))
3716 || (mask
& GOMP_DIM_MASK (GOMP_DIM_VECTOR
)));
3718 /* Discover & process partitioned regions. */
3719 parallel
*pars
= nvptx_discover_pars (&bb_insn_map
);
3720 nvptx_process_pars (pars
);
3721 nvptx_neuter_pars (pars
, mask
, 0);
3725 /* Replace subregs. */
3726 nvptx_reorg_subreg ();
3728 regstat_free_n_sets_and_refs ();
3730 df_finish_pass (true);
3733 /* Handle a "kernel" attribute; arguments as in
3734 struct attribute_spec.handler. */
3737 nvptx_handle_kernel_attribute (tree
*node
, tree name
, tree
ARG_UNUSED (args
),
3738 int ARG_UNUSED (flags
), bool *no_add_attrs
)
3742 if (TREE_CODE (decl
) != FUNCTION_DECL
)
3744 error ("%qE attribute only applies to functions", name
);
3745 *no_add_attrs
= true;
3748 else if (TREE_TYPE (TREE_TYPE (decl
)) != void_type_node
)
3750 error ("%qE attribute requires a void return type", name
);
3751 *no_add_attrs
= true;
3757 /* Table of valid machine attributes. */
3758 static const struct attribute_spec nvptx_attribute_table
[] =
3760 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
3761 affects_type_identity } */
3762 { "kernel", 0, 0, true, false, false, nvptx_handle_kernel_attribute
, false },
3763 { NULL
, 0, 0, false, false, false, NULL
, false }
3766 /* Limit vector alignments to BIGGEST_ALIGNMENT. */
3768 static HOST_WIDE_INT
3769 nvptx_vector_alignment (const_tree type
)
3771 HOST_WIDE_INT align
= tree_to_shwi (TYPE_SIZE (type
));
3773 return MIN (align
, BIGGEST_ALIGNMENT
);
3776 /* Indicate that INSN cannot be duplicated. */
3779 nvptx_cannot_copy_insn_p (rtx_insn
*insn
)
3781 switch (recog_memoized (insn
))
3783 case CODE_FOR_nvptx_shufflesi
:
3784 case CODE_FOR_nvptx_shufflesf
:
3785 case CODE_FOR_nvptx_barsync
:
3786 case CODE_FOR_nvptx_fork
:
3787 case CODE_FOR_nvptx_forked
:
3788 case CODE_FOR_nvptx_joining
:
3789 case CODE_FOR_nvptx_join
:
3796 /* Section anchors do not work. Initialization for flag_section_anchor
3797 probes the existence of the anchoring target hooks and prevents
3798 anchoring if they don't exist. However, we may be being used with
3799 a host-side compiler that does support anchoring, and hence see
3800 the anchor flag set (as it's not recalculated). So provide an
3801 implementation denying anchoring. */
3804 nvptx_use_anchors_for_symbol_p (const_rtx
ARG_UNUSED (a
))
3809 /* Record a symbol for mkoffload to enter into the mapping table. */
3812 nvptx_record_offload_symbol (tree decl
)
3814 switch (TREE_CODE (decl
))
3817 fprintf (asm_out_file
, "//:VAR_MAP \"%s\"\n",
3818 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl
)));
3823 tree attr
= get_oacc_fn_attrib (decl
);
3824 tree dims
= TREE_VALUE (attr
);
3827 fprintf (asm_out_file
, "//:FUNC_MAP \"%s\"",
3828 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl
)));
3830 for (ix
= 0; ix
!= GOMP_DIM_MAX
; ix
++, dims
= TREE_CHAIN (dims
))
3832 int size
= TREE_INT_CST_LOW (TREE_VALUE (dims
));
3834 gcc_assert (!TREE_PURPOSE (dims
));
3835 fprintf (asm_out_file
, ", %#x", size
);
3838 fprintf (asm_out_file
, "\n");
3847 /* Implement TARGET_ASM_FILE_START. Write the kinds of things ptxas expects
3848 at the start of a file. */
3851 nvptx_file_start (void)
3853 fputs ("// BEGIN PREAMBLE\n", asm_out_file
);
3854 fputs ("\t.version\t3.1\n", asm_out_file
);
3855 fputs ("\t.target\tsm_30\n", asm_out_file
);
3856 fprintf (asm_out_file
, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode
));
3857 fputs ("// END PREAMBLE\n", asm_out_file
);
3860 /* Write out the function declarations we've collected and declare storage
3861 for the broadcast buffer. */
3864 nvptx_file_end (void)
3866 hash_table
<tree_hasher
>::iterator iter
;
3868 FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab
, decl
, tree
, iter
)
3869 nvptx_record_fndecl (decl
);
3870 fputs (func_decls
.str().c_str(), asm_out_file
);
3872 if (worker_bcast_size
)
3874 /* Define the broadcast buffer. */
3876 worker_bcast_size
= (worker_bcast_size
+ worker_bcast_align
- 1)
3877 & ~(worker_bcast_align
- 1);
3879 fprintf (asm_out_file
, "\n// BEGIN VAR DEF: %s\n", worker_bcast_name
);
3880 fprintf (asm_out_file
, ".shared .align %d .u8 %s[%d];\n",
3882 worker_bcast_name
, worker_bcast_size
);
3885 if (worker_red_size
)
3887 /* Define the reduction buffer. */
3889 worker_red_size
= ((worker_red_size
+ worker_red_align
- 1)
3890 & ~(worker_red_align
- 1));
3892 fprintf (asm_out_file
, "\n// BEGIN VAR DEF: %s\n", worker_red_name
);
3893 fprintf (asm_out_file
, ".shared .align %d .u8 %s[%d];\n",
3895 worker_red_name
, worker_red_size
);
3899 /* Expander for the shuffle builtins. */
3902 nvptx_expand_shuffle (tree exp
, rtx target
, machine_mode mode
, int ignore
)
3907 rtx src
= expand_expr (CALL_EXPR_ARG (exp
, 0),
3908 NULL_RTX
, mode
, EXPAND_NORMAL
);
3910 src
= copy_to_mode_reg (mode
, src
);
3912 rtx idx
= expand_expr (CALL_EXPR_ARG (exp
, 1),
3913 NULL_RTX
, SImode
, EXPAND_NORMAL
);
3914 rtx op
= expand_expr (CALL_EXPR_ARG (exp
, 2),
3915 NULL_RTX
, SImode
, EXPAND_NORMAL
);
3917 if (!REG_P (idx
) && GET_CODE (idx
) != CONST_INT
)
3918 idx
= copy_to_mode_reg (SImode
, idx
);
3920 rtx pat
= nvptx_gen_shuffle (target
, src
, idx
, INTVAL (op
));
/* Worker reduction address expander.  Arg 0 is the byte offset into
   the buffer, arg 1 the size and arg 2 the alignment of the accessed
   object.  As a side effect, grow the recorded size/alignment of the
   shared-memory reduction buffer so the final buffer definition
   covers every use.  */

static rtx
nvptx_expand_worker_addr (tree exp, rtx target,
			  machine_mode ARG_UNUSED (mode), int ignore)
{
  if (ignore)
    return target;

  unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
  if (align > worker_red_align)
    worker_red_align = align;

  unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
  unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
  if (size + offset > worker_red_size)
    worker_red_size = size + offset;

  emit_insn (gen_rtx_SET (target, worker_red_sym));

  if (offset)
    emit_insn (gen_rtx_SET (target,
			    gen_rtx_PLUS (Pmode, target, GEN_INT (offset))));

  /* Convert the .shared-space address to a generic address.  */
  emit_insn (gen_rtx_SET (target,
			  gen_rtx_UNSPEC (Pmode, gen_rtvec (1, target),
					  UNSPEC_FROM_SHARED)));

  return target;
}
/* Expand the CMP_SWAP PTX builtins.  We have our own versions that do
   not require taking the address of any object, other than the memory
   cell being operated on.

   Arg 0 is the memory address, arg 1 the compare value, arg 2 the new
   value; the old memory contents are returned.  */

static rtx
nvptx_expand_cmp_swap (tree exp, rtx target,
		       machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
{
  machine_mode mode = TYPE_MODE (TREE_TYPE (exp));

  if (!target)
    target = gen_reg_rtx (mode);

  rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
			 NULL_RTX, Pmode, EXPAND_NORMAL);
  rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
			 NULL_RTX, mode, EXPAND_NORMAL);
  rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
			 NULL_RTX, mode, EXPAND_NORMAL);
  rtx pat;

  mem = gen_rtx_MEM (mode, mem);
  if (!REG_P (cmp))
    cmp = copy_to_mode_reg (mode, cmp);
  if (!REG_P (src))
    src = copy_to_mode_reg (mode, src);

  /* Only 32- and 64-bit compare-and-swap exist in PTX.  */
  if (mode == SImode)
    pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
  else
    pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);

  emit_insn (pat);

  return target;
}
/* Codes for all the NVPTX builtins.  */
enum nvptx_builtins
{
  NVPTX_BUILTIN_SHUFFLE,	/* 32-bit warp shuffle.  */
  NVPTX_BUILTIN_SHUFFLELL,	/* 64-bit warp shuffle.  */
  NVPTX_BUILTIN_WORKER_ADDR,	/* Worker reduction buffer address.  */
  NVPTX_BUILTIN_CMP_SWAP,	/* 32-bit compare-and-swap.  */
  NVPTX_BUILTIN_CMP_SWAPLL,	/* 64-bit compare-and-swap.  */
  NVPTX_BUILTIN_MAX
};

/* Decls for the above, indexed by builtin code; GC-rooted.  */
static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];
4009 /* Return the NVPTX builtin for CODE. */
4012 nvptx_builtin_decl (unsigned code
, bool ARG_UNUSED (initialize_p
))
4014 if (code
>= NVPTX_BUILTIN_MAX
)
4015 return error_mark_node
;
4017 return nvptx_builtin_decls
[code
];
/* Set up all builtin functions for this target.  Each DEF registers
   one "__builtin_nvptx_<name>" function; T is the parenthesized
   argument list for build_function_type_list (return type first,
   NULL_TREE terminated).  */

static void
nvptx_init_builtins (void)
{
#define DEF(ID, NAME, T)						\
  (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID]				\
   = add_builtin_function ("__builtin_nvptx_" NAME,			\
			   build_function_type_list T,			\
			   NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))

#define ST sizetype
#define UINT unsigned_type_node
#define LLUINT long_long_unsigned_type_node
#define PTRVOID ptr_type_node

  DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
  DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
  DEF (WORKER_ADDR, "worker_addr",
       (PTRVOID, ST, UINT, UINT, NULL_TREE));
  DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
  DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));

#undef DEF
#undef ST
#undef UINT
#undef LLUINT
#undef PTRVOID
}
/* Expand an expression EXP that calls a built-in function,
   with result going to TARGET if that's convenient
   (and in mode MODE if that's convenient).
   SUBTARGET may be used as the target for computing one of EXP's operands.
   IGNORE is nonzero if the value is to be ignored.  */

static rtx
nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
		      machine_mode mode, int ignore)
{
  /* Dispatch on the builtin code recorded in the called decl.  */
  tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
  switch (DECL_FUNCTION_CODE (fndecl))
    {
    case NVPTX_BUILTIN_SHUFFLE:
    case NVPTX_BUILTIN_SHUFFLELL:
      return nvptx_expand_shuffle (exp, target, mode, ignore);

    case NVPTX_BUILTIN_WORKER_ADDR:
      return nvptx_expand_worker_addr (exp, target, mode, ignore);

    case NVPTX_BUILTIN_CMP_SWAP:
    case NVPTX_BUILTIN_CMP_SWAPLL:
      return nvptx_expand_cmp_swap (exp, target, mode, ignore);

    default: gcc_unreachable ();
    }
}
/* Define dimension sizes for known hardware.  */
#define PTX_VECTOR_LENGTH 32
#define PTX_WORKER_LENGTH 32

/* Validate compute dimensions of an OpenACC offload or routine, fill
   in non-unity defaults.  FN_LEVEL indicates the level at which a
   routine might spawn a loop.  It is negative for non-routines.
   Return true if DIMS was modified.  */

static bool
nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
{
  bool changed = false;

  /* The vector size must be 32, unless this is a SEQ routine.  */
  if (fn_level <= GOMP_DIM_VECTOR
      && dims[GOMP_DIM_VECTOR] != PTX_VECTOR_LENGTH)
    {
      /* Only warn about explicit user settings on offload regions
	 (fn_level < 0); a negative dim means a runtime setting.  */
      if (dims[GOMP_DIM_VECTOR] >= 0 && fn_level < 0)
	warning_at (DECL_SOURCE_LOCATION (decl), 0,
		    dims[GOMP_DIM_VECTOR]
		    ? "using vector_length (%d), ignoring %d"
		    : "using vector_length (%d), ignoring runtime setting",
		    PTX_VECTOR_LENGTH, dims[GOMP_DIM_VECTOR]);
      dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
      changed = true;
    }

  /* Check the num workers is not too large.  */
  if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
    {
      warning_at (DECL_SOURCE_LOCATION (decl), 0,
		  "using num_workers (%d), ignoring %d",
		  PTX_WORKER_LENGTH, dims[GOMP_DIM_WORKER]);
      dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
      changed = true;
    }

  return changed;
}
4117 /* Return maximum dimension size, or zero for unbounded. */
4120 nvptx_dim_limit (int axis
)
4124 case GOMP_DIM_WORKER
:
4125 return PTX_WORKER_LENGTH
;
4127 case GOMP_DIM_VECTOR
:
4128 return PTX_VECTOR_LENGTH
;
/* Determine whether fork & joins are needed.  CALL is the
   IFN_UNIQUE fork/join marker; arg 2 holds the partitioning axis.  */

static bool
nvptx_goacc_fork_join (gcall *call, const int dims[],
		       bool ARG_UNUSED (is_fork))
{
  tree arg = gimple_call_arg (call, 2);
  unsigned axis = TREE_INT_CST_LOW (arg);

  /* We only care about worker and vector partitioning.  */
  if (axis < GOMP_DIM_WORKER)
    return false;

  /* If the size is 1, there's no partitioning.  */
  if (dims[axis] == 1)
    return false;

  return true;
}
/* Generate a PTX builtin function call that returns the address in
   the worker reduction buffer at OFFSET.  TYPE is the type of the
   data at that location.  The builtin also receives the object's
   size and alignment so the buffer can be grown to fit (see
   nvptx_expand_worker_addr).  */

static tree
nvptx_get_worker_red_addr (tree type, tree offset)
{
  machine_mode mode = TYPE_MODE (type);
  tree fndecl = nvptx_builtin_decl (NVPTX_BUILTIN_WORKER_ADDR, true);
  tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
  tree align = build_int_cst (unsigned_type_node,
			      GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
  tree call = build_call_expr (fndecl, 3, offset, size, align);

  /* The builtin returns void *; cast to a pointer to TYPE.  */
  return fold_convert (build_pointer_type (type), call);
}
/* Emit a SHFL.DOWN using index SHFL of VAR into DEST_VAR.  This function
   will cast the variable if necessary.  The shuffle builtins operate on
   32- or 64-bit unsigned integers, so floats are passed through
   VIEW_CONVERT_EXPR and complex values are shuffled part by part.  */

static void
nvptx_generate_vector_shuffle (location_t loc,
			       tree dest_var, tree var, unsigned shift,
			       gimple_seq *seq)
{
  unsigned fn = NVPTX_BUILTIN_SHUFFLE;
  tree_code code = NOP_EXPR;
  tree arg_type = unsigned_type_node;
  tree var_type = TREE_TYPE (var);
  tree dest_type = var_type;

  /* For complex values, shuffle the component type; dest_type keeps
     the full complex type so we can tell the two cases apart below.  */
  if (TREE_CODE (var_type) == COMPLEX_TYPE)
    var_type = TREE_TYPE (var_type);

  /* Floats are bit-cast, not value-converted, into the integer arg.  */
  if (TREE_CODE (var_type) == REAL_TYPE)
    code = VIEW_CONVERT_EXPR;

  if (TYPE_SIZE (var_type)
      == TYPE_SIZE (long_long_unsigned_type_node))
    {
      fn = NVPTX_BUILTIN_SHUFFLELL;
      arg_type = long_long_unsigned_type_node;
    }

  tree call = nvptx_builtin_decl (fn, true);
  tree bits = build_int_cst (unsigned_type_node, shift);
  tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
  tree expr;

  if (var_type != dest_type)
    {
      /* Do real and imaginary parts separately.  */
      tree real = fold_build1 (REALPART_EXPR, var_type, var);
      real = fold_build1 (code, arg_type, real);
      real = build_call_expr_loc (loc, call, 3, real, bits, kind);
      real = fold_build1 (code, var_type, real);

      tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
      imag = fold_build1 (code, arg_type, imag);
      imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
      imag = fold_build1 (code, var_type, imag);

      expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
    }
  else
    {
      expr = fold_build1 (code, arg_type, var);
      expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
      expr = fold_build1 (code, dest_type, expr);
    }

  gimplify_assign (dest_var, expr, seq);
}
/* Lazily generate the global lock var decl and return its address.
   The decl is created once and cached in global_lock_var; it is an
   external, volatile unsigned int shared by all offloaded code.  */

static tree
nvptx_global_lock_addr ()
{
  tree v = global_lock_var;

  if (!v)
    {
      tree name = get_identifier ("__reduction_lock");
      /* Volatile so every access really hits memory.  */
      tree type = build_qualified_type (unsigned_type_node,
					TYPE_QUAL_VOLATILE);
      v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
      global_lock_var = v;
      DECL_ARTIFICIAL (v) = 1;
      DECL_EXTERNAL (v) = 1;
      TREE_STATIC (v) = 1;
      TREE_PUBLIC (v) = 1;
      TREE_USED (v) = 1;
      mark_addressable (v);
      mark_decl_referenced (v);
    }

  return build_fold_addr_expr (v);
}
/* Insert code to locklessly update *PTR with *PTR OP VAR just before
   GSI.  We use a lockless scheme for nearly all case, which looks
   like:
     actual = initval(OP);
     do {
       guess = actual;
       write = guess OP myval;
       actual = cmp&swap (ptr, guess, write)
     } while (actual bit-different-to guess);
   return write;

   This relies on a cmp&swap instruction, which is available for 32-
   and 64-bit types.  Larger types must use a locking scheme.  */

static tree
nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
		       tree ptr, tree var, tree_code op)
{
  unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
  tree_code code = NOP_EXPR;
  tree arg_type = unsigned_type_node;
  tree var_type = TREE_TYPE (var);

  /* Floats (and complex floats) are bit-cast into the integer
     argument type of the cmp&swap builtin.  */
  if (TREE_CODE (var_type) == COMPLEX_TYPE
      || TREE_CODE (var_type) == REAL_TYPE)
    code = VIEW_CONVERT_EXPR;

  if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
    {
      arg_type = long_long_unsigned_type_node;
      fn = NVPTX_BUILTIN_CMP_SWAPLL;
    }

  tree swap_fn = nvptx_builtin_decl (fn, true);

  /* Build and insert the initialization sequence.  */
  gimple_seq init_seq = NULL;
  tree init_var = make_ssa_name (arg_type);
  tree init_expr = omp_reduction_init_op (loc, op, var_type);
  init_expr = fold_build1 (code, arg_type, init_expr);
  gimplify_assign (init_var, init_expr, &init_seq);
  gimple *init_end = gimple_seq_last (init_seq);

  gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);

  /* Split the block just after the init stmts.  */
  basic_block pre_bb = gsi_bb (*gsi);
  edge pre_edge = split_block (pre_bb, init_end);
  basic_block loop_bb = pre_edge->dest;
  pre_bb = pre_edge->src;
  /* Reset the iterator.  */
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  tree expect_var = make_ssa_name (arg_type);
  tree actual_var = make_ssa_name (arg_type);
  tree write_var = make_ssa_name (arg_type);

  /* Build and insert the reduction calculation.  */
  gimple_seq red_seq = NULL;
  tree write_expr = fold_build1 (code, var_type, expect_var);
  write_expr = fold_build2 (op, var_type, write_expr, var);
  write_expr = fold_build1 (code, arg_type, write_expr);
  gimplify_assign (write_var, write_expr, &red_seq);

  gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);

  /* Build & insert the cmp&swap sequence.  */
  gimple_seq latch_seq = NULL;
  tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
					ptr, expect_var, write_var);
  gimplify_assign (actual_var, swap_expr, &latch_seq);

  /* Loop back while the swap did not observe the expected value.  */
  gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
				   NULL_TREE, NULL_TREE);
  gimple_seq_add_stmt (&latch_seq, cond);

  gimple *latch_end = gimple_seq_last (latch_seq);
  gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);

  /* Split the block just after the latch stmts.  */
  edge post_edge = split_block (loop_bb, latch_end);
  basic_block post_bb = post_edge->dest;
  loop_bb = post_edge->src;
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  /* Turn the fallthrough edge into the loop-exit (true) edge and add
     the backedge for the retry loop.  */
  post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
  edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
  set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
  set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);

  /* EXPECT_VAR is the initial value on entry, the prior cmp&swap
     result on each retry.  */
  gphi *phi = create_phi_node (expect_var, loop_bb);
  add_phi_arg (phi, init_var, pre_edge, loc);
  add_phi_arg (phi, actual_var, loop_edge, loc);

  /* Register the new single-block loop with the loop tree.  */
  loop *loop = alloc_loop ();
  loop->header = loop_bb;
  loop->latch = loop_bb;
  add_loop (loop, loop_bb->loop_father);

  /* Return the value written on the final, successful, iteration.  */
  return fold_build1 (code, var_type, write_var);
}
/* Insert code to lockfully update *PTR with *PTR OP VAR just before
   GSI.  This is necessary for types larger than 64 bits, where there
   is no cmp&swap instruction to implement a lockless scheme.  We use
   a lock variable in global memory.

     while (cmp&swap (&lock_var, 0, 1))
       continue;
     accum = *ptr;
     accum = accum OP var;
     *ptr = accum;
     cmp&swap (&lock_var, 1, 0);
     return accum;

   A lock in global memory is necessary to force execution engine
   descheduling and avoid resource starvation that can occur if the
   lock is in .shared memory.  */

static tree
nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
		       tree ptr, tree var, tree_code op)
{
  tree var_type = TREE_TYPE (var);
  tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
  tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
  tree uns_locked = build_int_cst (unsigned_type_node, 1);

  /* Split the block just before the gsi.  Insert a gimple nop to make
     splitting easy.  */
  gimple *nop = gimple_build_nop ();
  gsi_insert_before (gsi, nop, GSI_SAME_STMT);
  basic_block entry_bb = gsi_bb (*gsi);
  edge entry_edge = split_block (entry_bb, nop);
  basic_block lock_bb = entry_edge->dest;
  /* Reset the iterator.  */
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  /* Build and insert the locking sequence.  */
  gimple_seq lock_seq = NULL;
  tree lock_var = make_ssa_name (unsigned_type_node);
  tree lock_expr = nvptx_global_lock_addr ();
  lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
				   uns_unlocked, uns_locked);
  gimplify_assign (lock_var, lock_expr, &lock_seq);
  /* Exit the spin loop once the cmp&swap saw the lock unlocked.  */
  gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
				   NULL_TREE, NULL_TREE);
  gimple_seq_add_stmt (&lock_seq, cond);
  gimple *lock_end = gimple_seq_last (lock_seq);
  gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);

  /* Split the block just after the lock sequence.  */
  edge locked_edge = split_block (lock_bb, lock_end);
  basic_block update_bb = locked_edge->dest;
  lock_bb = locked_edge->src;
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  /* Create the lock loop ... */
  locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
  make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
  set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
  set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);

  /* ... and the loop structure.  */
  loop *lock_loop = alloc_loop ();
  lock_loop->header = lock_bb;
  lock_loop->latch = lock_bb;
  lock_loop->nb_iterations_estimate = 1;
  lock_loop->any_estimate = true;
  add_loop (lock_loop, entry_bb->loop_father);

  /* Build and insert the reduction calculation.  */
  gimple_seq red_seq = NULL;
  tree acc_in = make_ssa_name (var_type);
  tree ref_in = build_simple_mem_ref (ptr);
  TREE_THIS_VOLATILE (ref_in) = 1;
  gimplify_assign (acc_in, ref_in, &red_seq);

  tree acc_out = make_ssa_name (var_type);
  tree update_expr = fold_build2 (op, var_type, ref_in, var);
  gimplify_assign (acc_out, update_expr, &red_seq);

  /* Volatile store of the updated accumulator back through PTR.  */
  tree ref_out = build_simple_mem_ref (ptr);
  TREE_THIS_VOLATILE (ref_out) = 1;
  gimplify_assign (ref_out, acc_out, &red_seq);

  gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);

  /* Build & insert the unlock sequence.  */
  gimple_seq unlock_seq = NULL;
  tree unlock_expr = nvptx_global_lock_addr ();
  unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
				     uns_locked, uns_unlocked);
  gimplify_and_add (unlock_expr, &unlock_seq);
  gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);

  return acc_out;
}
4454 /* Emit a sequence to update a reduction accumlator at *PTR with the
4455 value held in VAR using operator OP. Return the updated value.
4457 TODO: optimize for atomic ops and indepedent complex ops. */
4460 nvptx_reduction_update (location_t loc
, gimple_stmt_iterator
*gsi
,
4461 tree ptr
, tree var
, tree_code op
)
4463 tree type
= TREE_TYPE (var
);
4464 tree size
= TYPE_SIZE (type
);
4466 if (size
== TYPE_SIZE (unsigned_type_node
)
4467 || size
== TYPE_SIZE (long_long_unsigned_type_node
))
4468 return nvptx_lockless_update (loc
, gsi
, ptr
, var
, op
);
4470 return nvptx_lockfull_update (loc
, gsi
, ptr
, var
, op
);
/* NVPTX implementation of GOACC_REDUCTION_SETUP.  Propagate the
   incoming value (or the receiver object's contents) and, for worker
   level, also store it into the shared worker reduction buffer.  */

static void
nvptx_goacc_reduction_setup (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  gimple_seq seq = NULL;

  push_gimplify_context (true);

  if (level != GOMP_DIM_GANG)
    {
      /* Copy the receiver object.  */
      tree ref_to_res = gimple_call_arg (call, 1);

      if (!integer_zerop (ref_to_res))
	var = build_simple_mem_ref (ref_to_res);
    }

  if (level == GOMP_DIM_WORKER)
    {
      /* Store incoming value to worker reduction buffer.  */
      tree offset = gimple_call_arg (call, 5);
      tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
      tree ptr = make_ssa_name (TREE_TYPE (call));

      gimplify_assign (ptr, call, &seq);
      tree ref = build_simple_mem_ref (ptr);
      TREE_THIS_VOLATILE (ref) = 1;
      gimplify_assign (ref, var, &seq);
    }

  if (lhs)
    gimplify_assign (lhs, var, &seq);

  pop_gimplify_context (NULL);
  gsi_replace_with_seq (&gsi, seq, true);
}
/* NVPTX implementation of GOACC_REDUCTION_INIT.  At vector level only
   lane zero keeps the incoming value; other lanes get the operator's
   identity value, selected by a branch on the lane id.  */

static void
nvptx_goacc_reduction_init (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  enum tree_code rcode
    = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
  tree init = omp_reduction_init_op (gimple_location (call), rcode,
				     TREE_TYPE (var));
  gimple_seq seq = NULL;

  push_gimplify_context (true);

  if (level == GOMP_DIM_VECTOR)
    {
      /* Initialize vector-non-zeroes to INIT_VAL (OP).  */
      tree tid = make_ssa_name (integer_type_node);
      tree dim_vector = gimple_call_arg (call, 3);
      gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
						     dim_vector);
      gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
					     NULL_TREE, NULL_TREE);

      gimple_call_set_lhs (tid_call, tid);
      gimple_seq_add_stmt (&seq, tid_call);
      gimple_seq_add_stmt (&seq, cond_stmt);

      /* Split the block just after the call.  */
      edge init_edge = split_block (gsi_bb (gsi), call);
      basic_block init_bb = init_edge->dest;
      basic_block call_bb = init_edge->src;

      /* Fixup flags from call_bb to init_bb.  */
      init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;

      /* Set the initialization stmts.  */
      gimple_seq init_seq = NULL;
      tree init_var = make_ssa_name (TREE_TYPE (var));
      gimplify_assign (init_var, init, &init_seq);
      gsi = gsi_start_bb (init_bb);
      gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);

      /* Split block just after the init stmt.  */
      gsi_prev (&gsi);
      edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
      basic_block dst_bb = inited_edge->dest;

      /* Create false edge from call_bb to dst_bb.  */
      edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);

      /* Create phi node in dst block.  */
      gphi *phi = create_phi_node (lhs, dst_bb);
      add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
      add_phi_arg (phi, var, nop_edge, gimple_location (call));

      /* Reset dominator of dst bb.  */
      set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);

      /* Reset the gsi.  */
      gsi = gsi_for_stmt (call);
    }
  else
    {
      if (level == GOMP_DIM_GANG)
	{
	  /* If there's no receiver object, propagate the incoming VAR.  */
	  tree ref_to_res = gimple_call_arg (call, 1);
	  if (integer_zerop (ref_to_res))
	    init = var;
	}

      gimplify_assign (lhs, init, &seq);
    }

  pop_gimplify_context (NULL);
  gsi_replace_with_seq (&gsi, seq, true);
}
4597 /* NVPTX implementation of GOACC_REDUCTION_FINI. */
4600 nvptx_goacc_reduction_fini (gcall
*call
)
4602 gimple_stmt_iterator gsi
= gsi_for_stmt (call
);
4603 tree lhs
= gimple_call_lhs (call
);
4604 tree ref_to_res
= gimple_call_arg (call
, 1);
4605 tree var
= gimple_call_arg (call
, 2);
4606 int level
= TREE_INT_CST_LOW (gimple_call_arg (call
, 3));
4608 = (enum tree_code
)TREE_INT_CST_LOW (gimple_call_arg (call
, 4));
4609 gimple_seq seq
= NULL
;
4610 tree r
= NULL_TREE
;;
4612 push_gimplify_context (true);
4614 if (level
== GOMP_DIM_VECTOR
)
4616 /* Emit binary shuffle tree. TODO. Emit this as an actual loop,
4617 but that requires a method of emitting a unified jump at the
4619 for (int shfl
= PTX_VECTOR_LENGTH
/ 2; shfl
> 0; shfl
= shfl
>> 1)
4621 tree other_var
= make_ssa_name (TREE_TYPE (var
));
4622 nvptx_generate_vector_shuffle (gimple_location (call
),
4623 other_var
, var
, shfl
, &seq
);
4625 r
= make_ssa_name (TREE_TYPE (var
));
4626 gimplify_assign (r
, fold_build2 (op
, TREE_TYPE (var
),
4627 var
, other_var
), &seq
);
4633 tree accum
= NULL_TREE
;
4635 if (level
== GOMP_DIM_WORKER
)
4637 /* Get reduction buffer address. */
4638 tree offset
= gimple_call_arg (call
, 5);
4639 tree call
= nvptx_get_worker_red_addr (TREE_TYPE (var
), offset
);
4640 tree ptr
= make_ssa_name (TREE_TYPE (call
));
4642 gimplify_assign (ptr
, call
, &seq
);
4645 else if (integer_zerop (ref_to_res
))
4652 /* UPDATE the accumulator. */
4653 gsi_insert_seq_before (&gsi
, seq
, GSI_SAME_STMT
);
4655 r
= nvptx_reduction_update (gimple_location (call
), &gsi
,
4661 gimplify_assign (lhs
, r
, &seq
);
4662 pop_gimplify_context (NULL
);
4664 gsi_replace_with_seq (&gsi
, seq
, true);
4667 /* NVPTX implementation of GOACC_REDUCTION_TEARDOWN. */
4670 nvptx_goacc_reduction_teardown (gcall
*call
)
4672 gimple_stmt_iterator gsi
= gsi_for_stmt (call
);
4673 tree lhs
= gimple_call_lhs (call
);
4674 tree var
= gimple_call_arg (call
, 2);
4675 int level
= TREE_INT_CST_LOW (gimple_call_arg (call
, 3));
4676 gimple_seq seq
= NULL
;
4678 push_gimplify_context (true);
4679 if (level
== GOMP_DIM_WORKER
)
4681 /* Read the worker reduction buffer. */
4682 tree offset
= gimple_call_arg (call
, 5);
4683 tree call
= nvptx_get_worker_red_addr(TREE_TYPE (var
), offset
);
4684 tree ptr
= make_ssa_name (TREE_TYPE (call
));
4686 gimplify_assign (ptr
, call
, &seq
);
4687 var
= build_simple_mem_ref (ptr
);
4688 TREE_THIS_VOLATILE (var
) = 1;
4691 if (level
!= GOMP_DIM_GANG
)
4693 /* Write to the receiver object. */
4694 tree ref_to_res
= gimple_call_arg (call
, 1);
4696 if (!integer_zerop (ref_to_res
))
4697 gimplify_assign (build_simple_mem_ref (ref_to_res
), var
, &seq
);
4701 gimplify_assign (lhs
, var
, &seq
);
4703 pop_gimplify_context (NULL
);
4705 gsi_replace_with_seq (&gsi
, seq
, true);
/* NVPTX reduction expander.  Dispatch on the reduction phase encoded
   in arg 0 of the IFN_GOACC_REDUCTION call.  */

static void
nvptx_goacc_reduction (gcall *call)
{
  unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));

  switch (code)
    {
    case IFN_GOACC_REDUCTION_SETUP:
      nvptx_goacc_reduction_setup (call);
      break;

    case IFN_GOACC_REDUCTION_INIT:
      nvptx_goacc_reduction_init (call);
      break;

    case IFN_GOACC_REDUCTION_FINI:
      nvptx_goacc_reduction_fini (call);
      break;

    case IFN_GOACC_REDUCTION_TEARDOWN:
      nvptx_goacc_reduction_teardown (call);
      break;

    default:
      gcc_unreachable ();
    }
}
/* Target hook vector.  Each #undef/#define pair installs this back
   end's implementation of the corresponding hook; TARGET_INITIALIZER
   below picks them all up.  */

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE nvptx_option_override

#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p

#undef TARGET_PROMOTE_FUNCTION_MODE
#define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode

/* Calling conventions.  */
#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG nvptx_function_arg
#undef TARGET_FUNCTION_INCOMING_ARG
#define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
#undef TARGET_FUNCTION_ARG_ROUND_BOUNDARY
#define TARGET_FUNCTION_ARG_ROUND_BOUNDARY nvptx_function_arg_boundary
#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE nvptx_function_value
#undef TARGET_LIBCALL_VALUE
#define TARGET_LIBCALL_VALUE nvptx_libcall_value
#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
#undef TARGET_GET_DRAP_RTX
#define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
#undef TARGET_SPLIT_COMPLEX_ARG
#define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true
#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
#undef TARGET_STATIC_CHAIN
#define TARGET_STATIC_CHAIN nvptx_static_chain

#undef TARGET_CALL_ARGS
#define TARGET_CALL_ARGS nvptx_call_args
#undef TARGET_END_CALL_ARGS
#define TARGET_END_CALL_ARGS nvptx_end_call_args

/* Assembly output.  */
#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START nvptx_file_start
#undef TARGET_ASM_FILE_END
#define TARGET_ASM_FILE_END nvptx_file_end
#undef TARGET_ASM_GLOBALIZE_LABEL
#define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
#undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
#define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND nvptx_print_operand
#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
#undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
#define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
#undef TARGET_ASM_INTEGER
#define TARGET_ASM_INTEGER nvptx_assemble_integer
#undef TARGET_ASM_DECL_END
#define TARGET_ASM_DECL_END nvptx_assemble_decl_end
#undef TARGET_ASM_DECLARE_CONSTANT_NAME
#define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
#undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
#define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true

#undef TARGET_MACHINE_DEPENDENT_REORG
#define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
#undef TARGET_NO_REGISTER_ALLOCATION
#define TARGET_NO_REGISTER_ALLOCATION true

#undef TARGET_RECORD_OFFLOAD_SYMBOL
#define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment

#undef TARGET_CANNOT_COPY_INSN_P
#define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p

#undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
#define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p

/* Builtins.  */
#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS nvptx_init_builtins
#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL nvptx_builtin_decl

/* OpenACC hooks.  */
#undef TARGET_GOACC_VALIDATE_DIMS
#define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims

#undef TARGET_GOACC_DIM_LIMIT
#define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit

#undef TARGET_GOACC_FORK_JOIN
#define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join

#undef TARGET_GOACC_REDUCTION
#define TARGET_GOACC_REDUCTION nvptx_goacc_reduction

struct gcc_target targetm = TARGET_INITIALIZER;
4851 #include "gt-nvptx.h"