1 /* Target code for NVPTX.
2 Copyright (C) 2014-2018 Free Software Foundation, Inc.
3 Contributed by Bernd Schmidt <bernds@codesourcery.com>
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published
9 by the Free Software Foundation; either version 3, or (at your
10 option) any later version.
12 GCC is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
15 License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
26 #include "coretypes.h"
40 #include "diagnostic.h"
42 #include "insn-flags.h"
44 #include "insn-attr.h"
53 #include "tm-constrs.h"
54 #include "langhooks.h"
58 #include "stor-layout.h"
60 #include "omp-general.h"
62 #include "gomp-constants.h"
64 #include "internal-fn.h"
65 #include "gimple-iterator.h"
66 #include "stringpool.h"
69 #include "tree-ssa-operands.h"
70 #include "tree-ssanames.h"
72 #include "tree-phinodes.h"
74 #include "fold-const.h"
77 /* This file should be included last. */
78 #include "target-def.h"
#define WORKAROUND_PTXJIT_BUG 1
#define WORKAROUND_PTXJIT_BUG_2 1

/* The various PTX memory areas an object might reside in.  */
enum nvptx_data_area
{
  DATA_AREA_GENERIC,
  DATA_AREA_GLOBAL,
  DATA_AREA_SHARED,
  DATA_AREA_LOCAL,
  DATA_AREA_CONST,
  DATA_AREA_PARAM,
  DATA_AREA_MAX
};

/* We record the data area in the target symbol flags.  */
#define SYMBOL_DATA_AREA(SYM) \
  (nvptx_data_area) ((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \
		     & 7)
#define SET_SYMBOL_DATA_AREA(SYM,AREA) \
  (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)
102 /* Record the function decls we've written, and the libfuncs and function
103 decls corresponding to them. */
104 static std::stringstream func_decls
;
106 struct declared_libfunc_hasher
: ggc_cache_ptr_hash
<rtx_def
>
108 static hashval_t
hash (rtx x
) { return htab_hash_pointer (x
); }
109 static bool equal (rtx a
, rtx b
) { return a
== b
; }
113 hash_table
<declared_libfunc_hasher
> *declared_libfuncs_htab
;
115 struct tree_hasher
: ggc_cache_ptr_hash
<tree_node
>
117 static hashval_t
hash (tree t
) { return htab_hash_pointer (t
); }
118 static bool equal (tree a
, tree b
) { return a
== b
; }
121 static GTY((cache
)) hash_table
<tree_hasher
> *declared_fndecls_htab
;
122 static GTY((cache
)) hash_table
<tree_hasher
> *needed_fndecls_htab
;
124 /* Buffer needed to broadcast across workers. This is used for both
125 worker-neutering and worker broadcasting. It is shared by all
126 functions emitted. The buffer is placed in shared memory. It'd be
127 nice if PTX supported common blocks, because then this could be
128 shared across TUs (taking the largest size). */
129 static unsigned worker_bcast_size
;
130 static unsigned worker_bcast_align
;
131 static GTY(()) rtx worker_bcast_sym
;
133 /* Buffer needed for worker reductions. This has to be distinct from
134 the worker broadcast array, as both may be live concurrently. */
135 static unsigned worker_red_size
;
136 static unsigned worker_red_align
;
137 static GTY(()) rtx worker_red_sym
;
139 /* Global lock variable, needed for 128bit worker & gang reductions. */
140 static GTY(()) tree global_lock_var
;
142 /* True if any function references __nvptx_stacks. */
143 static bool need_softstack_decl
;
145 /* True if any function references __nvptx_uni. */
146 static bool need_unisimt_decl
;
148 /* Allocate a new, cleared machine_function structure. */
150 static struct machine_function
*
151 nvptx_init_machine_status (void)
153 struct machine_function
*p
= ggc_cleared_alloc
<machine_function
> ();
154 p
->return_mode
= VOIDmode
;
158 /* Issue a diagnostic when option OPTNAME is enabled (as indicated by OPTVAL)
159 and -fopenacc is also enabled. */
162 diagnose_openacc_conflict (bool optval
, const char *optname
)
164 if (flag_openacc
&& optval
)
165 error ("option %s is not supported together with -fopenacc", optname
);
168 /* Implement TARGET_OPTION_OVERRIDE. */
171 nvptx_option_override (void)
173 init_machine_status
= nvptx_init_machine_status
;
175 /* Set toplevel_reorder, unless explicitly disabled. We need
176 reordering so that we emit necessary assembler decls of
177 undeclared variables. */
178 if (!global_options_set
.x_flag_toplevel_reorder
)
179 flag_toplevel_reorder
= 1;
181 debug_nonbind_markers_p
= 0;
183 /* Set flag_no_common, unless explicitly disabled. We fake common
184 using .weak, and that's not entirely accurate, so avoid it
186 if (!global_options_set
.x_flag_no_common
)
189 /* The patch area requires nops, which we don't have. */
190 if (function_entry_patch_area_size
> 0)
191 sorry ("not generating patch area, nops not supported");
193 /* Assumes that it will see only hard registers. */
194 flag_var_tracking
= 0;
196 if (nvptx_optimize
< 0)
197 nvptx_optimize
= optimize
> 0;
199 declared_fndecls_htab
= hash_table
<tree_hasher
>::create_ggc (17);
200 needed_fndecls_htab
= hash_table
<tree_hasher
>::create_ggc (17);
201 declared_libfuncs_htab
202 = hash_table
<declared_libfunc_hasher
>::create_ggc (17);
204 worker_bcast_sym
= gen_rtx_SYMBOL_REF (Pmode
, "__worker_bcast");
205 SET_SYMBOL_DATA_AREA (worker_bcast_sym
, DATA_AREA_SHARED
);
206 worker_bcast_align
= GET_MODE_ALIGNMENT (SImode
) / BITS_PER_UNIT
;
208 worker_red_sym
= gen_rtx_SYMBOL_REF (Pmode
, "__worker_red");
209 SET_SYMBOL_DATA_AREA (worker_red_sym
, DATA_AREA_SHARED
);
210 worker_red_align
= GET_MODE_ALIGNMENT (SImode
) / BITS_PER_UNIT
;
212 diagnose_openacc_conflict (TARGET_GOMP
, "-mgomp");
213 diagnose_openacc_conflict (TARGET_SOFT_STACK
, "-msoft-stack");
214 diagnose_openacc_conflict (TARGET_UNIFORM_SIMT
, "-muniform-simt");
217 target_flags
|= MASK_SOFT_STACK
| MASK_UNIFORM_SIMT
;
220 /* Return a ptx type for MODE. If PROMOTE, then use .u32 for QImode to
221 deal with ptx ideosyncracies. */
224 nvptx_ptx_type_from_mode (machine_mode mode
, bool promote
)
259 /* Encode the PTX data area that DECL (which might not actually be a
260 _DECL) should reside in. */
263 nvptx_encode_section_info (tree decl
, rtx rtl
, int first
)
265 default_encode_section_info (decl
, rtl
, first
);
266 if (first
&& MEM_P (rtl
))
268 nvptx_data_area area
= DATA_AREA_GENERIC
;
270 if (TREE_CONSTANT (decl
))
271 area
= DATA_AREA_CONST
;
272 else if (TREE_CODE (decl
) == VAR_DECL
)
274 if (lookup_attribute ("shared", DECL_ATTRIBUTES (decl
)))
276 area
= DATA_AREA_SHARED
;
277 if (DECL_INITIAL (decl
))
278 error ("static initialization of variable %q+D in %<.shared%>"
279 " memory is not supported", decl
);
282 area
= TREE_READONLY (decl
) ? DATA_AREA_CONST
: DATA_AREA_GLOBAL
;
285 SET_SYMBOL_DATA_AREA (XEXP (rtl
, 0), area
);
289 /* Return the PTX name of the data area in which SYM should be
290 placed. The symbol must have already been processed by
291 nvptx_encode_seciton_info, or equivalent. */
294 section_for_sym (rtx sym
)
296 nvptx_data_area area
= SYMBOL_DATA_AREA (sym
);
297 /* Same order as nvptx_data_area enum. */
298 static char const *const areas
[] =
299 {"", ".global", ".shared", ".local", ".const", ".param"};
304 /* Similarly for a decl. */
307 section_for_decl (const_tree decl
)
309 return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree
, decl
)), 0));
/* Check NAME for special function names and redirect them by returning a
   replacement.  This applies to malloc, free and realloc, for which we
   want to use libgcc wrappers, and call, which triggers a bug in
   ptxas.  We can't use TARGET_MANGLE_DECL_ASSEMBLER_NAME, as that's
   not active in an offload compiler -- the names are all set by the
   host-side compiler.  */

static const char *
nvptx_name_replacement (const char *name)
{
  if (strcmp (name, "call") == 0)
    return "__nvptx_call";
  if (strcmp (name, "malloc") == 0)
    return "__nvptx_malloc";
  if (strcmp (name, "free") == 0)
    return "__nvptx_free";
  if (strcmp (name, "realloc") == 0)
    return "__nvptx_realloc";
  /* Any other name passes through unchanged.  */
  return name;
}
333 /* If MODE should be treated as two registers of an inner mode, return
334 that inner mode. Otherwise return VOIDmode. */
337 maybe_split_mode (machine_mode mode
)
339 if (COMPLEX_MODE_P (mode
))
340 return GET_MODE_INNER (mode
);
348 /* Return true if mode should be treated as two registers. */
351 split_mode_p (machine_mode mode
)
353 return maybe_split_mode (mode
) != VOIDmode
;
356 /* Output a register, subreg, or register pair (with optional
357 enclosing braces). */
360 output_reg (FILE *file
, unsigned regno
, machine_mode inner_mode
,
361 int subreg_offset
= -1)
363 if (inner_mode
== VOIDmode
)
365 if (HARD_REGISTER_NUM_P (regno
))
366 fprintf (file
, "%s", reg_names
[regno
]);
368 fprintf (file
, "%%r%d", regno
);
370 else if (subreg_offset
>= 0)
372 output_reg (file
, regno
, VOIDmode
);
373 fprintf (file
, "$%d", subreg_offset
);
377 if (subreg_offset
== -1)
379 output_reg (file
, regno
, inner_mode
, GET_MODE_SIZE (inner_mode
));
381 output_reg (file
, regno
, inner_mode
, 0);
382 if (subreg_offset
== -1)
387 /* Emit forking instructions for MASK. */
390 nvptx_emit_forking (unsigned mask
, bool is_call
)
392 mask
&= (GOMP_DIM_MASK (GOMP_DIM_WORKER
)
393 | GOMP_DIM_MASK (GOMP_DIM_VECTOR
));
396 rtx op
= GEN_INT (mask
| (is_call
<< GOMP_DIM_MAX
));
398 /* Emit fork at all levels. This helps form SESE regions, as
399 it creates a block with a single successor before entering a
400 partitooned region. That is a good candidate for the end of
403 emit_insn (gen_nvptx_fork (op
));
404 emit_insn (gen_nvptx_forked (op
));
408 /* Emit joining instructions for MASK. */
411 nvptx_emit_joining (unsigned mask
, bool is_call
)
413 mask
&= (GOMP_DIM_MASK (GOMP_DIM_WORKER
)
414 | GOMP_DIM_MASK (GOMP_DIM_VECTOR
));
417 rtx op
= GEN_INT (mask
| (is_call
<< GOMP_DIM_MAX
));
419 /* Emit joining for all non-call pars to ensure there's a single
420 predecessor for the block the join insn ends up in. This is
421 needed for skipping entire loops. */
423 emit_insn (gen_nvptx_joining (op
));
424 emit_insn (gen_nvptx_join (op
));
429 /* Determine whether MODE and TYPE (possibly NULL) should be passed or
430 returned in memory. Integer and floating types supported by the
431 machine are passed in registers, everything else is passed in
432 memory. Complex types are split. */
435 pass_in_memory (machine_mode mode
, const_tree type
, bool for_return
)
439 if (AGGREGATE_TYPE_P (type
))
441 if (TREE_CODE (type
) == VECTOR_TYPE
)
445 if (!for_return
&& COMPLEX_MODE_P (mode
))
446 /* Complex types are passed as two underlying args. */
447 mode
= GET_MODE_INNER (mode
);
449 if (GET_MODE_CLASS (mode
) != MODE_INT
450 && GET_MODE_CLASS (mode
) != MODE_FLOAT
)
453 if (GET_MODE_SIZE (mode
) > UNITS_PER_WORD
)
459 /* A non-memory argument of mode MODE is being passed, determine the mode it
460 should be promoted to. This is also used for determining return
464 promote_arg (machine_mode mode
, bool prototyped
)
466 if (!prototyped
&& mode
== SFmode
)
467 /* K&R float promotion for unprototyped functions. */
469 else if (GET_MODE_SIZE (mode
) < GET_MODE_SIZE (SImode
))
475 /* A non-memory return type of MODE is being returned. Determine the
476 mode it should be promoted to. */
479 promote_return (machine_mode mode
)
481 return promote_arg (mode
, true);
484 /* Implement TARGET_FUNCTION_ARG. */
487 nvptx_function_arg (cumulative_args_t
ARG_UNUSED (cum_v
), machine_mode mode
,
488 const_tree
, bool named
)
490 if (mode
== VOIDmode
|| !named
)
493 return gen_reg_rtx (mode
);
496 /* Implement TARGET_FUNCTION_INCOMING_ARG. */
499 nvptx_function_incoming_arg (cumulative_args_t cum_v
, machine_mode mode
,
500 const_tree
, bool named
)
502 CUMULATIVE_ARGS
*cum
= get_cumulative_args (cum_v
);
504 if (mode
== VOIDmode
|| !named
)
507 /* No need to deal with split modes here, the only case that can
508 happen is complex modes and those are dealt with by
509 TARGET_SPLIT_COMPLEX_ARG. */
510 return gen_rtx_UNSPEC (mode
,
511 gen_rtvec (1, GEN_INT (cum
->count
)),
515 /* Implement TARGET_FUNCTION_ARG_ADVANCE. */
518 nvptx_function_arg_advance (cumulative_args_t cum_v
,
519 machine_mode
ARG_UNUSED (mode
),
520 const_tree
ARG_UNUSED (type
),
521 bool ARG_UNUSED (named
))
523 CUMULATIVE_ARGS
*cum
= get_cumulative_args (cum_v
);
528 /* Implement TARGET_FUNCTION_ARG_BOUNDARY.
530 For nvptx This is only used for varadic args. The type has already
531 been promoted and/or converted to invisible reference. */
534 nvptx_function_arg_boundary (machine_mode mode
, const_tree
ARG_UNUSED (type
))
536 return GET_MODE_ALIGNMENT (mode
);
539 /* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.
541 For nvptx, we know how to handle functions declared as stdarg: by
542 passing an extra pointer to the unnamed arguments. However, the
543 Fortran frontend can produce a different situation, where a
544 function pointer is declared with no arguments, but the actual
545 function and calls to it take more arguments. In that case, we
546 want to ensure the call matches the definition of the function. */
549 nvptx_strict_argument_naming (cumulative_args_t cum_v
)
551 CUMULATIVE_ARGS
*cum
= get_cumulative_args (cum_v
);
553 return cum
->fntype
== NULL_TREE
|| stdarg_p (cum
->fntype
);
556 /* Implement TARGET_LIBCALL_VALUE. */
559 nvptx_libcall_value (machine_mode mode
, const_rtx
)
561 if (!cfun
|| !cfun
->machine
->doing_call
)
562 /* Pretend to return in a hard reg for early uses before pseudos can be
564 return gen_rtx_REG (mode
, NVPTX_RETURN_REGNUM
);
566 return gen_reg_rtx (mode
);
569 /* TARGET_FUNCTION_VALUE implementation. Returns an RTX representing the place
570 where function FUNC returns or receives a value of data type TYPE. */
573 nvptx_function_value (const_tree type
, const_tree
ARG_UNUSED (func
),
576 machine_mode mode
= promote_return (TYPE_MODE (type
));
581 cfun
->machine
->return_mode
= mode
;
582 return gen_rtx_REG (mode
, NVPTX_RETURN_REGNUM
);
585 return nvptx_libcall_value (mode
, NULL_RTX
);
588 /* Implement TARGET_FUNCTION_VALUE_REGNO_P. */
591 nvptx_function_value_regno_p (const unsigned int regno
)
593 return regno
== NVPTX_RETURN_REGNUM
;
596 /* Types with a mode other than those supported by the machine are passed by
597 reference in memory. */
600 nvptx_pass_by_reference (cumulative_args_t
ARG_UNUSED (cum
),
601 machine_mode mode
, const_tree type
,
602 bool ARG_UNUSED (named
))
604 return pass_in_memory (mode
, type
, false);
607 /* Implement TARGET_RETURN_IN_MEMORY. */
610 nvptx_return_in_memory (const_tree type
, const_tree
)
612 return pass_in_memory (TYPE_MODE (type
), type
, true);
615 /* Implement TARGET_PROMOTE_FUNCTION_MODE. */
618 nvptx_promote_function_mode (const_tree type
, machine_mode mode
,
619 int *ARG_UNUSED (punsignedp
),
620 const_tree funtype
, int for_return
)
622 return promote_arg (mode
, for_return
|| !type
|| TYPE_ARG_TYPES (funtype
));
625 /* Helper for write_arg. Emit a single PTX argument of MODE, either
626 in a prototype, or as copy in a function prologue. ARGNO is the
627 index of this argument in the PTX function. FOR_REG is negative,
628 if we're emitting the PTX prototype. It is zero if we're copying
629 to an argument register and it is greater than zero if we're
630 copying to a specific hard register. */
633 write_arg_mode (std::stringstream
&s
, int for_reg
, int argno
,
636 const char *ptx_type
= nvptx_ptx_type_from_mode (mode
, false);
640 /* Writing PTX prototype. */
641 s
<< (argno
? ", " : " (");
642 s
<< ".param" << ptx_type
<< " %in_ar" << argno
;
646 s
<< "\t.reg" << ptx_type
<< " ";
648 s
<< reg_names
[for_reg
];
654 s
<< "\tld.param" << ptx_type
<< " ";
656 s
<< reg_names
[for_reg
];
659 s
<< ", [%in_ar" << argno
<< "];\n";
665 /* Process function parameter TYPE to emit one or more PTX
666 arguments. S, FOR_REG and ARGNO as for write_arg_mode. PROTOTYPED
667 is true, if this is a prototyped function, rather than an old-style
668 C declaration. Returns the next argument number to use.
670 The promotion behavior here must match the regular GCC function
671 parameter marshalling machinery. */
674 write_arg_type (std::stringstream
&s
, int for_reg
, int argno
,
675 tree type
, bool prototyped
)
677 machine_mode mode
= TYPE_MODE (type
);
679 if (mode
== VOIDmode
)
682 if (pass_in_memory (mode
, type
, false))
686 bool split
= TREE_CODE (type
) == COMPLEX_TYPE
;
690 /* Complex types are sent as two separate args. */
691 type
= TREE_TYPE (type
);
692 mode
= TYPE_MODE (type
);
696 mode
= promote_arg (mode
, prototyped
);
698 argno
= write_arg_mode (s
, for_reg
, argno
, mode
);
701 return write_arg_mode (s
, for_reg
, argno
, mode
);
704 /* Emit a PTX return as a prototype or function prologue declaration
708 write_return_mode (std::stringstream
&s
, bool for_proto
, machine_mode mode
)
710 const char *ptx_type
= nvptx_ptx_type_from_mode (mode
, false);
711 const char *pfx
= "\t.reg";
712 const char *sfx
= ";\n";
715 pfx
= "(.param", sfx
= "_out) ";
717 s
<< pfx
<< ptx_type
<< " " << reg_names
[NVPTX_RETURN_REGNUM
] << sfx
;
720 /* Process a function return TYPE to emit a PTX return as a prototype
721 or function prologue declaration. Returns true if return is via an
722 additional pointer parameter. The promotion behavior here must
723 match the regular GCC function return mashalling. */
726 write_return_type (std::stringstream
&s
, bool for_proto
, tree type
)
728 machine_mode mode
= TYPE_MODE (type
);
730 if (mode
== VOIDmode
)
733 bool return_in_mem
= pass_in_memory (mode
, type
, true);
738 return return_in_mem
;
740 /* Named return values can cause us to return a pointer as well
741 as expect an argument for the return location. This is
742 optimization-level specific, so no caller can make use of
743 this data, but more importantly for us, we must ensure it
744 doesn't change the PTX prototype. */
745 mode
= (machine_mode
) cfun
->machine
->return_mode
;
747 if (mode
== VOIDmode
)
748 return return_in_mem
;
750 /* Clear return_mode to inhibit copy of retval to non-existent
752 cfun
->machine
->return_mode
= VOIDmode
;
755 mode
= promote_return (mode
);
757 write_return_mode (s
, for_proto
, mode
);
759 return return_in_mem
;
762 /* Look for attributes in ATTRS that would indicate we must write a function
763 as a .entry kernel rather than a .func. Return true if one is found. */
766 write_as_kernel (tree attrs
)
768 return (lookup_attribute ("kernel", attrs
) != NULL_TREE
769 || (lookup_attribute ("omp target entrypoint", attrs
) != NULL_TREE
770 && lookup_attribute ("oacc function", attrs
) != NULL_TREE
));
771 /* For OpenMP target regions, the corresponding kernel entry is emitted from
772 write_omp_entry as a separate function. */
/* Emit a linker marker for a function decl or defn.  */

static void
write_fn_marker (std::stringstream &s, bool is_defn, bool globalize,
		 const char *name)
{
  s << "\n// BEGIN";
  if (globalize)
    s << " GLOBAL";
  s << " FUNCTION " << (is_defn ? "DEF: " : "DECL: ");
  s << name << "\n";
}
788 /* Emit a linker marker for a variable decl or defn. */
791 write_var_marker (FILE *file
, bool is_defn
, bool globalize
, const char *name
)
793 fprintf (file
, "\n// BEGIN%s VAR %s: ",
794 globalize
? " GLOBAL" : "",
795 is_defn
? "DEF" : "DECL");
796 assemble_name_raw (file
, name
);
800 /* Write a .func or .kernel declaration or definition along with
801 a helper comment for use by ld. S is the stream to write to, DECL
802 the decl for the function with name NAME. For definitions, emit
803 a declaration too. */
806 write_fn_proto (std::stringstream
&s
, bool is_defn
,
807 const char *name
, const_tree decl
)
810 /* Emit a declaration. The PTX assembler gets upset without it. */
811 name
= write_fn_proto (s
, false, name
, decl
);
814 /* Avoid repeating the name replacement. */
815 name
= nvptx_name_replacement (name
);
820 write_fn_marker (s
, is_defn
, TREE_PUBLIC (decl
), name
);
822 /* PTX declaration. */
823 if (DECL_EXTERNAL (decl
))
825 else if (TREE_PUBLIC (decl
))
826 s
<< (DECL_WEAK (decl
) ? ".weak " : ".visible ");
827 s
<< (write_as_kernel (DECL_ATTRIBUTES (decl
)) ? ".entry " : ".func ");
829 tree fntype
= TREE_TYPE (decl
);
830 tree result_type
= TREE_TYPE (fntype
);
832 /* atomic_compare_exchange_$n builtins have an exceptional calling
834 int not_atomic_weak_arg
= -1;
835 if (DECL_BUILT_IN_CLASS (decl
) == BUILT_IN_NORMAL
)
836 switch (DECL_FUNCTION_CODE (decl
))
838 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_1
:
839 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_2
:
840 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_4
:
841 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_8
:
842 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_16
:
843 /* These atomics skip the 'weak' parm in an actual library
844 call. We must skip it in the prototype too. */
845 not_atomic_weak_arg
= 3;
852 /* Declare the result. */
853 bool return_in_mem
= write_return_type (s
, true, result_type
);
859 /* Emit argument list. */
861 argno
= write_arg_type (s
, -1, argno
, ptr_type_node
, true);
864 NULL in TYPE_ARG_TYPES, for old-style functions
865 NULL in DECL_ARGUMENTS, for builtin functions without another
867 So we have to pick the best one we have. */
868 tree args
= TYPE_ARG_TYPES (fntype
);
869 bool prototyped
= true;
872 args
= DECL_ARGUMENTS (decl
);
876 for (; args
; args
= TREE_CHAIN (args
), not_atomic_weak_arg
--)
878 tree type
= prototyped
? TREE_VALUE (args
) : TREE_TYPE (args
);
880 if (not_atomic_weak_arg
)
881 argno
= write_arg_type (s
, -1, argno
, type
, prototyped
);
883 gcc_assert (type
== boolean_type_node
);
886 if (stdarg_p (fntype
))
887 argno
= write_arg_type (s
, -1, argno
, ptr_type_node
, true);
889 if (DECL_STATIC_CHAIN (decl
))
890 argno
= write_arg_type (s
, -1, argno
, ptr_type_node
, true);
892 if (!argno
&& strcmp (name
, "main") == 0)
894 argno
= write_arg_type (s
, -1, argno
, integer_type_node
, true);
895 argno
= write_arg_type (s
, -1, argno
, ptr_type_node
, true);
901 s
<< (is_defn
? "\n" : ";\n");
906 /* Construct a function declaration from a call insn. This can be
907 necessary for two reasons - either we have an indirect call which
908 requires a .callprototype declaration, or we have a libcall
909 generated by emit_library_call for which no decl exists. */
912 write_fn_proto_from_insn (std::stringstream
&s
, const char *name
,
917 s
<< "\t.callprototype ";
922 name
= nvptx_name_replacement (name
);
923 write_fn_marker (s
, false, true, name
);
924 s
<< "\t.extern .func ";
927 if (result
!= NULL_RTX
)
928 write_return_mode (s
, true, GET_MODE (result
));
932 int arg_end
= XVECLEN (pat
, 0);
933 for (int i
= 1; i
< arg_end
; i
++)
935 /* We don't have to deal with mode splitting & promotion here,
936 as that was already done when generating the call
938 machine_mode mode
= GET_MODE (XEXP (XVECEXP (pat
, 0, i
), 0));
940 write_arg_mode (s
, -1, i
- 1, mode
);
947 /* DECL is an external FUNCTION_DECL, make sure its in the fndecl hash
948 table and and write a ptx prototype. These are emitted at end of
952 nvptx_record_fndecl (tree decl
)
954 tree
*slot
= declared_fndecls_htab
->find_slot (decl
, INSERT
);
958 const char *name
= get_fnname_from_decl (decl
);
959 write_fn_proto (func_decls
, false, name
, decl
);
963 /* Record a libcall or unprototyped external function. CALLEE is the
964 SYMBOL_REF. Insert into the libfunc hash table and emit a ptx
965 declaration for it. */
968 nvptx_record_libfunc (rtx callee
, rtx retval
, rtx pat
)
970 rtx
*slot
= declared_libfuncs_htab
->find_slot (callee
, INSERT
);
975 const char *name
= XSTR (callee
, 0);
976 write_fn_proto_from_insn (func_decls
, name
, retval
, pat
);
980 /* DECL is an external FUNCTION_DECL, that we're referencing. If it
981 is prototyped, record it now. Otherwise record it as needed at end
982 of compilation, when we might have more information about it. */
985 nvptx_record_needed_fndecl (tree decl
)
987 if (TYPE_ARG_TYPES (TREE_TYPE (decl
)) == NULL_TREE
)
989 tree
*slot
= needed_fndecls_htab
->find_slot (decl
, INSERT
);
994 nvptx_record_fndecl (decl
);
997 /* SYM is a SYMBOL_REF. If it refers to an external function, record
1001 nvptx_maybe_record_fnsym (rtx sym
)
1003 tree decl
= SYMBOL_REF_DECL (sym
);
1005 if (decl
&& TREE_CODE (decl
) == FUNCTION_DECL
&& DECL_EXTERNAL (decl
))
1006 nvptx_record_needed_fndecl (decl
);
1009 /* Emit a local array to hold some part of a conventional stack frame
1010 and initialize REGNO to point to it. If the size is zero, it'll
1011 never be valid to dereference, so we can simply initialize to
1015 init_frame (FILE *file
, int regno
, unsigned align
, unsigned size
)
1018 fprintf (file
, "\t.local .align %d .b8 %s_ar[%u];\n",
1019 align
, reg_names
[regno
], size
);
1020 fprintf (file
, "\t.reg.u%d %s;\n",
1021 POINTER_SIZE
, reg_names
[regno
]);
1022 fprintf (file
, (size
? "\tcvta.local.u%d %s, %s_ar;\n"
1023 : "\tmov.u%d %s, 0;\n"),
1024 POINTER_SIZE
, reg_names
[regno
], reg_names
[regno
]);
1027 /* Emit soft stack frame setup sequence. */
1030 init_softstack_frame (FILE *file
, unsigned alignment
, HOST_WIDE_INT size
)
1032 /* Maintain 64-bit stack alignment. */
1033 unsigned keep_align
= BIGGEST_ALIGNMENT
/ BITS_PER_UNIT
;
1034 size
= ROUND_UP (size
, keep_align
);
1035 int bits
= POINTER_SIZE
;
1036 const char *reg_stack
= reg_names
[STACK_POINTER_REGNUM
];
1037 const char *reg_frame
= reg_names
[FRAME_POINTER_REGNUM
];
1038 const char *reg_sspslot
= reg_names
[SOFTSTACK_SLOT_REGNUM
];
1039 const char *reg_sspprev
= reg_names
[SOFTSTACK_PREV_REGNUM
];
1040 fprintf (file
, "\t.reg.u%d %s;\n", bits
, reg_stack
);
1041 fprintf (file
, "\t.reg.u%d %s;\n", bits
, reg_frame
);
1042 fprintf (file
, "\t.reg.u%d %s;\n", bits
, reg_sspslot
);
1043 fprintf (file
, "\t.reg.u%d %s;\n", bits
, reg_sspprev
);
1044 fprintf (file
, "\t{\n");
1045 fprintf (file
, "\t\t.reg.u32 %%fstmp0;\n");
1046 fprintf (file
, "\t\t.reg.u%d %%fstmp1;\n", bits
);
1047 fprintf (file
, "\t\t.reg.u%d %%fstmp2;\n", bits
);
1048 fprintf (file
, "\t\tmov.u32 %%fstmp0, %%tid.y;\n");
1049 fprintf (file
, "\t\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n",
1050 bits
== 64 ? ".wide" : ".lo", bits
/ 8);
1051 fprintf (file
, "\t\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits
);
1053 /* Initialize %sspslot = &__nvptx_stacks[tid.y]. */
1054 fprintf (file
, "\t\tadd.u%d %s, %%fstmp2, %%fstmp1;\n", bits
, reg_sspslot
);
1056 /* Initialize %sspprev = __nvptx_stacks[tid.y]. */
1057 fprintf (file
, "\t\tld.shared.u%d %s, [%s];\n",
1058 bits
, reg_sspprev
, reg_sspslot
);
1060 /* Initialize %frame = %sspprev - size. */
1061 fprintf (file
, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC
";\n",
1062 bits
, reg_frame
, reg_sspprev
, size
);
1064 /* Apply alignment, if larger than 64. */
1065 if (alignment
> keep_align
)
1066 fprintf (file
, "\t\tand.b%d %s, %s, %d;\n",
1067 bits
, reg_frame
, reg_frame
, -alignment
);
1069 size
= crtl
->outgoing_args_size
;
1070 gcc_assert (size
% keep_align
== 0);
1072 /* Initialize %stack. */
1073 fprintf (file
, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC
";\n",
1074 bits
, reg_stack
, reg_frame
, size
);
1077 fprintf (file
, "\t\tst.shared.u%d [%s], %s;\n",
1078 bits
, reg_sspslot
, reg_stack
);
1079 fprintf (file
, "\t}\n");
1080 cfun
->machine
->has_softstack
= true;
1081 need_softstack_decl
= true;
/* Emit code to initialize the REGNO predicate register to indicate
   whether we are not lane zero on the NAME axis.  */

static void
nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
{
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
  fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
  fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
  fprintf (file, "\t}\n");
}
1097 /* Emit code to initialize predicate and master lane index registers for
1098 -muniform-simt code generation variant. */
1101 nvptx_init_unisimt_predicate (FILE *file
)
1103 cfun
->machine
->unisimt_location
= gen_reg_rtx (Pmode
);
1104 int loc
= REGNO (cfun
->machine
->unisimt_location
);
1105 int bits
= POINTER_SIZE
;
1106 fprintf (file
, "\t.reg.u%d %%r%d;\n", bits
, loc
);
1107 fprintf (file
, "\t{\n");
1108 fprintf (file
, "\t\t.reg.u32 %%ustmp0;\n");
1109 fprintf (file
, "\t\t.reg.u%d %%ustmp1;\n", bits
);
1110 fprintf (file
, "\t\tmov.u32 %%ustmp0, %%tid.y;\n");
1111 fprintf (file
, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n",
1112 bits
== 64 ? ".wide" : ".lo");
1113 fprintf (file
, "\t\tmov.u%d %%r%d, __nvptx_uni;\n", bits
, loc
);
1114 fprintf (file
, "\t\tadd.u%d %%r%d, %%r%d, %%ustmp1;\n", bits
, loc
, loc
);
1115 if (cfun
->machine
->unisimt_predicate
)
1117 int master
= REGNO (cfun
->machine
->unisimt_master
);
1118 int pred
= REGNO (cfun
->machine
->unisimt_predicate
);
1119 fprintf (file
, "\t\tld.shared.u32 %%r%d, [%%r%d];\n", master
, loc
);
1120 fprintf (file
, "\t\tmov.u32 %%ustmp0, %%laneid;\n");
1121 /* Compute 'master lane index' as 'laneid & __nvptx_uni[tid.y]'. */
1122 fprintf (file
, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master
, master
);
1123 /* Compute predicate as 'tid.x == master'. */
1124 fprintf (file
, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred
, master
);
1126 fprintf (file
, "\t}\n");
1127 need_unisimt_decl
= true;
1130 /* Emit kernel NAME for function ORIG outlined for an OpenMP 'target' region:
1132 extern void gomp_nvptx_main (void (*fn)(void*), void *fnarg);
1133 void __attribute__((kernel)) NAME (void *arg, char *stack, size_t stacksize)
1135 __nvptx_stacks[tid.y] = stack + stacksize * (ctaid.x * ntid.y + tid.y + 1);
1136 __nvptx_uni[tid.y] = 0;
1137 gomp_nvptx_main (ORIG, arg);
1139 ORIG itself should not be emitted as a PTX .entry function. */
1142 write_omp_entry (FILE *file
, const char *name
, const char *orig
)
1144 static bool gomp_nvptx_main_declared
;
1145 if (!gomp_nvptx_main_declared
)
1147 gomp_nvptx_main_declared
= true;
1148 write_fn_marker (func_decls
, false, true, "gomp_nvptx_main");
1149 func_decls
<< ".extern .func gomp_nvptx_main (.param.u" << POINTER_SIZE
1150 << " %in_ar1, .param.u" << POINTER_SIZE
<< " %in_ar2);\n";
1152 /* PR79332. Single out this string; it confuses gcc.pot generation. */
1153 #define NTID_Y "%ntid.y"
1154 #define ENTRY_TEMPLATE(PS, PS_BYTES, MAD_PS_32) "\
1155 (.param.u" PS " %arg, .param.u" PS " %stack, .param.u" PS " %sz)\n\
1158 .reg.u" PS " %R<4>;\n\
1159 mov.u32 %r0, %tid.y;\n\
1160 mov.u32 %r1, " NTID_Y ";\n\
1161 mov.u32 %r2, %ctaid.x;\n\
1162 cvt.u" PS ".u32 %R1, %r0;\n\
1163 " MAD_PS_32 " %R1, %r1, %r2, %R1;\n\
1164 mov.u" PS " %R0, __nvptx_stacks;\n\
1165 " MAD_PS_32 " %R0, %r0, " PS_BYTES ", %R0;\n\
1166 ld.param.u" PS " %R2, [%stack];\n\
1167 ld.param.u" PS " %R3, [%sz];\n\
1168 add.u" PS " %R2, %R2, %R3;\n\
1169 mad.lo.u" PS " %R2, %R1, %R3, %R2;\n\
1170 st.shared.u" PS " [%R0], %R2;\n\
1171 mov.u" PS " %R0, __nvptx_uni;\n\
1172 " MAD_PS_32 " %R0, %r0, 4, %R0;\n\
1174 st.shared.u32 [%R0], %r0;\n\
1175 mov.u" PS " %R0, \0;\n\
1176 ld.param.u" PS " %R1, [%arg];\n\
1178 .param.u" PS " %P<2>;\n\
1179 st.param.u" PS " [%P0], %R0;\n\
1180 st.param.u" PS " [%P1], %R1;\n\
1181 call.uni gomp_nvptx_main, (%P0, %P1);\n\
1185 static const char entry64
[] = ENTRY_TEMPLATE ("64", "8", "mad.wide.u32");
1186 static const char entry32
[] = ENTRY_TEMPLATE ("32", "4", "mad.lo.u32 ");
1187 #undef ENTRY_TEMPLATE
1189 const char *entry_1
= TARGET_ABI64
? entry64
: entry32
;
1190 /* Position ENTRY_2 after the embedded nul using strlen of the prefix. */
1191 const char *entry_2
= entry_1
+ strlen (entry64
) + 1;
1192 fprintf (file
, ".visible .entry %s%s%s%s", name
, entry_1
, orig
, entry_2
);
1193 need_softstack_decl
= need_unisimt_decl
= true;
1196 /* Implement ASM_DECLARE_FUNCTION_NAME. Writes the start of a ptx
1197 function, including local var decls and copies from the arguments to
1201 nvptx_declare_function_name (FILE *file
, const char *name
, const_tree decl
)
1203 tree fntype
= TREE_TYPE (decl
);
1204 tree result_type
= TREE_TYPE (fntype
);
1207 if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl
))
1208 && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl
)))
1210 char *buf
= (char *) alloca (strlen (name
) + sizeof ("$impl"));
1211 sprintf (buf
, "%s$impl", name
);
1212 write_omp_entry (file
, name
, buf
);
1215 /* We construct the initial part of the function into a string
1216 stream, in order to share the prototype writing code. */
1217 std::stringstream s
;
1218 write_fn_proto (s
, true, name
, decl
);
1221 bool return_in_mem
= write_return_type (s
, false, result_type
);
1223 argno
= write_arg_type (s
, 0, argno
, ptr_type_node
, true);
1225 /* Declare and initialize incoming arguments. */
1226 tree args
= TYPE_ARG_TYPES (fntype
);
1227 bool prototyped
= true;
1230 args
= DECL_ARGUMENTS (decl
);
1234 for (; args
!= NULL_TREE
; args
= TREE_CHAIN (args
))
1236 tree type
= prototyped
? TREE_VALUE (args
) : TREE_TYPE (args
);
1238 argno
= write_arg_type (s
, 0, argno
, type
, prototyped
);
1241 if (stdarg_p (fntype
))
1242 argno
= write_arg_type (s
, ARG_POINTER_REGNUM
, argno
, ptr_type_node
,
1245 if (DECL_STATIC_CHAIN (decl
) || cfun
->machine
->has_chain
)
1246 write_arg_type (s
, STATIC_CHAIN_REGNUM
,
1247 DECL_STATIC_CHAIN (decl
) ? argno
: -1, ptr_type_node
,
1250 fprintf (file
, "%s", s
.str().c_str());
1252 /* Usually 'crtl->is_leaf' is computed during register allocator
1253 initialization (which is not done on NVPTX) or for pressure-sensitive
1254 optimizations. Initialize it here, except if already set. */
1256 crtl
->is_leaf
= leaf_function_p ();
1258 HOST_WIDE_INT sz
= get_frame_size ();
1259 bool need_frameptr
= sz
|| cfun
->machine
->has_chain
;
1260 int alignment
= crtl
->stack_alignment_needed
/ BITS_PER_UNIT
;
1261 if (!TARGET_SOFT_STACK
)
1263 /* Declare a local var for outgoing varargs. */
1264 if (cfun
->machine
->has_varadic
)
1265 init_frame (file
, STACK_POINTER_REGNUM
,
1266 UNITS_PER_WORD
, crtl
->outgoing_args_size
);
1268 /* Declare a local variable for the frame. Force its size to be
1269 DImode-compatible. */
1271 init_frame (file
, FRAME_POINTER_REGNUM
, alignment
,
1272 ROUND_UP (sz
, GET_MODE_SIZE (DImode
)));
1274 else if (need_frameptr
|| cfun
->machine
->has_varadic
|| cfun
->calls_alloca
1275 || (cfun
->machine
->has_simtreg
&& !crtl
->is_leaf
))
1276 init_softstack_frame (file
, alignment
, sz
);
1278 if (cfun
->machine
->has_simtreg
)
1280 unsigned HOST_WIDE_INT
&simtsz
= cfun
->machine
->simt_stack_size
;
1281 unsigned HOST_WIDE_INT
&align
= cfun
->machine
->simt_stack_align
;
1282 align
= MAX (align
, GET_MODE_SIZE (DImode
));
1283 if (!crtl
->is_leaf
|| cfun
->calls_alloca
)
1284 simtsz
= HOST_WIDE_INT_M1U
;
1285 if (simtsz
== HOST_WIDE_INT_M1U
)
1286 simtsz
= nvptx_softstack_size
;
1287 if (cfun
->machine
->has_softstack
)
1288 simtsz
+= POINTER_SIZE
/ 8;
1289 simtsz
= ROUND_UP (simtsz
, GET_MODE_SIZE (DImode
));
1290 if (align
> GET_MODE_SIZE (DImode
))
1291 simtsz
+= align
- GET_MODE_SIZE (DImode
);
1293 fprintf (file
, "\t.local.align 8 .b8 %%simtstack_ar["
1294 HOST_WIDE_INT_PRINT_DEC
"];\n", simtsz
);
1296 /* Declare the pseudos we have as ptx registers. */
1297 int maxregs
= max_reg_num ();
1298 for (int i
= LAST_VIRTUAL_REGISTER
+ 1; i
< maxregs
; i
++)
1300 if (regno_reg_rtx
[i
] != const0_rtx
)
1302 machine_mode mode
= PSEUDO_REGNO_MODE (i
);
1303 machine_mode split
= maybe_split_mode (mode
);
1305 if (split_mode_p (mode
))
1307 fprintf (file
, "\t.reg%s ", nvptx_ptx_type_from_mode (mode
, true));
1308 output_reg (file
, i
, split
, -2);
1309 fprintf (file
, ";\n");
1313 /* Emit axis predicates. */
1314 if (cfun
->machine
->axis_predicate
[0])
1315 nvptx_init_axis_predicate (file
,
1316 REGNO (cfun
->machine
->axis_predicate
[0]), "y");
1317 if (cfun
->machine
->axis_predicate
[1])
1318 nvptx_init_axis_predicate (file
,
1319 REGNO (cfun
->machine
->axis_predicate
[1]), "x");
1320 if (cfun
->machine
->unisimt_predicate
1321 || (cfun
->machine
->has_simtreg
&& !crtl
->is_leaf
))
1322 nvptx_init_unisimt_predicate (file
);
1325 /* Output code for switching uniform-simt state. ENTERING indicates whether
1326 we are entering or leaving non-uniform execution region. */
1329 nvptx_output_unisimt_switch (FILE *file
, bool entering
)
1331 if (crtl
->is_leaf
&& !cfun
->machine
->unisimt_predicate
)
1333 fprintf (file
, "\t{\n");
1334 fprintf (file
, "\t\t.reg.u32 %%ustmp2;\n");
1335 fprintf (file
, "\t\tmov.u32 %%ustmp2, %d;\n", entering
? -1 : 0);
1338 int loc
= REGNO (cfun
->machine
->unisimt_location
);
1339 fprintf (file
, "\t\tst.shared.u32 [%%r%d], %%ustmp2;\n", loc
);
1341 if (cfun
->machine
->unisimt_predicate
)
1343 int master
= REGNO (cfun
->machine
->unisimt_master
);
1344 int pred
= REGNO (cfun
->machine
->unisimt_predicate
);
1345 fprintf (file
, "\t\tmov.u32 %%ustmp2, %%laneid;\n");
1346 fprintf (file
, "\t\tmov.u32 %%r%d, %s;\n",
1347 master
, entering
? "%ustmp2" : "0");
1348 fprintf (file
, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp2;\n", pred
, master
);
1350 fprintf (file
, "\t}\n");
1353 /* Output code for allocating per-lane storage and switching soft-stack pointer.
1354 ENTERING indicates whether we are entering or leaving non-uniform execution.
1355 PTR is the register pointing to allocated storage, it is assigned to on
1356 entering and used to restore state on leaving. SIZE and ALIGN are used only
1360 nvptx_output_softstack_switch (FILE *file
, bool entering
,
1361 rtx ptr
, rtx size
, rtx align
)
1363 gcc_assert (REG_P (ptr
) && !HARD_REGISTER_P (ptr
));
1364 if (crtl
->is_leaf
&& !cfun
->machine
->simt_stack_size
)
1366 int bits
= POINTER_SIZE
, regno
= REGNO (ptr
);
1367 fprintf (file
, "\t{\n");
1370 fprintf (file
, "\t\tcvta.local.u%d %%r%d, %%simtstack_ar + "
1371 HOST_WIDE_INT_PRINT_DEC
";\n", bits
, regno
,
1372 cfun
->machine
->simt_stack_size
);
1373 fprintf (file
, "\t\tsub.u%d %%r%d, %%r%d, ", bits
, regno
, regno
);
1374 if (CONST_INT_P (size
))
1375 fprintf (file
, HOST_WIDE_INT_PRINT_DEC
,
1376 ROUND_UP (UINTVAL (size
), GET_MODE_SIZE (DImode
)));
1378 output_reg (file
, REGNO (size
), VOIDmode
);
1379 fputs (";\n", file
);
1380 if (!CONST_INT_P (size
) || UINTVAL (align
) > GET_MODE_SIZE (DImode
))
1382 "\t\tand.u%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC
";\n",
1383 bits
, regno
, regno
, UINTVAL (align
));
1385 if (cfun
->machine
->has_softstack
)
1387 const char *reg_stack
= reg_names
[STACK_POINTER_REGNUM
];
1390 fprintf (file
, "\t\tst.u%d [%%r%d + -%d], %s;\n",
1391 bits
, regno
, bits
/ 8, reg_stack
);
1392 fprintf (file
, "\t\tsub.u%d %s, %%r%d, %d;\n",
1393 bits
, reg_stack
, regno
, bits
/ 8);
1397 fprintf (file
, "\t\tld.u%d %s, [%%r%d + -%d];\n",
1398 bits
, reg_stack
, regno
, bits
/ 8);
1400 nvptx_output_set_softstack (REGNO (stack_pointer_rtx
));
1402 fprintf (file
, "\t}\n");
/* NOTE(review): extraction mangled this definition -- the return type
   (upstream: static void), braces and blank lines were dropped and
   statements are split mid-expression.  Code tokens below are preserved
   byte-for-byte; only comments were added.  */
1405 /* Output code to enter non-uniform execution region. DEST is a register
1406 to hold a per-lane allocation given by SIZE and ALIGN. */
1409 nvptx_output_simt_enter (rtx dest
, rtx size
, rtx align
)
/* First flip the uniform-SIMT state, then switch the soft-stack pointer
   to the per-lane storage; both helpers write directly to asm_out_file.  */
1411 nvptx_output_unisimt_switch (asm_out_file
, true);
1412 nvptx_output_softstack_switch (asm_out_file
, true, dest
, size
, align
);
/* NOTE(review): extraction mangled this definition -- return type, braces
   and blank lines dropped.  Code tokens preserved byte-for-byte; only
   comments added.  */
1416 /* Output code to leave non-uniform execution region. SRC is the register
1417 holding per-lane storage previously allocated by omp_simt_enter insn. */
1420 nvptx_output_simt_exit (rtx src
)
/* Mirror of nvptx_output_simt_enter: restore uniform execution, then
   restore the soft-stack pointer from SRC.  SIZE/ALIGN are NULL_RTX as
   they are only used on entry.  */
1422 nvptx_output_unisimt_switch (asm_out_file
, false);
1423 nvptx_output_softstack_switch (asm_out_file
, false, src
, NULL_RTX
, NULL_RTX
);
/* NOTE(review): extraction mangled this definition -- return type, braces
   and blank lines dropped.  Code tokens preserved byte-for-byte; only
   comments added.  */
1427 /* Output instruction that sets soft stack pointer in shared memory to the
1428 value in register given by SRC_REGNO. */
1431 nvptx_output_set_softstack (unsigned src_regno
)
/* Only emitted when this function uses the soft stack and is not a leaf;
   leaf functions never publish their stack pointer.  */
1433 if (cfun
->machine
->has_softstack
 && !crtl
->is_leaf
)
/* Emits: st.shared.u<POINTER_SIZE> [<softstack slot>], <src_regno>;  */
1435 fprintf (asm_out_file
, "\tst.shared.u%d\t[%s], ",
1436 POINTER_SIZE
, reg_names
[SOFTSTACK_SLOT_REGNUM
]);
1437 output_reg (asm_out_file
, src_regno
, VOIDmode
);
1438 fprintf (asm_out_file
, ";\n");
/* NOTE(review): extraction mangled this definition -- the return type
   (upstream: const char *), braces, and the trailing return statement
   were dropped.  Code tokens preserved byte-for-byte; only comments
   added.  Presumably the function returns the "ret;" template string --
   confirm against upstream nvptx.c.  */
1442 /* Output a return instruction. Also copy the return value to its outgoing
1446 nvptx_output_return (void)
/* return_mode records the mode chosen for this function's return value;
   VOIDmode means the function returns nothing.  */
1448 machine_mode mode
 = (machine_mode
)cfun
->machine
->return_mode
;
1450 if (mode
 != VOIDmode
)
/* Copy the return value into the %value_out .param before returning.  */
1451 fprintf (asm_out_file
, "\tst.param%s\t[%s_out], %s;\n",
1452 nvptx_ptx_type_from_mode (mode
, false),
1453 reg_names
[NVPTX_RETURN_REGNUM
],
1454 reg_names
[NVPTX_RETURN_REGNUM
]);
/* Terminate a function definition: write the closing brace of the PTX
   function body to FILE.  */

static void
nvptx_function_end (FILE *file)
{
  fputs ("}\n", file);
}
/* NOTE(review): extraction dropped the return type, braces and the entire
   body of this hook (upstream it unconditionally returns false -- PTX has
   no tail-call support).  Confirm against upstream nvptx.c.  Tokens below
   preserved byte-for-byte; only comments added.  */
1467 /* Decide whether we can make a sibling call to a function. For ptx, we
/* Implements TARGET_FUNCTION_OK_FOR_SIBCALL; both tree parameters are
   deliberately unnamed as they are unused.  */
1471 nvptx_function_ok_for_sibcall (tree
, tree
)
/* NOTE(review): extraction dropped the return type, braces and the final
   fallthrough return (upstream: NULL_RTX).  Tokens preserved
   byte-for-byte; only comments added.  */
1476 /* Return Dynamic ReAlignment Pointer RTX. For PTX there isn't any. */
1479 nvptx_get_drap_rtx (void)
/* With soft stacks and dynamic realignment requested, the argument
   pointer doubles as the DRAP.  */
1481 if (TARGET_SOFT_STACK
 && stack_realign_drap
)
1482 return arg_pointer_rtx
;
1486 /* Implement the TARGET_CALL_ARGS hook. Record information about one
1487 argument to the next call. */
1490 nvptx_call_args (rtx arg
, tree fntype
)
1492 if (!cfun
->machine
->doing_call
)
1494 cfun
->machine
->doing_call
= true;
1495 cfun
->machine
->is_varadic
= false;
1496 cfun
->machine
->num_args
= 0;
1498 if (fntype
&& stdarg_p (fntype
))
1500 cfun
->machine
->is_varadic
= true;
1501 cfun
->machine
->has_varadic
= true;
1502 cfun
->machine
->num_args
++;
1506 if (REG_P (arg
) && arg
!= pc_rtx
)
1508 cfun
->machine
->num_args
++;
1509 cfun
->machine
->call_args
= alloc_EXPR_LIST (VOIDmode
, arg
,
1510 cfun
->machine
->call_args
);
/* NOTE(review): extraction dropped the return type and braces of this
   definition.  Tokens preserved byte-for-byte; only comments added.  */
1514 /* Implement the corresponding END_CALL_ARGS hook. Clear and free the
1515 information we recorded. */
1518 nvptx_end_call_args (void)
/* Reset the per-call state accumulated by nvptx_call_args and release
   the EXPR_LIST of recorded argument registers.  */
1520 cfun
->machine
->doing_call
 = false;
1521 free_EXPR_LIST_list (&cfun
->machine
->call_args
);
1524 /* Emit the sequence for a call to ADDRESS, setting RETVAL. Keep
1525 track of whether calls involving static chains or varargs were seen
1526 in the current function.
1527 For libcalls, maintain a hash table of decls we have seen, and
1528 record a function decl for later when encountering a new one. */
1531 nvptx_expand_call (rtx retval
, rtx address
)
1533 rtx callee
= XEXP (address
, 0);
1534 rtx varargs
= NULL_RTX
;
1535 unsigned parallel
= 0;
1537 if (!call_insn_operand (callee
, Pmode
))
1539 callee
= force_reg (Pmode
, callee
);
1540 address
= change_address (address
, QImode
, callee
);
1543 if (GET_CODE (callee
) == SYMBOL_REF
)
1545 tree decl
= SYMBOL_REF_DECL (callee
);
1546 if (decl
!= NULL_TREE
)
1548 if (DECL_STATIC_CHAIN (decl
))
1549 cfun
->machine
->has_chain
= true;
1551 tree attr
= oacc_get_fn_attrib (decl
);
1554 tree dims
= TREE_VALUE (attr
);
1556 parallel
= GOMP_DIM_MASK (GOMP_DIM_MAX
) - 1;
1557 for (int ix
= 0; ix
!= GOMP_DIM_MAX
; ix
++)
1559 if (TREE_PURPOSE (dims
)
1560 && !integer_zerop (TREE_PURPOSE (dims
)))
1562 /* Not on this axis. */
1563 parallel
^= GOMP_DIM_MASK (ix
);
1564 dims
= TREE_CHAIN (dims
);
1570 unsigned nargs
= cfun
->machine
->num_args
;
1571 if (cfun
->machine
->is_varadic
)
1573 varargs
= gen_reg_rtx (Pmode
);
1574 emit_move_insn (varargs
, stack_pointer_rtx
);
1577 rtvec vec
= rtvec_alloc (nargs
+ 1);
1578 rtx pat
= gen_rtx_PARALLEL (VOIDmode
, vec
);
1581 rtx call
= gen_rtx_CALL (VOIDmode
, address
, const0_rtx
);
1582 rtx tmp_retval
= retval
;
1585 if (!nvptx_register_operand (retval
, GET_MODE (retval
)))
1586 tmp_retval
= gen_reg_rtx (GET_MODE (retval
));
1587 call
= gen_rtx_SET (tmp_retval
, call
);
1589 XVECEXP (pat
, 0, vec_pos
++) = call
;
1591 /* Construct the call insn, including a USE for each argument pseudo
1592 register. These will be used when printing the insn. */
1593 for (rtx arg
= cfun
->machine
->call_args
; arg
; arg
= XEXP (arg
, 1))
1594 XVECEXP (pat
, 0, vec_pos
++) = gen_rtx_USE (VOIDmode
, XEXP (arg
, 0));
1597 XVECEXP (pat
, 0, vec_pos
++) = gen_rtx_USE (VOIDmode
, varargs
);
1599 gcc_assert (vec_pos
= XVECLEN (pat
, 0));
1601 nvptx_emit_forking (parallel
, true);
1602 emit_call_insn (pat
);
1603 nvptx_emit_joining (parallel
, true);
1605 if (tmp_retval
!= retval
)
1606 emit_move_insn (retval
, tmp_retval
);
1609 /* Emit a comparison COMPARE, and return the new test to be used in the
1613 nvptx_expand_compare (rtx compare
)
1615 rtx pred
= gen_reg_rtx (BImode
);
1616 rtx cmp
= gen_rtx_fmt_ee (GET_CODE (compare
), BImode
,
1617 XEXP (compare
, 0), XEXP (compare
, 1));
1618 emit_insn (gen_rtx_SET (pred
, cmp
));
1619 return gen_rtx_NE (BImode
, pred
, const0_rtx
);
/* NOTE(review): extraction dropped the return type and braces of this
   definition.  Tokens preserved byte-for-byte; only comments added.  */
1622 /* Expand the oacc fork & join primitive into ptx-required unspecs. */
/* MODE here is a GOMP_DIM_* axis index, not a machine_mode.  */
1625 nvptx_expand_oacc_fork (unsigned mode
)
1627 nvptx_emit_forking (GOMP_DIM_MASK (mode
), false);
/* NOTE(review): counterpart of nvptx_expand_oacc_fork; extraction dropped
   the return type and braces.  Tokens preserved byte-for-byte; only
   comments added.  MODE is a GOMP_DIM_* axis index.  */
1631 nvptx_expand_oacc_join (unsigned mode
)
1633 nvptx_emit_joining (GOMP_DIM_MASK (mode
), false);
1636 /* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
1640 nvptx_gen_unpack (rtx dst0
, rtx dst1
, rtx src
)
1644 switch (GET_MODE (src
))
1647 res
= gen_unpackdisi2 (dst0
, dst1
, src
);
1650 res
= gen_unpackdfsi2 (dst0
, dst1
, src
);
1652 default: gcc_unreachable ();
1657 /* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
1661 nvptx_gen_pack (rtx dst
, rtx src0
, rtx src1
)
1665 switch (GET_MODE (dst
))
1668 res
= gen_packsidi2 (dst
, src0
, src1
);
1671 res
= gen_packsidf2 (dst
, src0
, src1
);
1673 default: gcc_unreachable ();
1678 /* Generate an instruction or sequence to broadcast register REG
1679 across the vectors of a single warp. */
1682 nvptx_gen_shuffle (rtx dst
, rtx src
, rtx idx
, nvptx_shuffle_kind kind
)
1686 switch (GET_MODE (dst
))
1689 res
= gen_nvptx_shufflesi (dst
, src
, idx
, GEN_INT (kind
));
1692 res
= gen_nvptx_shufflesf (dst
, src
, idx
, GEN_INT (kind
));
1697 rtx tmp0
= gen_reg_rtx (SImode
);
1698 rtx tmp1
= gen_reg_rtx (SImode
);
1701 emit_insn (nvptx_gen_unpack (tmp0
, tmp1
, src
));
1702 emit_insn (nvptx_gen_shuffle (tmp0
, tmp0
, idx
, kind
));
1703 emit_insn (nvptx_gen_shuffle (tmp1
, tmp1
, idx
, kind
));
1704 emit_insn (nvptx_gen_pack (dst
, tmp0
, tmp1
));
1711 rtx tmp
= gen_reg_rtx (SImode
);
1714 emit_insn (gen_sel_truesi (tmp
, src
, GEN_INT (1), const0_rtx
));
1715 emit_insn (nvptx_gen_shuffle (tmp
, tmp
, idx
, kind
));
1716 emit_insn (gen_rtx_SET (dst
, gen_rtx_NE (BImode
, tmp
, const0_rtx
)));
1724 rtx tmp
= gen_reg_rtx (SImode
);
1727 emit_insn (gen_rtx_SET (tmp
, gen_rtx_fmt_e (ZERO_EXTEND
, SImode
, src
)));
1728 emit_insn (nvptx_gen_shuffle (tmp
, tmp
, idx
, kind
));
1729 emit_insn (gen_rtx_SET (dst
, gen_rtx_fmt_e (TRUNCATE
, GET_MODE (dst
),
/* NOTE(review): extraction dropped the return type and braces of this
   definition.  Tokens preserved byte-for-byte; only comments added.  */
1742 /* Generate an instruction or sequence to broadcast register REG
1743 across the vectors of a single warp. */
1746 nvptx_gen_vcast (rtx reg
)
/* A SHUFFLE_IDX shuffle with index 0 reads REG from lane 0, i.e. it
   broadcasts lane 0's value to every lane of the warp.  */
1748 return nvptx_gen_shuffle (reg
, reg
, const0_rtx
, SHUFFLE_IDX
);
1751 /* Structure used when generating a worker-level spill or fill. */
1755 rtx base
; /* Register holding base addr of buffer. */
1756 rtx ptr
; /* Iteration var, if needed. */
1757 unsigned offset
; /* Offset into worker buffer. */
1760 /* Direction of the spill/fill and looping setup/teardown indicator. */
1766 PM_loop_begin
= 1 << 2,
1767 PM_loop_end
= 1 << 3,
1769 PM_read_write
= PM_read
| PM_write
1772 /* Generate instruction(s) to spill or fill register REG to/from the
1773 worker broadcast array. PM indicates what is to be done, REP
1774 how many loop iterations will be executed (0 for not a loop). */
1777 nvptx_gen_wcast (rtx reg
, propagate_mask pm
, unsigned rep
, wcast_data_t
*data
)
1780 machine_mode mode
= GET_MODE (reg
);
1786 rtx tmp
= gen_reg_rtx (SImode
);
1790 emit_insn (gen_sel_truesi (tmp
, reg
, GEN_INT (1), const0_rtx
));
1791 emit_insn (nvptx_gen_wcast (tmp
, pm
, rep
, data
));
1793 emit_insn (gen_rtx_SET (reg
, gen_rtx_NE (BImode
, tmp
, const0_rtx
)));
1801 rtx addr
= data
->ptr
;
1805 unsigned align
= GET_MODE_ALIGNMENT (mode
) / BITS_PER_UNIT
;
1807 if (align
> worker_bcast_align
)
1808 worker_bcast_align
= align
;
1809 data
->offset
= (data
->offset
+ align
- 1) & ~(align
- 1);
1812 addr
= gen_rtx_PLUS (Pmode
, addr
, GEN_INT (data
->offset
));
1815 addr
= gen_rtx_MEM (mode
, addr
);
1817 res
= gen_rtx_SET (addr
, reg
);
1818 else if (pm
== PM_write
)
1819 res
= gen_rtx_SET (reg
, addr
);
1825 /* We're using a ptr, increment it. */
1829 emit_insn (gen_adddi3 (data
->ptr
, data
->ptr
,
1830 GEN_INT (GET_MODE_SIZE (GET_MODE (reg
)))));
1836 data
->offset
+= rep
* GET_MODE_SIZE (GET_MODE (reg
));
1843 /* Returns true if X is a valid address for use in a memory reference. */
1846 nvptx_legitimate_address_p (machine_mode
, rtx x
, bool)
1848 enum rtx_code code
= GET_CODE (x
);
1856 if (REG_P (XEXP (x
, 0)) && CONST_INT_P (XEXP (x
, 1)))
1870 /* Machinery to output constant initializers. When beginning an
1871 initializer, we decide on a fragment size (which is visible in ptx
1872 in the type used), and then all initializer data is buffered until
1873 a fragment is filled and ready to be written out. */
1877 unsigned HOST_WIDE_INT mask
; /* Mask for storing fragment. */
1878 unsigned HOST_WIDE_INT val
; /* Current fragment value. */
1879 unsigned HOST_WIDE_INT remaining
; /* Remaining bytes to be written
1881 unsigned size
; /* Fragment size to accumulate. */
1882 unsigned offset
; /* Offset within current fragment. */
1883 bool started
; /* Whether we've output any initializer. */
1886 /* The current fragment is full, write it out. SYM may provide a
1887 symbolic reference we should output, in which case the fragment
1888 value is the addend. */
1891 output_init_frag (rtx sym
)
1893 fprintf (asm_out_file
, init_frag
.started
? ", " : " = { ");
1894 unsigned HOST_WIDE_INT val
= init_frag
.val
;
1896 init_frag
.started
= true;
1898 init_frag
.offset
= 0;
1899 init_frag
.remaining
--;
1903 bool function
= (SYMBOL_REF_DECL (sym
)
1904 && (TREE_CODE (SYMBOL_REF_DECL (sym
)) == FUNCTION_DECL
));
1906 fprintf (asm_out_file
, "generic(");
1907 output_address (VOIDmode
, sym
);
1909 fprintf (asm_out_file
, ")");
1911 fprintf (asm_out_file
, " + ");
1915 fprintf (asm_out_file
, HOST_WIDE_INT_PRINT_DEC
, val
);
1918 /* Add value VAL of size SIZE to the data we're emitting, and keep
1919 writing out chunks as they fill up. */
1922 nvptx_assemble_value (unsigned HOST_WIDE_INT val
, unsigned size
)
1924 val
&= ((unsigned HOST_WIDE_INT
)2 << (size
* BITS_PER_UNIT
- 1)) - 1;
1926 for (unsigned part
= 0; size
; size
-= part
)
1928 val
>>= part
* BITS_PER_UNIT
;
1929 part
= init_frag
.size
- init_frag
.offset
;
1933 unsigned HOST_WIDE_INT partial
1934 = val
<< (init_frag
.offset
* BITS_PER_UNIT
);
1935 init_frag
.val
|= partial
& init_frag
.mask
;
1936 init_frag
.offset
+= part
;
1938 if (init_frag
.offset
== init_frag
.size
)
1939 output_init_frag (NULL
);
1943 /* Target hook for assembling integer object X of size SIZE. */
1946 nvptx_assemble_integer (rtx x
, unsigned int size
, int ARG_UNUSED (aligned_p
))
1948 HOST_WIDE_INT val
= 0;
1950 switch (GET_CODE (x
))
1953 /* Let the generic machinery figure it out, usually for a
1958 nvptx_assemble_value (INTVAL (x
), size
);
1963 gcc_assert (GET_CODE (x
) == PLUS
);
1964 val
= INTVAL (XEXP (x
, 1));
1966 gcc_assert (GET_CODE (x
) == SYMBOL_REF
);
1970 gcc_assert (size
== init_frag
.size
);
1971 if (init_frag
.offset
)
1972 sorry ("cannot emit unaligned pointers in ptx assembly");
1974 nvptx_maybe_record_fnsym (x
);
1975 init_frag
.val
= val
;
1976 output_init_frag (x
);
1983 /* Output SIZE zero bytes. We ignore the FILE argument since the
1984 functions we're calling to perform the output just use
1988 nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size
)
1990 /* Finish the current fragment, if it's started. */
1991 if (init_frag
.offset
)
1993 unsigned part
= init_frag
.size
- init_frag
.offset
;
1995 part
= (unsigned) size
;
1997 nvptx_assemble_value (0, part
);
2000 /* If this skip doesn't terminate the initializer, write as many
2001 remaining pieces as possible directly. */
2002 if (size
< init_frag
.remaining
* init_frag
.size
)
2004 while (size
>= init_frag
.size
)
2006 size
-= init_frag
.size
;
2007 output_init_frag (NULL_RTX
);
2010 nvptx_assemble_value (0, size
);
2014 /* Output a string STR with length SIZE. As in nvptx_output_skip we
2015 ignore the FILE arg. */
2018 nvptx_output_ascii (FILE *, const char *str
, unsigned HOST_WIDE_INT size
)
2020 for (unsigned HOST_WIDE_INT i
= 0; i
< size
; i
++)
2021 nvptx_assemble_value (str
[i
], 1);
2024 /* Emit a PTX variable decl and prepare for emission of its
2025 initializer. NAME is the symbol name and SETION the PTX data
2026 area. The type is TYPE, object size SIZE and alignment is ALIGN.
2027 The caller has already emitted any indentation and linkage
2028 specifier. It is responsible for any initializer, terminating ;
2029 and newline. SIZE is in bytes, ALIGN is in bits -- confusingly
2030 this is the opposite way round that PTX wants them! */
2033 nvptx_assemble_decl_begin (FILE *file
, const char *name
, const char *section
,
2034 const_tree type
, HOST_WIDE_INT size
, unsigned align
)
2036 bool atype
= (TREE_CODE (type
) == ARRAY_TYPE
)
2037 && (TYPE_DOMAIN (type
) == NULL_TREE
);
2039 while (TREE_CODE (type
) == ARRAY_TYPE
)
2040 type
= TREE_TYPE (type
);
2042 if (TREE_CODE (type
) == VECTOR_TYPE
2043 || TREE_CODE (type
) == COMPLEX_TYPE
)
2044 /* Neither vector nor complex types can contain the other. */
2045 type
= TREE_TYPE (type
);
2047 unsigned elt_size
= int_size_in_bytes (type
);
2049 /* Largest mode we're prepared to accept. For BLKmode types we
2050 don't know if it'll contain pointer constants, so have to choose
2051 pointer size, otherwise we can choose DImode. */
2052 machine_mode elt_mode
= TYPE_MODE (type
) == BLKmode
? Pmode
: DImode
;
2054 elt_size
|= GET_MODE_SIZE (elt_mode
);
2055 elt_size
&= -elt_size
; /* Extract LSB set. */
2057 init_frag
.size
= elt_size
;
2058 /* Avoid undefined shift behavior by using '2'. */
2059 init_frag
.mask
= ((unsigned HOST_WIDE_INT
)2
2060 << (elt_size
* BITS_PER_UNIT
- 1)) - 1;
2062 init_frag
.offset
= 0;
2063 init_frag
.started
= false;
2064 /* Size might not be a multiple of elt size, if there's an
2065 initialized trailing struct array with smaller type than
2067 init_frag
.remaining
= (size
+ elt_size
- 1) / elt_size
;
2069 fprintf (file
, "%s .align %d .u%d ",
2070 section
, align
/ BITS_PER_UNIT
,
2071 elt_size
* BITS_PER_UNIT
);
2072 assemble_name (file
, name
);
2075 /* We make everything an array, to simplify any initialization
2077 fprintf (file
, "[" HOST_WIDE_INT_PRINT_DEC
"]", init_frag
.remaining
);
2079 fprintf (file
, "[]");
2082 /* Called when the initializer for a decl has been completely output through
2083 combinations of the three functions above. */
2086 nvptx_assemble_decl_end (void)
2088 if (init_frag
.offset
)
2089 /* This can happen with a packed struct with trailing array member. */
2090 nvptx_assemble_value (0, init_frag
.size
- init_frag
.offset
);
2091 fprintf (asm_out_file
, init_frag
.started
? " };\n" : ";\n");
2094 /* Output an uninitialized common or file-scope variable. */
2097 nvptx_output_aligned_decl (FILE *file
, const char *name
,
2098 const_tree decl
, HOST_WIDE_INT size
, unsigned align
)
2100 write_var_marker (file
, true, TREE_PUBLIC (decl
), name
);
2102 /* If this is public, it is common. The nearest thing we have to
2104 fprintf (file
, "\t%s", TREE_PUBLIC (decl
) ? ".weak " : "");
2106 nvptx_assemble_decl_begin (file
, name
, section_for_decl (decl
),
2107 TREE_TYPE (decl
), size
, align
);
2108 nvptx_assemble_decl_end ();
2111 /* Implement TARGET_ASM_DECLARE_CONSTANT_NAME. Begin the process of
2112 writing a constant variable EXP with NAME and SIZE and its
2113 initializer to FILE. */
2116 nvptx_asm_declare_constant_name (FILE *file
, const char *name
,
2117 const_tree exp
, HOST_WIDE_INT obj_size
)
2119 write_var_marker (file
, true, false, name
);
2121 fprintf (file
, "\t");
2123 tree type
= TREE_TYPE (exp
);
2124 nvptx_assemble_decl_begin (file
, name
, ".const", type
, obj_size
,
2128 /* Implement the ASM_DECLARE_OBJECT_NAME macro. Used to start writing
2129 a variable DECL with NAME to FILE. */
2132 nvptx_declare_object_name (FILE *file
, const char *name
, const_tree decl
)
2134 write_var_marker (file
, true, TREE_PUBLIC (decl
), name
);
2136 fprintf (file
, "\t%s", (!TREE_PUBLIC (decl
) ? ""
2137 : DECL_WEAK (decl
) ? ".weak " : ".visible "));
2139 tree type
= TREE_TYPE (decl
);
2140 HOST_WIDE_INT obj_size
= tree_to_shwi (DECL_SIZE_UNIT (decl
));
2141 nvptx_assemble_decl_begin (file
, name
, section_for_decl (decl
),
2142 type
, obj_size
, DECL_ALIGN (decl
));
2145 /* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing. */
2148 nvptx_globalize_label (FILE *, const char *)
2152 /* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL. Write an extern
2153 declaration only for variable DECL with NAME to FILE. */
2156 nvptx_assemble_undefined_decl (FILE *file
, const char *name
, const_tree decl
)
2158 /* The middle end can place constant pool decls into the varpool as
2159 undefined. Until that is fixed, catch the problem here. */
2160 if (DECL_IN_CONSTANT_POOL (decl
))
2163 /* We support weak defintions, and hence have the right
2164 ASM_WEAKEN_DECL definition. Diagnose the problem here. */
2165 if (DECL_WEAK (decl
))
2166 error_at (DECL_SOURCE_LOCATION (decl
),
2167 "PTX does not support weak declarations"
2168 " (only weak definitions)");
2169 write_var_marker (file
, false, TREE_PUBLIC (decl
), name
);
2171 fprintf (file
, "\t.extern ");
2172 tree size
= DECL_SIZE_UNIT (decl
);
2173 nvptx_assemble_decl_begin (file
, name
, section_for_decl (decl
),
2174 TREE_TYPE (decl
), size
? tree_to_shwi (size
) : 0,
2176 nvptx_assemble_decl_end ();
2179 /* Output a pattern for a move instruction. */
2182 nvptx_output_mov_insn (rtx dst
, rtx src
)
2184 machine_mode dst_mode
= GET_MODE (dst
);
2185 machine_mode dst_inner
= (GET_CODE (dst
) == SUBREG
2186 ? GET_MODE (XEXP (dst
, 0)) : dst_mode
);
2187 machine_mode src_inner
= (GET_CODE (src
) == SUBREG
2188 ? GET_MODE (XEXP (src
, 0)) : dst_mode
);
2191 if (GET_CODE (sym
) == CONST
)
2192 sym
= XEXP (XEXP (sym
, 0), 0);
2193 if (SYMBOL_REF_P (sym
))
2195 if (SYMBOL_DATA_AREA (sym
) != DATA_AREA_GENERIC
)
2196 return "%.\tcvta%D1%t0\t%0, %1;";
2197 nvptx_maybe_record_fnsym (sym
);
2200 if (src_inner
== dst_inner
)
2201 return "%.\tmov%t0\t%0, %1;";
2203 if (CONSTANT_P (src
))
2204 return (GET_MODE_CLASS (dst_inner
) == MODE_INT
2205 && GET_MODE_CLASS (src_inner
) != MODE_FLOAT
2206 ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");
2208 if (GET_MODE_SIZE (dst_inner
) == GET_MODE_SIZE (src_inner
))
2210 if (GET_MODE_BITSIZE (dst_mode
) == 128
2211 && GET_MODE_BITSIZE (GET_MODE (src
)) == 128)
2213 /* mov.b128 is not supported. */
2214 if (dst_inner
== V2DImode
&& src_inner
== TImode
)
2215 return "%.\tmov.u64\t%0.x, %L1;\n\t%.\tmov.u64\t%0.y, %H1;";
2216 else if (dst_inner
== TImode
&& src_inner
== V2DImode
)
2217 return "%.\tmov.u64\t%L0, %1.x;\n\t%.\tmov.u64\t%H0, %1.y;";
2221 return "%.\tmov.b%T0\t%0, %1;";
2224 return "%.\tcvt%t0%t1\t%0, %1;";
2227 static void nvptx_print_operand (FILE *, rtx
, int);
2229 /* Output INSN, which is a call to CALLEE with result RESULT. For ptx, this
2230 involves writing .param declarations and in/out copies into them. For
2231 indirect calls, also write the .callprototype. */
2234 nvptx_output_call_insn (rtx_insn
*insn
, rtx result
, rtx callee
)
2238 bool needs_tgt
= register_operand (callee
, Pmode
);
2239 rtx pat
= PATTERN (insn
);
2240 if (GET_CODE (pat
) == COND_EXEC
)
2241 pat
= COND_EXEC_CODE (pat
);
2242 int arg_end
= XVECLEN (pat
, 0);
2243 tree decl
= NULL_TREE
;
2245 fprintf (asm_out_file
, "\t{\n");
2247 fprintf (asm_out_file
, "\t\t.param%s %s_in;\n",
2248 nvptx_ptx_type_from_mode (GET_MODE (result
), false),
2249 reg_names
[NVPTX_RETURN_REGNUM
]);
2251 /* Ensure we have a ptx declaration in the output if necessary. */
2252 if (GET_CODE (callee
) == SYMBOL_REF
)
2254 decl
= SYMBOL_REF_DECL (callee
);
2256 || (DECL_EXTERNAL (decl
) && !TYPE_ARG_TYPES (TREE_TYPE (decl
))))
2257 nvptx_record_libfunc (callee
, result
, pat
);
2258 else if (DECL_EXTERNAL (decl
))
2259 nvptx_record_fndecl (decl
);
2264 ASM_GENERATE_INTERNAL_LABEL (buf
, "LCT", labelno
);
2266 ASM_OUTPUT_LABEL (asm_out_file
, buf
);
2267 std::stringstream s
;
2268 write_fn_proto_from_insn (s
, NULL
, result
, pat
);
2269 fputs (s
.str().c_str(), asm_out_file
);
2272 for (int argno
= 1; argno
< arg_end
; argno
++)
2274 rtx t
= XEXP (XVECEXP (pat
, 0, argno
), 0);
2275 machine_mode mode
= GET_MODE (t
);
2276 const char *ptx_type
= nvptx_ptx_type_from_mode (mode
, false);
2278 /* Mode splitting has already been done. */
2279 fprintf (asm_out_file
, "\t\t.param%s %%out_arg%d;\n"
2280 "\t\tst.param%s [%%out_arg%d], ",
2281 ptx_type
, argno
, ptx_type
, argno
);
2282 output_reg (asm_out_file
, REGNO (t
), VOIDmode
);
2283 fprintf (asm_out_file
, ";\n");
2286 /* The '.' stands for the call's predicate, if any. */
2287 nvptx_print_operand (asm_out_file
, NULL_RTX
, '.');
2288 fprintf (asm_out_file
, "\t\tcall ");
2289 if (result
!= NULL_RTX
)
2290 fprintf (asm_out_file
, "(%s_in), ", reg_names
[NVPTX_RETURN_REGNUM
]);
2294 const char *name
= get_fnname_from_decl (decl
);
2295 name
= nvptx_name_replacement (name
);
2296 assemble_name (asm_out_file
, name
);
2299 output_address (VOIDmode
, callee
);
2301 const char *open
= "(";
2302 for (int argno
= 1; argno
< arg_end
; argno
++)
2304 fprintf (asm_out_file
, ", %s%%out_arg%d", open
, argno
);
2307 if (decl
&& DECL_STATIC_CHAIN (decl
))
2309 fprintf (asm_out_file
, ", %s%s", open
, reg_names
[STATIC_CHAIN_REGNUM
]);
2313 fprintf (asm_out_file
, ")");
2317 fprintf (asm_out_file
, ", ");
2318 assemble_name (asm_out_file
, buf
);
2320 fprintf (asm_out_file
, ";\n");
2322 if (find_reg_note (insn
, REG_NORETURN
, NULL
))
2324 /* No return functions confuse the PTX JIT, as it doesn't realize
2325 the flow control barrier they imply. It can seg fault if it
2326 encounters what looks like an unexitable loop. Emit a trailing
2327 trap and exit, which it does grok. */
2328 fprintf (asm_out_file
, "\t\ttrap; // (noreturn)\n");
2329 fprintf (asm_out_file
, "\t\texit; // (noreturn)\n");
2334 static char rval
[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];
2337 /* We must escape the '%' that starts RETURN_REGNUM. */
2338 sprintf (rval
, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
2339 reg_names
[NVPTX_RETURN_REGNUM
]);
/* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P.  The only punctuation
   codes nvptx_print_operand understands are '.' (instruction predicate)
   and '#' (rounding mode).  */

static bool
nvptx_print_operand_punct_valid_p (unsigned char c)
{
  switch (c)
    {
    case '.':
    case '#':
      return true;
    default:
      return false;
    }
}
2354 /* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE. */
2357 nvptx_print_address_operand (FILE *file
, rtx x
, machine_mode
)
2360 if (GET_CODE (x
) == CONST
)
2362 switch (GET_CODE (x
))
2366 output_address (VOIDmode
, XEXP (x
, 0));
2367 fprintf (file
, "+");
2368 output_address (VOIDmode
, off
);
2373 output_addr_const (file
, x
);
2377 gcc_assert (GET_CODE (x
) != MEM
);
2378 nvptx_print_operand (file
, x
, 0);
2383 /* Write assembly language output for the address ADDR to FILE. */
2386 nvptx_print_operand_address (FILE *file
, machine_mode mode
, rtx addr
)
2388 nvptx_print_address_operand (file
, addr
, mode
);
2391 /* Print an operand, X, to FILE, with an optional modifier in CODE.
2394 . -- print the predicate for the instruction or an emptry string for an
2396 # -- print a rounding mode for the instruction
2398 A -- print a data area for a MEM
2399 c -- print an opcode suffix for a comparison operator, including a type code
2400 D -- print a data area for a MEM operand
2401 S -- print a shuffle kind specified by CONST_INT
2402 t -- print a type opcode suffix, promoting QImode to 32 bits
2403 T -- print a type size in bits
2404 u -- print a type opcode suffix without promotions. */
2407 nvptx_print_operand (FILE *file
, rtx x
, int code
)
2411 x
= current_insn_predicate
;
2415 if (GET_CODE (x
) == EQ
)
2417 output_reg (file
, REGNO (XEXP (x
, 0)), VOIDmode
);
2421 else if (code
== '#')
2423 fputs (".rn", file
);
2427 enum rtx_code x_code
= GET_CODE (x
);
2428 machine_mode mode
= GET_MODE (x
);
2437 if (GET_CODE (x
) == CONST
)
2439 if (GET_CODE (x
) == PLUS
)
2442 if (GET_CODE (x
) == SYMBOL_REF
)
2443 fputs (section_for_sym (x
), file
);
2448 if (x_code
== SUBREG
)
2450 machine_mode inner_mode
= GET_MODE (SUBREG_REG (x
));
2451 if (VECTOR_MODE_P (inner_mode
)
2452 && (GET_MODE_SIZE (mode
)
2453 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode
))))
2454 mode
= GET_MODE_INNER (inner_mode
);
2455 else if (split_mode_p (inner_mode
))
2456 mode
= maybe_split_mode (inner_mode
);
2460 fprintf (file
, "%s", nvptx_ptx_type_from_mode (mode
, code
== 't'));
2466 rtx inner_x
= SUBREG_REG (x
);
2467 machine_mode inner_mode
= GET_MODE (inner_x
);
2468 machine_mode split
= maybe_split_mode (inner_mode
);
2470 output_reg (file
, REGNO (inner_x
), split
,
2472 ? GET_MODE_SIZE (inner_mode
) / 2
2479 nvptx_shuffle_kind kind
= (nvptx_shuffle_kind
) UINTVAL (x
);
2480 /* Same order as nvptx_shuffle_kind. */
2481 static const char *const kinds
[] =
2482 {".up", ".down", ".bfly", ".idx"};
2483 fputs (kinds
[kind
], file
);
2488 fprintf (file
, "%d", GET_MODE_BITSIZE (mode
));
2492 fprintf (file
, "@");
2496 fprintf (file
, "@!");
2500 mode
= GET_MODE (XEXP (x
, 0));
2504 fputs (".eq", file
);
2507 if (FLOAT_MODE_P (mode
))
2508 fputs (".neu", file
);
2510 fputs (".ne", file
);
2514 fputs (".le", file
);
2518 fputs (".ge", file
);
2522 fputs (".lt", file
);
2526 fputs (".gt", file
);
2529 fputs (".ne", file
);
2532 fputs (".equ", file
);
2535 fputs (".leu", file
);
2538 fputs (".geu", file
);
2541 fputs (".ltu", file
);
2544 fputs (".gtu", file
);
2547 fputs (".nan", file
);
2550 fputs (".num", file
);
2555 if (FLOAT_MODE_P (mode
)
2556 || x_code
== EQ
|| x_code
== NE
2557 || x_code
== GEU
|| x_code
== GTU
2558 || x_code
== LEU
|| x_code
== LTU
)
2559 fputs (nvptx_ptx_type_from_mode (mode
, true), file
);
2561 fprintf (file
, ".s%d", GET_MODE_BITSIZE (mode
));
2569 rtx inner_x
= SUBREG_REG (x
);
2570 machine_mode inner_mode
= GET_MODE (inner_x
);
2571 machine_mode split
= maybe_split_mode (inner_mode
);
2573 if (VECTOR_MODE_P (inner_mode
)
2574 && (GET_MODE_SIZE (mode
)
2575 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode
))))
2577 output_reg (file
, REGNO (inner_x
), VOIDmode
);
2578 fprintf (file
, ".%s", SUBREG_BYTE (x
) == 0 ? "x" : "y");
2580 else if (split_mode_p (inner_mode
)
2581 && (GET_MODE_SIZE (inner_mode
) == GET_MODE_SIZE (mode
)))
2582 output_reg (file
, REGNO (inner_x
), split
);
2584 output_reg (file
, REGNO (inner_x
), split
, SUBREG_BYTE (x
));
2589 output_reg (file
, REGNO (x
), maybe_split_mode (mode
));
2594 nvptx_print_address_operand (file
, XEXP (x
, 0), mode
);
2599 output_addr_const (file
, x
);
2605 /* We could use output_addr_const, but that can print things like
2606 "x-8", which breaks ptxas. Need to ensure it is output as
2608 nvptx_print_address_operand (file
, x
, VOIDmode
);
2613 real_to_target (vals
, CONST_DOUBLE_REAL_VALUE (x
), mode
);
2614 vals
[0] &= 0xffffffff;
2615 vals
[1] &= 0xffffffff;
2617 fprintf (file
, "0f%08lx", vals
[0]);
2619 fprintf (file
, "0d%08lx%08lx", vals
[1], vals
[0]);
2624 unsigned n
= CONST_VECTOR_NUNITS (x
);
2625 fprintf (file
, "{ ");
2626 for (unsigned i
= 0; i
< n
; ++i
)
2629 fprintf (file
, ", ");
2631 rtx elem
= CONST_VECTOR_ELT (x
, i
);
2632 output_addr_const (file
, elem
);
2634 fprintf (file
, " }");
2639 output_addr_const (file
, x
);
2644 /* Record replacement regs used to deal with subreg operands. */
2647 rtx replacement
[MAX_RECOG_OPERANDS
];
2653 /* Allocate or reuse a replacement in R and return the rtx. */
2656 get_replacement (struct reg_replace
*r
)
2658 if (r
->n_allocated
== r
->n_in_use
)
2659 r
->replacement
[r
->n_allocated
++] = gen_reg_rtx (r
->mode
);
2660 return r
->replacement
[r
->n_in_use
++];
2663 /* Clean up subreg operands. In ptx assembly, everything is typed, and
2664 the presence of subregs would break the rules for most instructions.
2665 Replace them with a suitable new register of the right size, plus
2666 conversion copyin/copyout instructions. */
2669 nvptx_reorg_subreg (void)
2671 struct reg_replace qiregs
, hiregs
, siregs
, diregs
;
2672 rtx_insn
*insn
, *next
;
2674 qiregs
.n_allocated
= 0;
2675 hiregs
.n_allocated
= 0;
2676 siregs
.n_allocated
= 0;
2677 diregs
.n_allocated
= 0;
2678 qiregs
.mode
= QImode
;
2679 hiregs
.mode
= HImode
;
2680 siregs
.mode
= SImode
;
2681 diregs
.mode
= DImode
;
2683 for (insn
= get_insns (); insn
; insn
= next
)
2685 next
= NEXT_INSN (insn
);
2686 if (!NONDEBUG_INSN_P (insn
)
2687 || asm_noperands (PATTERN (insn
)) >= 0
2688 || GET_CODE (PATTERN (insn
)) == USE
2689 || GET_CODE (PATTERN (insn
)) == CLOBBER
)
2692 qiregs
.n_in_use
= 0;
2693 hiregs
.n_in_use
= 0;
2694 siregs
.n_in_use
= 0;
2695 diregs
.n_in_use
= 0;
2696 extract_insn (insn
);
2697 enum attr_subregs_ok s_ok
= get_attr_subregs_ok (insn
);
2699 for (int i
= 0; i
< recog_data
.n_operands
; i
++)
2701 rtx op
= recog_data
.operand
[i
];
2702 if (GET_CODE (op
) != SUBREG
)
2705 rtx inner
= SUBREG_REG (op
);
2707 machine_mode outer_mode
= GET_MODE (op
);
2708 machine_mode inner_mode
= GET_MODE (inner
);
2711 && (GET_MODE_PRECISION (inner_mode
)
2712 >= GET_MODE_PRECISION (outer_mode
)))
2714 gcc_assert (SCALAR_INT_MODE_P (outer_mode
));
2715 struct reg_replace
*r
= (outer_mode
== QImode
? &qiregs
2716 : outer_mode
== HImode
? &hiregs
2717 : outer_mode
== SImode
? &siregs
2719 rtx new_reg
= get_replacement (r
);
2721 if (recog_data
.operand_type
[i
] != OP_OUT
)
2724 if (GET_MODE_PRECISION (inner_mode
)
2725 < GET_MODE_PRECISION (outer_mode
))
2730 rtx pat
= gen_rtx_SET (new_reg
,
2731 gen_rtx_fmt_e (code
, outer_mode
, inner
));
2732 emit_insn_before (pat
, insn
);
2735 if (recog_data
.operand_type
[i
] != OP_IN
)
2738 if (GET_MODE_PRECISION (inner_mode
)
2739 < GET_MODE_PRECISION (outer_mode
))
2744 rtx pat
= gen_rtx_SET (inner
,
2745 gen_rtx_fmt_e (code
, inner_mode
, new_reg
));
2746 emit_insn_after (pat
, insn
);
2748 validate_change (insn
, recog_data
.operand_loc
[i
], new_reg
, false);
2753 /* Return a SImode "master lane index" register for uniform-simt, allocating on
2757 nvptx_get_unisimt_master ()
2759 rtx
&master
= cfun
->machine
->unisimt_master
;
2760 return master
? master
: master
= gen_reg_rtx (SImode
);
2763 /* Return a BImode "predicate" register for uniform-simt, similar to above. */
2766 nvptx_get_unisimt_predicate ()
2768 rtx
&pred
= cfun
->machine
->unisimt_predicate
;
2769 return pred
? pred
: pred
= gen_reg_rtx (BImode
);
2772 /* Return true if given call insn references one of the functions provided by
2773 the CUDA runtime: malloc, free, vprintf. */
2776 nvptx_call_insn_is_syscall_p (rtx_insn
*insn
)
2778 rtx pat
= PATTERN (insn
);
2779 gcc_checking_assert (GET_CODE (pat
) == PARALLEL
);
2780 pat
= XVECEXP (pat
, 0, 0);
2781 if (GET_CODE (pat
) == SET
)
2782 pat
= SET_SRC (pat
);
2783 gcc_checking_assert (GET_CODE (pat
) == CALL
2784 && GET_CODE (XEXP (pat
, 0)) == MEM
);
2785 rtx addr
= XEXP (XEXP (pat
, 0), 0);
2786 if (GET_CODE (addr
) != SYMBOL_REF
)
2788 const char *name
= XSTR (addr
, 0);
2789 /* Ordinary malloc/free are redirected to __nvptx_{malloc,free), so only the
2790 references with forced assembler name refer to PTX syscalls. For vprintf,
2791 accept both normal and forced-assembler-name references. */
2792 return (!strcmp (name
, "vprintf") || !strcmp (name
, "*vprintf")
2793 || !strcmp (name
, "*malloc")
2794 || !strcmp (name
, "*free"));
2797 /* If SET subexpression of INSN sets a register, emit a shuffle instruction to
2798 propagate its value from lane MASTER to current lane. */
2801 nvptx_unisimt_handle_set (rtx set
, rtx_insn
*insn
, rtx master
)
2804 if (GET_CODE (set
) == SET
&& REG_P (reg
= SET_DEST (set
)))
2805 emit_insn_after (nvptx_gen_shuffle (reg
, reg
, master
, SHUFFLE_IDX
), insn
);
2808 /* Adjust code for uniform-simt code generation variant by making atomics and
2809 "syscalls" conditionally executed, and inserting shuffle-based propagation
2810 for registers being set. */
2813 nvptx_reorg_uniform_simt ()
2815 rtx_insn
*insn
, *next
;
2817 for (insn
= get_insns (); insn
; insn
= next
)
2819 next
= NEXT_INSN (insn
);
2820 if (!(CALL_P (insn
) && nvptx_call_insn_is_syscall_p (insn
))
2821 && !(NONJUMP_INSN_P (insn
)
2822 && GET_CODE (PATTERN (insn
)) == PARALLEL
2823 && get_attr_atomic (insn
)))
2825 rtx pat
= PATTERN (insn
);
2826 rtx master
= nvptx_get_unisimt_master ();
2827 for (int i
= 0; i
< XVECLEN (pat
, 0); i
++)
2828 nvptx_unisimt_handle_set (XVECEXP (pat
, 0, i
), insn
, master
);
2829 rtx pred
= nvptx_get_unisimt_predicate ();
2830 pred
= gen_rtx_NE (BImode
, pred
, const0_rtx
);
2831 pat
= gen_rtx_COND_EXEC (VOIDmode
, pred
, pat
);
2832 validate_change (insn
, &PATTERN (insn
), pat
, false);
2836 /* Loop structure of the function. The entire function is described as
2841 /* Parent parallel. */
2844 /* Next sibling parallel. */
2847 /* First child parallel. */
2850 /* Partitioning mask of the parallel. */
2853 /* Partitioning used within inner parallels. */
2854 unsigned inner_mask
;
2856 /* Location of parallel forked and join. The forked is the first
2857 block in the parallel and the join is the first block after of
2859 basic_block forked_block
;
2860 basic_block join_block
;
2862 rtx_insn
*forked_insn
;
2863 rtx_insn
*join_insn
;
2865 rtx_insn
*fork_insn
;
2866 rtx_insn
*joining_insn
;
2868 /* Basic blocks in this parallel, but not in child parallels. The
2869 FORKED and JOINING blocks are in the partition. The FORK and JOIN
2871 auto_vec
<basic_block
> blocks
;
2874 parallel (parallel
*parent
, unsigned mode
);
2878 /* Constructor links the new parallel into it's parent's chain of
2881 parallel::parallel (parallel
*parent_
, unsigned mask_
)
2882 :parent (parent_
), next (0), inner (0), mask (mask_
), inner_mask (0)
2884 forked_block
= join_block
= 0;
2885 forked_insn
= join_insn
= 0;
2886 fork_insn
= joining_insn
= 0;
2890 next
= parent
->inner
;
2891 parent
->inner
= this;
2895 parallel::~parallel ()
2901 /* Map of basic blocks to insns */
2902 typedef hash_map
<basic_block
, rtx_insn
*> bb_insn_map_t
;
2904 /* A tuple of an insn of interest and the BB in which it resides. */
2905 typedef std::pair
<rtx_insn
*, basic_block
> insn_bb_t
;
2906 typedef auto_vec
<insn_bb_t
> insn_bb_vec_t
;
2908 /* Split basic blocks such that each forked and join unspecs are at
2909 the start of their basic blocks. Thus afterwards each block will
2910 have a single partitioning mode. We also do the same for return
2911 insns, as they are executed by every thread. Return the
2912 partitioning mode of the function as a whole. Populate MAP with
2913 head and tail blocks. We also clear the BB visited flag, which is
2914 used when finding partitions. */
2917 nvptx_split_blocks (bb_insn_map_t
*map
)
2919 insn_bb_vec_t worklist
;
2923 /* Locate all the reorg instructions of interest. */
2924 FOR_ALL_BB_FN (block
, cfun
)
2926 bool seen_insn
= false;
2928 /* Clear visited flag, for use by parallel locator */
2929 block
->flags
&= ~BB_VISITED
;
2931 FOR_BB_INSNS (block
, insn
)
2935 switch (recog_memoized (insn
))
2940 case CODE_FOR_nvptx_forked
:
2941 case CODE_FOR_nvptx_join
:
2944 case CODE_FOR_return
:
2945 /* We also need to split just before return insns, as
2946 that insn needs executing by all threads, but the
2947 block it is in probably does not. */
2952 /* We've found an instruction that must be at the start of
2953 a block, but isn't. Add it to the worklist. */
2954 worklist
.safe_push (insn_bb_t (insn
, block
));
2956 /* It was already the first instruction. Just add it to
2958 map
->get_or_insert (block
) = insn
;
2963 /* Split blocks on the worklist. */
2966 basic_block remap
= 0;
2967 for (ix
= 0; worklist
.iterate (ix
, &elt
); ix
++)
2969 if (remap
!= elt
->second
)
2971 block
= elt
->second
;
2975 /* Split block before insn. The insn is in the new block */
2976 edge e
= split_block (block
, PREV_INSN (elt
->first
));
2979 map
->get_or_insert (block
) = elt
->first
;
2983 /* BLOCK is a basic block containing a head or tail instruction.
2984 Locate the associated prehead or pretail instruction, which must be
2985 in the single predecessor block. */
2988 nvptx_discover_pre (basic_block block
, int expected
)
2990 gcc_assert (block
->preds
->length () == 1);
2991 basic_block pre_block
= (*block
->preds
)[0]->src
;
2994 for (pre_insn
= BB_END (pre_block
); !INSN_P (pre_insn
);
2995 pre_insn
= PREV_INSN (pre_insn
))
2996 gcc_assert (pre_insn
!= BB_HEAD (pre_block
));
2998 gcc_assert (recog_memoized (pre_insn
) == expected
);
3002 /* Dump this parallel and all its inner parallels. */
3005 nvptx_dump_pars (parallel
*par
, unsigned depth
)
3007 fprintf (dump_file
, "%u: mask %d head=%d, tail=%d\n",
3009 par
->forked_block
? par
->forked_block
->index
: -1,
3010 par
->join_block
? par
->join_block
->index
: -1);
3012 fprintf (dump_file
, " blocks:");
3015 for (unsigned ix
= 0; par
->blocks
.iterate (ix
, &block
); ix
++)
3016 fprintf (dump_file
, " %d", block
->index
);
3017 fprintf (dump_file
, "\n");
3019 nvptx_dump_pars (par
->inner
, depth
+ 1);
3022 nvptx_dump_pars (par
->next
, depth
);
3025 /* If BLOCK contains a fork/join marker, process it to create or
3026 terminate a loop structure. Add this block to the current loop,
3027 and then walk successor blocks. */
3030 nvptx_find_par (bb_insn_map_t
*map
, parallel
*par
, basic_block block
)
3032 if (block
->flags
& BB_VISITED
)
3034 block
->flags
|= BB_VISITED
;
3036 if (rtx_insn
**endp
= map
->get (block
))
3038 rtx_insn
*end
= *endp
;
3040 /* This is a block head or tail, or return instruction. */
3041 switch (recog_memoized (end
))
3043 case CODE_FOR_return
:
3044 /* Return instructions are in their own block, and we
3045 don't need to do anything more. */
3048 case CODE_FOR_nvptx_forked
:
3049 /* Loop head, create a new inner loop and add it into
3050 our parent's child list. */
3052 unsigned mask
= UINTVAL (XVECEXP (PATTERN (end
), 0, 0));
3055 par
= new parallel (par
, mask
);
3056 par
->forked_block
= block
;
3057 par
->forked_insn
= end
;
3058 if (!(mask
& GOMP_DIM_MASK (GOMP_DIM_MAX
))
3059 && (mask
& GOMP_DIM_MASK (GOMP_DIM_WORKER
)))
3061 = nvptx_discover_pre (block
, CODE_FOR_nvptx_fork
);
3065 case CODE_FOR_nvptx_join
:
3066 /* A loop tail. Finish the current loop and return to
3069 unsigned mask
= UINTVAL (XVECEXP (PATTERN (end
), 0, 0));
3071 gcc_assert (par
->mask
== mask
);
3072 par
->join_block
= block
;
3073 par
->join_insn
= end
;
3074 if (!(mask
& GOMP_DIM_MASK (GOMP_DIM_MAX
))
3075 && (mask
& GOMP_DIM_MASK (GOMP_DIM_WORKER
)))
3077 = nvptx_discover_pre (block
, CODE_FOR_nvptx_joining
);
3088 /* Add this block onto the current loop's list of blocks. */
3089 par
->blocks
.safe_push (block
);
3091 /* This must be the entry block. Create a NULL parallel. */
3092 par
= new parallel (0, 0);
3094 /* Walk successor blocks. */
3098 FOR_EACH_EDGE (e
, ei
, block
->succs
)
3099 nvptx_find_par (map
, par
, e
->dest
);
3104 /* DFS walk the CFG looking for fork & join markers. Construct
3105 loop structures as we go. MAP is a mapping of basic blocks
3106 to head & tail markers, discovered when splitting blocks. This
3107 speeds up the discovery. We rely on the BB visited flag having
3108 been cleared when splitting blocks. */
3111 nvptx_discover_pars (bb_insn_map_t
*map
)
3115 /* Mark exit blocks as visited. */
3116 block
= EXIT_BLOCK_PTR_FOR_FN (cfun
);
3117 block
->flags
|= BB_VISITED
;
3119 /* And entry block as not. */
3120 block
= ENTRY_BLOCK_PTR_FOR_FN (cfun
);
3121 block
->flags
&= ~BB_VISITED
;
3123 parallel
*par
= nvptx_find_par (map
, 0, block
);
3127 fprintf (dump_file
, "\nLoops\n");
3128 nvptx_dump_pars (par
, 0);
3129 fprintf (dump_file
, "\n");
3135 /* Analyse a group of BBs within a partitioned region and create N
3136 Single-Entry-Single-Exit regions. Some of those regions will be
3137 trivial ones consisting of a single BB. The blocks of a
3138 partitioned region might form a set of disjoint graphs -- because
3139 the region encloses a differently partitoned sub region.
3141 We use the linear time algorithm described in 'Finding Regions Fast:
3142 Single Entry Single Exit and control Regions in Linear Time'
3143 Johnson, Pearson & Pingali. That algorithm deals with complete
3144 CFGs, where a back edge is inserted from END to START, and thus the
3145 problem becomes one of finding equivalent loops.
3147 In this case we have a partial CFG. We complete it by redirecting
3148 any incoming edge to the graph to be from an arbitrary external BB,
3149 and similarly redirecting any outgoing edge to be to that BB.
3150 Thus we end up with a closed graph.
3152 The algorithm works by building a spanning tree of an undirected
3153 graph and keeping track of back edges from nodes further from the
3154 root in the tree to nodes nearer to the root in the tree. In the
3155 description below, the root is up and the tree grows downwards.
3157 We avoid having to deal with degenerate back-edges to the same
3158 block, by splitting each BB into 3 -- one for input edges, one for
3159 the node itself and one for the output edges. Such back edges are
3160 referred to as 'Brackets'. Cycle equivalent nodes will have the
3161 same set of brackets.
3163 Determining bracket equivalency is done by maintaining a list of
3164 brackets in such a manner that the list length and final bracket
3165 uniquely identify the set.
3167 We use coloring to mark all BBs with cycle equivalency with the
3168 same color. This is the output of the 'Finding Regions Fast'
3169 algorithm. Notice it doesn't actually find the set of nodes within
3170 a particular region, just unorderd sets of nodes that are the
3171 entries and exits of SESE regions.
3173 After determining cycle equivalency, we need to find the minimal
3174 set of SESE regions. Do this with a DFS coloring walk of the
3175 complete graph. We're either 'looking' or 'coloring'. When
3176 looking, and we're in the subgraph, we start coloring the color of
3177 the current node, and remember that node as the start of the
3178 current color's SESE region. Every time we go to a new node, we
3179 decrement the count of nodes with thet color. If it reaches zero,
3180 we remember that node as the end of the current color's SESE region
3181 and return to 'looking'. Otherwise we color the node the current
3184 This way we end up with coloring the inside of non-trivial SESE
3185 regions with the color of that region. */
3187 /* A pair of BBs. We use this to represent SESE regions. */
3188 typedef std::pair
<basic_block
, basic_block
> bb_pair_t
;
3189 typedef auto_vec
<bb_pair_t
> bb_pair_vec_t
;
3191 /* A node in the undirected CFG. The discriminator SECOND indicates just
3192 above or just below the BB idicated by FIRST. */
3193 typedef std::pair
<basic_block
, int> pseudo_node_t
;
3195 /* A bracket indicates an edge towards the root of the spanning tree of the
3196 undirected graph. Each bracket has a color, determined
3197 from the currrent set of brackets. */
3200 pseudo_node_t back
; /* Back target */
3202 /* Current color and size of set. */
3206 bracket (pseudo_node_t back_
)
3207 : back (back_
), color (~0u), size (~0u)
3211 unsigned get_color (auto_vec
<unsigned> &color_counts
, unsigned length
)
3216 color
= color_counts
.length ();
3217 color_counts
.quick_push (0);
3219 color_counts
[color
]++;
3224 typedef auto_vec
<bracket
> bracket_vec_t
;
3226 /* Basic block info for finding SESE regions. */
3230 int node
; /* Node number in spanning tree. */
3231 int parent
; /* Parent node number. */
3233 /* The algorithm splits each node A into Ai, A', Ao. The incoming
3234 edges arrive at pseudo-node Ai and the outgoing edges leave at
3235 pseudo-node Ao. We have to remember which way we arrived at a
3236 particular node when generating the spanning tree. dir > 0 means
3237 we arrived at Ai, dir < 0 means we arrived at Ao. */
3240 /* Lowest numbered pseudo-node reached via a backedge from thsis
3241 node, or any descendant. */
3244 int color
; /* Cycle-equivalence color */
3246 /* Stack of brackets for this node. */
3247 bracket_vec_t brackets
;
3249 bb_sese (unsigned node_
, unsigned p
, int dir_
)
3250 :node (node_
), parent (p
), dir (dir_
)
3255 /* Push a bracket ending at BACK. */
3256 void push (const pseudo_node_t
&back
)
3259 fprintf (dump_file
, "Pushing backedge %d:%+d\n",
3260 back
.first
? back
.first
->index
: 0, back
.second
);
3261 brackets
.safe_push (bracket (back
));
3264 void append (bb_sese
*child
);
3265 void remove (const pseudo_node_t
&);
3267 /* Set node's color. */
3268 void set_color (auto_vec
<unsigned> &color_counts
)
3270 color
= brackets
.last ().get_color (color_counts
, brackets
.length ());
3274 bb_sese::~bb_sese ()
3278 /* Destructively append CHILD's brackets. */
3281 bb_sese::append (bb_sese
*child
)
3283 if (int len
= child
->brackets
.length ())
3289 for (ix
= 0; ix
< len
; ix
++)
3291 const pseudo_node_t
&pseudo
= child
->brackets
[ix
].back
;
3292 fprintf (dump_file
, "Appending (%d)'s backedge %d:%+d\n",
3293 child
->node
, pseudo
.first
? pseudo
.first
->index
: 0,
3297 if (!brackets
.length ())
3298 std::swap (brackets
, child
->brackets
);
3301 brackets
.reserve (len
);
3302 for (ix
= 0; ix
< len
; ix
++)
3303 brackets
.quick_push (child
->brackets
[ix
]);
3308 /* Remove brackets that terminate at PSEUDO. */
3311 bb_sese::remove (const pseudo_node_t
&pseudo
)
3313 unsigned removed
= 0;
3314 int len
= brackets
.length ();
3316 for (int ix
= 0; ix
< len
; ix
++)
3318 if (brackets
[ix
].back
== pseudo
)
3321 fprintf (dump_file
, "Removing backedge %d:%+d\n",
3322 pseudo
.first
? pseudo
.first
->index
: 0, pseudo
.second
);
3326 brackets
[ix
-removed
] = brackets
[ix
];
3332 /* Accessors for BB's aux pointer. */
3333 #define BB_SET_SESE(B, S) ((B)->aux = (S))
3334 #define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
3336 /* DFS walk creating SESE data structures. Only cover nodes with
3337 BB_VISITED set. Append discovered blocks to LIST. We number in
3338 increments of 3 so that the above and below pseudo nodes can be
3339 implicitly numbered too. */
3342 nvptx_sese_number (int n
, int p
, int dir
, basic_block b
,
3343 auto_vec
<basic_block
> *list
)
3345 if (BB_GET_SESE (b
))
3349 fprintf (dump_file
, "Block %d(%d), parent (%d), orientation %+d\n",
3350 b
->index
, n
, p
, dir
);
3352 BB_SET_SESE (b
, new bb_sese (n
, p
, dir
));
3356 list
->quick_push (b
);
3358 /* First walk the nodes on the 'other side' of this node, then walk
3359 the nodes on the same side. */
3360 for (unsigned ix
= 2; ix
; ix
--)
3362 vec
<edge
, va_gc
> *edges
= dir
> 0 ? b
->succs
: b
->preds
;
3363 size_t offset
= (dir
> 0 ? offsetof (edge_def
, dest
)
3364 : offsetof (edge_def
, src
));
3368 FOR_EACH_EDGE (e
, ei
, edges
)
3370 basic_block target
= *(basic_block
*)((char *)e
+ offset
);
3372 if (target
->flags
& BB_VISITED
)
3373 n
= nvptx_sese_number (n
, p
, dir
, target
, list
);
3380 /* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
3381 EDGES are the outgoing edges and OFFSET is the offset to the src
3382 or dst block on the edges. */
3385 nvptx_sese_pseudo (basic_block me
, bb_sese
*sese
, int depth
, int dir
,
3386 vec
<edge
, va_gc
> *edges
, size_t offset
)
3390 int hi_back
= depth
;
3391 pseudo_node_t
node_back (0, depth
);
3392 int hi_child
= depth
;
3393 pseudo_node_t
node_child (0, depth
);
3394 basic_block child
= NULL
;
3395 unsigned num_children
= 0;
3396 int usd
= -dir
* sese
->dir
;
3399 fprintf (dump_file
, "\nProcessing %d(%d) %+d\n",
3400 me
->index
, sese
->node
, dir
);
3404 /* This is the above pseudo-child. It has the BB itself as an
3405 additional child node. */
3406 node_child
= sese
->high
;
3407 hi_child
= node_child
.second
;
3408 if (node_child
.first
)
3409 hi_child
+= BB_GET_SESE (node_child
.first
)->node
;
3413 /* Examine each edge.
3414 - if it is a child (a) append its bracket list and (b) record
3415 whether it is the child with the highest reaching bracket.
3416 - if it is an edge to ancestor, record whether it's the highest
3417 reaching backlink. */
3418 FOR_EACH_EDGE (e
, ei
, edges
)
3420 basic_block target
= *(basic_block
*)((char *)e
+ offset
);
3422 if (bb_sese
*t_sese
= BB_GET_SESE (target
))
3424 if (t_sese
->parent
== sese
->node
&& !(t_sese
->dir
+ usd
))
3426 /* Child node. Append its bracket list. */
3428 sese
->append (t_sese
);
3430 /* Compare it's hi value. */
3431 int t_hi
= t_sese
->high
.second
;
3433 if (basic_block child_hi_block
= t_sese
->high
.first
)
3434 t_hi
+= BB_GET_SESE (child_hi_block
)->node
;
3436 if (hi_child
> t_hi
)
3439 node_child
= t_sese
->high
;
3443 else if (t_sese
->node
< sese
->node
+ dir
3444 && !(dir
< 0 && sese
->parent
== t_sese
->node
))
3446 /* Non-parental ancestor node -- a backlink. */
3447 int d
= usd
* t_sese
->dir
;
3448 int back
= t_sese
->node
+ d
;
3453 node_back
= pseudo_node_t (target
, d
);
3458 { /* Fallen off graph, backlink to entry node. */
3460 node_back
= pseudo_node_t (0, 0);
3464 /* Remove any brackets that terminate at this pseudo node. */
3465 sese
->remove (pseudo_node_t (me
, dir
));
3467 /* Now push any backlinks from this pseudo node. */
3468 FOR_EACH_EDGE (e
, ei
, edges
)
3470 basic_block target
= *(basic_block
*)((char *)e
+ offset
);
3471 if (bb_sese
*t_sese
= BB_GET_SESE (target
))
3473 if (t_sese
->node
< sese
->node
+ dir
3474 && !(dir
< 0 && sese
->parent
== t_sese
->node
))
3475 /* Non-parental ancestor node - backedge from me. */
3476 sese
->push (pseudo_node_t (target
, usd
* t_sese
->dir
));
3480 /* back edge to entry node */
3481 sese
->push (pseudo_node_t (0, 0));
3485 /* If this node leads directly or indirectly to a no-return region of
3486 the graph, then fake a backedge to entry node. */
3487 if (!sese
->brackets
.length () || !edges
|| !edges
->length ())
3490 node_back
= pseudo_node_t (0, 0);
3491 sese
->push (node_back
);
3494 /* Record the highest reaching backedge from us or a descendant. */
3495 sese
->high
= hi_back
< hi_child
? node_back
: node_child
;
3497 if (num_children
> 1)
3499 /* There is more than one child -- this is a Y shaped piece of
3500 spanning tree. We have to insert a fake backedge from this
3501 node to the highest ancestor reached by not-the-highest
3502 reaching child. Note that there may be multiple children
3503 with backedges to the same highest node. That's ok and we
3504 insert the edge to that highest node. */
3506 if (dir
< 0 && child
)
3508 node_child
= sese
->high
;
3509 hi_child
= node_child
.second
;
3510 if (node_child
.first
)
3511 hi_child
+= BB_GET_SESE (node_child
.first
)->node
;
3514 FOR_EACH_EDGE (e
, ei
, edges
)
3516 basic_block target
= *(basic_block
*)((char *)e
+ offset
);
3518 if (target
== child
)
3519 /* Ignore the highest child. */
3522 bb_sese
*t_sese
= BB_GET_SESE (target
);
3525 if (t_sese
->parent
!= sese
->node
)
3529 /* Compare its hi value. */
3530 int t_hi
= t_sese
->high
.second
;
3532 if (basic_block child_hi_block
= t_sese
->high
.first
)
3533 t_hi
+= BB_GET_SESE (child_hi_block
)->node
;
3535 if (hi_child
> t_hi
)
3538 node_child
= t_sese
->high
;
3542 sese
->push (node_child
);
3547 /* DFS walk of BB graph. Color node BLOCK according to COLORING then
3548 proceed to successors. Set SESE entry and exit nodes of
3552 nvptx_sese_color (auto_vec
<unsigned> &color_counts
, bb_pair_vec_t
®ions
,
3553 basic_block block
, int coloring
)
3555 bb_sese
*sese
= BB_GET_SESE (block
);
3557 if (block
->flags
& BB_VISITED
)
3559 /* If we've already encountered this block, either we must not
3560 be coloring, or it must have been colored the current color. */
3561 gcc_assert (coloring
< 0 || (sese
&& coloring
== sese
->color
));
3565 block
->flags
|= BB_VISITED
;
3571 /* Start coloring a region. */
3572 regions
[sese
->color
].first
= block
;
3573 coloring
= sese
->color
;
3576 if (!--color_counts
[sese
->color
] && sese
->color
== coloring
)
3578 /* Found final block of SESE region. */
3579 regions
[sese
->color
].second
= block
;
3583 /* Color the node, so we can assert on revisiting the node
3584 that the graph is indeed SESE. */
3585 sese
->color
= coloring
;
3588 /* Fallen off the subgraph, we cannot be coloring. */
3589 gcc_assert (coloring
< 0);
3591 /* Walk each successor block. */
3592 if (block
->succs
&& block
->succs
->length ())
3597 FOR_EACH_EDGE (e
, ei
, block
->succs
)
3598 nvptx_sese_color (color_counts
, regions
, e
->dest
, coloring
);
3601 gcc_assert (coloring
< 0);
3604 /* Find minimal set of SESE regions covering BLOCKS. REGIONS might
3605 end up with NULL entries in it. */
3608 nvptx_find_sese (auto_vec
<basic_block
> &blocks
, bb_pair_vec_t
®ions
)
3613 /* First clear each BB of the whole function. */
3614 FOR_ALL_BB_FN (block
, cfun
)
3616 block
->flags
&= ~BB_VISITED
;
3617 BB_SET_SESE (block
, 0);
3620 /* Mark blocks in the function that are in this graph. */
3621 for (ix
= 0; blocks
.iterate (ix
, &block
); ix
++)
3622 block
->flags
|= BB_VISITED
;
3624 /* Counts of nodes assigned to each color. There cannot be more
3625 colors than blocks (and hopefully there will be fewer). */
3626 auto_vec
<unsigned> color_counts
;
3627 color_counts
.reserve (blocks
.length ());
3629 /* Worklist of nodes in the spanning tree. Again, there cannot be
3630 more nodes in the tree than blocks (there will be fewer if the
3631 CFG of blocks is disjoint). */
3632 auto_vec
<basic_block
> spanlist
;
3633 spanlist
.reserve (blocks
.length ());
3635 /* Make sure every block has its cycle class determined. */
3636 for (ix
= 0; blocks
.iterate (ix
, &block
); ix
++)
3638 if (BB_GET_SESE (block
))
3639 /* We already met this block in an earlier graph solve. */
3643 fprintf (dump_file
, "Searching graph starting at %d\n", block
->index
);
3645 /* Number the nodes reachable from block initial DFS order. */
3646 int depth
= nvptx_sese_number (2, 0, +1, block
, &spanlist
);
3648 /* Now walk in reverse DFS order to find cycle equivalents. */
3649 while (spanlist
.length ())
3651 block
= spanlist
.pop ();
3652 bb_sese
*sese
= BB_GET_SESE (block
);
3654 /* Do the pseudo node below. */
3655 nvptx_sese_pseudo (block
, sese
, depth
, +1,
3656 sese
->dir
> 0 ? block
->succs
: block
->preds
,
3657 (sese
->dir
> 0 ? offsetof (edge_def
, dest
)
3658 : offsetof (edge_def
, src
)));
3659 sese
->set_color (color_counts
);
3660 /* Do the pseudo node above. */
3661 nvptx_sese_pseudo (block
, sese
, depth
, -1,
3662 sese
->dir
< 0 ? block
->succs
: block
->preds
,
3663 (sese
->dir
< 0 ? offsetof (edge_def
, dest
)
3664 : offsetof (edge_def
, src
)));
3667 fprintf (dump_file
, "\n");
3673 const char *comma
= "";
3675 fprintf (dump_file
, "Found %d cycle equivalents\n",
3676 color_counts
.length ());
3677 for (ix
= 0; color_counts
.iterate (ix
, &count
); ix
++)
3679 fprintf (dump_file
, "%s%d[%d]={", comma
, ix
, count
);
3682 for (unsigned jx
= 0; blocks
.iterate (jx
, &block
); jx
++)
3683 if (BB_GET_SESE (block
)->color
== ix
)
3685 block
->flags
|= BB_VISITED
;
3686 fprintf (dump_file
, "%s%d", comma
, block
->index
);
3689 fprintf (dump_file
, "}");
3692 fprintf (dump_file
, "\n");
3695 /* Now we've colored every block in the subgraph. We now need to
3696 determine the minimal set of SESE regions that cover that
3697 subgraph. Do this with a DFS walk of the complete function.
3698 During the walk we're either 'looking' or 'coloring'. When we
3699 reach the last node of a particular color, we stop coloring and
3700 return to looking. */
3702 /* There cannot be more SESE regions than colors. */
3703 regions
.reserve (color_counts
.length ());
3704 for (ix
= color_counts
.length (); ix
--;)
3705 regions
.quick_push (bb_pair_t (0, 0));
3707 for (ix
= 0; blocks
.iterate (ix
, &block
); ix
++)
3708 block
->flags
&= ~BB_VISITED
;
3710 nvptx_sese_color (color_counts
, regions
, ENTRY_BLOCK_PTR_FOR_FN (cfun
), -1);
3714 const char *comma
= "";
3715 int len
= regions
.length ();
3717 fprintf (dump_file
, "SESE regions:");
3718 for (ix
= 0; ix
!= len
; ix
++)
3720 basic_block from
= regions
[ix
].first
;
3721 basic_block to
= regions
[ix
].second
;
3725 fprintf (dump_file
, "%s %d{%d", comma
, ix
, from
->index
);
3727 fprintf (dump_file
, "->%d", to
->index
);
3729 int color
= BB_GET_SESE (from
)->color
;
3731 /* Print the blocks within the region (excluding ends). */
3732 FOR_EACH_BB_FN (block
, cfun
)
3734 bb_sese
*sese
= BB_GET_SESE (block
);
3736 if (sese
&& sese
->color
== color
3737 && block
!= from
&& block
!= to
)
3738 fprintf (dump_file
, ".%d", block
->index
);
3740 fprintf (dump_file
, "}");
3744 fprintf (dump_file
, "\n\n");
3747 for (ix
= 0; blocks
.iterate (ix
, &block
); ix
++)
3748 delete BB_GET_SESE (block
);
3754 /* Propagate live state at the start of a partitioned region. BLOCK
3755 provides the live register information, and might not contain
3756 INSN. Propagation is inserted just after INSN. RW indicates whether
3757 we are reading and/or writing state. This
3758 separation is needed for worker-level propagation where we
3759 essentially do a spill & fill. FN is the underlying worker
3760 function to generate the propagation instructions for single
3761 register. DATA is user data.
3763 We propagate the live register set and the entire frame. We could
3764 do better by (a) propagating just the live set that is used within
3765 the partitioned regions and (b) only propagating stack entries that
3766 are used. The latter might be quite hard to determine. */
3768 typedef rtx (*propagator_fn
) (rtx
, propagate_mask
, unsigned, void *);
3771 nvptx_propagate (basic_block block
, rtx_insn
*insn
, propagate_mask rw
,
3772 propagator_fn fn
, void *data
)
3774 bitmap live
= DF_LIVE_IN (block
);
3775 bitmap_iterator iterator
;
3778 /* Copy the frame array. */
3779 HOST_WIDE_INT fs
= get_frame_size ();
3782 rtx tmp
= gen_reg_rtx (DImode
);
3784 rtx ptr
= gen_reg_rtx (Pmode
);
3785 rtx pred
= NULL_RTX
;
3786 rtx_code_label
*label
= NULL
;
3788 /* The frame size might not be DImode compatible, but the frame
3789 array's declaration will be. So it's ok to round up here. */
3790 fs
= (fs
+ GET_MODE_SIZE (DImode
) - 1) / GET_MODE_SIZE (DImode
);
3791 /* Detect single iteration loop. */
3796 emit_insn (gen_rtx_SET (ptr
, frame_pointer_rtx
));
3799 idx
= gen_reg_rtx (SImode
);
3800 pred
= gen_reg_rtx (BImode
);
3801 label
= gen_label_rtx ();
3803 emit_insn (gen_rtx_SET (idx
, GEN_INT (fs
)));
3804 /* Allow worker function to initialize anything needed. */
3805 rtx init
= fn (tmp
, PM_loop_begin
, fs
, data
);
3809 LABEL_NUSES (label
)++;
3810 emit_insn (gen_addsi3 (idx
, idx
, GEN_INT (-1)));
3813 emit_insn (gen_rtx_SET (tmp
, gen_rtx_MEM (DImode
, ptr
)));
3814 emit_insn (fn (tmp
, rw
, fs
, data
));
3816 emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode
, ptr
), tmp
));
3819 emit_insn (gen_rtx_SET (pred
, gen_rtx_NE (BImode
, idx
, const0_rtx
)));
3820 emit_insn (gen_adddi3 (ptr
, ptr
, GEN_INT (GET_MODE_SIZE (DImode
))));
3821 emit_insn (gen_br_true_uni (pred
, label
));
3822 rtx fini
= fn (tmp
, PM_loop_end
, fs
, data
);
3825 emit_insn (gen_rtx_CLOBBER (GET_MODE (idx
), idx
));
3827 emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp
), tmp
));
3828 emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr
), ptr
));
3829 rtx cpy
= get_insns ();
3831 insn
= emit_insn_after (cpy
, insn
);
3834 /* Copy live registers. */
3835 EXECUTE_IF_SET_IN_BITMAP (live
, 0, ix
, iterator
)
3837 rtx reg
= regno_reg_rtx
[ix
];
3839 if (REGNO (reg
) >= FIRST_PSEUDO_REGISTER
)
3841 rtx bcast
= fn (reg
, rw
, 0, data
);
3843 insn
= emit_insn_after (bcast
, insn
);
3848 /* Worker for nvptx_vpropagate. */
/* propagator_fn callback for vector-level propagation: for register REG
   live into a partitioned region, return the insn sequence produced by
   nvptx_gen_vcast (presumably a broadcast across the vector lanes of a
   warp).  COUNT and DATA are unused.  Acts only when PM carries a
   read/write bit — other phases (presumably PM_loop_begin/PM_loop_end)
   need no work here.
   NOTE(review): the extraction dropped this function's return type,
   braces and the fall-through return path (see the gaps in the embedded
   line numbers); confirm against the full source.  */
3851 vprop_gen (rtx reg
, propagate_mask pm
,
3852 unsigned ARG_UNUSED (count
), void *ARG_UNUSED (data
))
3854 if (!(pm
& PM_read_write
))
3857 return nvptx_gen_vcast (reg
);
3860 /* Propagate state that is live at start of BLOCK across the vectors
3861 of a single warp. Propagation is inserted just after INSN. */
/* Thin wrapper: delegates to nvptx_propagate with both read and write
   requested (PM_read_write), using the vprop_gen callback and no user
   data (0).  */
3864 nvptx_vpropagate (basic_block block
, rtx_insn
*insn
)
3866 nvptx_propagate (block
, insn
, PM_read_write
, vprop_gen
, 0);
3869 /* Worker for nvptx_wpropagate. */
3872 wprop_gen (rtx reg
, propagate_mask pm
, unsigned rep
, void *data_
)
3874 wcast_data_t
*data
= (wcast_data_t
*)data_
;
3876 if (pm
& PM_loop_begin
)
3878 /* Starting a loop, initialize pointer. */
3879 unsigned align
= GET_MODE_ALIGNMENT (GET_MODE (reg
)) / BITS_PER_UNIT
;
3881 if (align
> worker_bcast_align
)
3882 worker_bcast_align
= align
;
3883 data
->offset
= (data
->offset
+ align
- 1) & ~(align
- 1);
3885 data
->ptr
= gen_reg_rtx (Pmode
);
3887 return gen_adddi3 (data
->ptr
, data
->base
, GEN_INT (data
->offset
));
3889 else if (pm
& PM_loop_end
)
3891 rtx clobber
= gen_rtx_CLOBBER (GET_MODE (data
->ptr
), data
->ptr
);
3892 data
->ptr
= NULL_RTX
;
3896 return nvptx_gen_wcast (reg
, pm
, rep
, data
);
3899 /* Spill or fill live state that is live at start of BLOCK. PRE_P
3900 indicates if this is just before partitioned mode (do spill), or
3901 just after it starts (do fill). Sequence is inserted just after
3905 nvptx_wpropagate (bool pre_p
, basic_block block
, rtx_insn
*insn
)
3909 data
.base
= gen_reg_rtx (Pmode
);
3911 data
.ptr
= NULL_RTX
;
3913 nvptx_propagate (block
, insn
, pre_p
? PM_read
: PM_write
, wprop_gen
, &data
);
3916 /* Stuff was emitted, initialize the base pointer now. */
3917 rtx init
= gen_rtx_SET (data
.base
, worker_bcast_sym
);
3918 emit_insn_after (init
, insn
);
3920 if (worker_bcast_size
< data
.offset
)
3921 worker_bcast_size
= data
.offset
;
3925 /* Emit a worker-level synchronization barrier. We use different
3926 markers for before and after synchronizations. */
/* AFTER selects the marker: it is passed (as a const_int 0/1 operand)
   to the nvptx_barsync pattern, distinguishing a pre-region barrier
   from a post-region barrier.  Returns the barrier insn pattern.  */
3929 nvptx_wsync (bool after
)
3931 return gen_nvptx_barsync (GEN_INT (after
));
3934 #if WORKAROUND_PTXJIT_BUG
3935 /* Return first real insn in BB, or return NULL_RTX if BB does not contain
/* Helper for the PTX JIT workaround below: scans BB's insns in order
   (FOR_BB_INSNS) looking for the first "real" one.
   NOTE(review): the extraction dropped this function's return type,
   local declaration, loop body and return statements (embedded line
   numbers jump 3944 -> 3952); the visible fragment only shows the scan
   loop header — confirm behavior against the full source.  */
3939 bb_first_real_insn (basic_block bb
)
3943 /* Find first insn of from block. */
3944 FOR_BB_INSNS (bb
, insn
)
3952 /* Single neutering according to MASK. FROM is the incoming block and
3953 TO is the outgoing block. These may be the same block. Insert at
3956 if (tid.<axis>) goto end.
3958 and insert before ending branch of TO (if there is such an insn):
3961 <possibly-broadcast-cond>
3964 We currently only use different FROM and TO when skipping an entire
3965 loop. We could do more if we detected superblocks. */
3968 nvptx_single (unsigned mask
, basic_block from
, basic_block to
)
3970 rtx_insn
*head
= BB_HEAD (from
);
3971 rtx_insn
*tail
= BB_END (to
);
3972 unsigned skip_mask
= mask
;
3976 /* Find first insn of from block. */
3977 while (head
!= BB_END (from
)
3979 || recog_memoized (head
) == CODE_FOR_nvptx_barsync
))
3980 head
= NEXT_INSN (head
);
3985 if (!(JUMP_P (head
) && single_succ_p (from
)))
3988 basic_block jump_target
= single_succ (from
);
3989 if (!single_pred_p (jump_target
))
3993 head
= BB_HEAD (from
);
3996 /* Find last insn of to block */
3997 rtx_insn
*limit
= from
== to
? head
: BB_HEAD (to
);
3998 while (tail
!= limit
&& !INSN_P (tail
) && !LABEL_P (tail
))
3999 tail
= PREV_INSN (tail
);
4001 /* Detect if tail is a branch. */
4002 rtx tail_branch
= NULL_RTX
;
4003 rtx cond_branch
= NULL_RTX
;
4004 if (tail
&& INSN_P (tail
))
4006 tail_branch
= PATTERN (tail
);
4007 if (GET_CODE (tail_branch
) != SET
|| SET_DEST (tail_branch
) != pc_rtx
)
4008 tail_branch
= NULL_RTX
;
4011 cond_branch
= SET_SRC (tail_branch
);
4012 if (GET_CODE (cond_branch
) != IF_THEN_ELSE
)
4013 cond_branch
= NULL_RTX
;
4019 /* If this is empty, do nothing. */
4020 if (!head
|| !INSN_P (head
))
4023 /* If this is a dummy insn, do nothing. */
4024 switch (recog_memoized (head
))
4028 case CODE_FOR_nvptx_barsync
:
4029 case CODE_FOR_nvptx_fork
:
4030 case CODE_FOR_nvptx_forked
:
4031 case CODE_FOR_nvptx_joining
:
4032 case CODE_FOR_nvptx_join
:
4038 /* If we're only doing vector single, there's no need to
4039 emit skip code because we'll not insert anything. */
4040 if (!(mask
& GOMP_DIM_MASK (GOMP_DIM_VECTOR
)))
4043 else if (tail_branch
)
4044 /* Block with only unconditional branch. Nothing to do. */
4048 /* Insert the vector test inside the worker test. */
4050 rtx_insn
*before
= tail
;
4051 for (mode
= GOMP_DIM_WORKER
; mode
<= GOMP_DIM_VECTOR
; mode
++)
4052 if (GOMP_DIM_MASK (mode
) & skip_mask
)
4054 rtx_code_label
*label
= gen_label_rtx ();
4055 rtx pred
= cfun
->machine
->axis_predicate
[mode
- GOMP_DIM_WORKER
];
4059 pred
= gen_reg_rtx (BImode
);
4060 cfun
->machine
->axis_predicate
[mode
- GOMP_DIM_WORKER
] = pred
;
4064 if (mode
== GOMP_DIM_VECTOR
)
4065 br
= gen_br_true (pred
, label
);
4067 br
= gen_br_true_uni (pred
, label
);
4068 emit_insn_before (br
, head
);
4070 LABEL_NUSES (label
)++;
4072 before
= emit_label_before (label
, before
);
4075 rtx_insn
*label_insn
= emit_label_after (label
, tail
);
4076 if ((mode
== GOMP_DIM_VECTOR
|| mode
== GOMP_DIM_WORKER
)
4077 && CALL_P (tail
) && find_reg_note (tail
, REG_NORETURN
, NULL
))
4078 emit_insn_after (gen_exit (), label_insn
);
4082 /* Now deal with propagating the branch condition. */
4085 rtx pvar
= XEXP (XEXP (cond_branch
, 0), 0);
4087 if (GOMP_DIM_MASK (GOMP_DIM_VECTOR
) == mask
)
4089 /* Vector mode only, do a shuffle. */
4090 #if WORKAROUND_PTXJIT_BUG
4091 /* The branch condition %rcond is propagated like this:
4096 setp.ne.u32 %rnotvzero,%x,0;
4099 @%rnotvzero bra Lskip;
4100 setp.<op>.<type> %rcond,op1,op2;
4102 selp.u32 %rcondu32,1,0,%rcond;
4103 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4104 setp.ne.u32 %rcond,%rcondu32,0;
4106 There seems to be a bug in the ptx JIT compiler (observed at driver
4107 version 381.22, at -O1 and higher for sm_61), that drops the shfl
4108 unless %rcond is initialized to something before 'bra Lskip'. The
4109 bug is not observed with ptxas from cuda 8.0.61.
4111 It is true that the code is non-trivial: at Lskip, %rcond is
4112 uninitialized in threads 1-31, and after the selp the same holds
4113 for %rcondu32. But shfl propagates the defined value in thread 0
4114 to threads 1-31, so after the shfl %rcondu32 is defined in threads
4115 0-31, and after the setp.ne %rcond is defined in threads 0-31.
4117 There is nothing in the PTX spec to suggest that this is wrong, or
4118 to explain why the extra initialization is needed. So, we classify
4119 it as a JIT bug, and the extra initialization as workaround:
4124 setp.ne.u32 %rnotvzero,%x,0;
4127 +.reg .pred %rcond2;
4128 +setp.eq.u32 %rcond2, 1, 0;
4130 @%rnotvzero bra Lskip;
4131 setp.<op>.<type> %rcond,op1,op2;
4132 +mov.pred %rcond2, %rcond;
4134 +mov.pred %rcond, %rcond2;
4135 selp.u32 %rcondu32,1,0,%rcond;
4136 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4137 setp.ne.u32 %rcond,%rcondu32,0;
4139 rtx_insn
*label
= PREV_INSN (tail
);
4140 gcc_assert (label
&& LABEL_P (label
));
4141 rtx tmp
= gen_reg_rtx (BImode
);
4142 emit_insn_before (gen_movbi (tmp
, const0_rtx
),
4143 bb_first_real_insn (from
));
4144 emit_insn_before (gen_rtx_SET (tmp
, pvar
), label
);
4145 emit_insn_before (gen_rtx_SET (pvar
, tmp
), tail
);
4147 emit_insn_before (nvptx_gen_vcast (pvar
), tail
);
4151 /* Includes worker mode, do spill & fill. By construction
4152 we should never have worker mode only. */
4155 data
.base
= worker_bcast_sym
;
4158 if (worker_bcast_size
< GET_MODE_SIZE (SImode
))
4159 worker_bcast_size
= GET_MODE_SIZE (SImode
);
4162 emit_insn_before (nvptx_gen_wcast (pvar
, PM_read
, 0, &data
),
4164 /* Barrier so other workers can see the write. */
4165 emit_insn_before (nvptx_wsync (false), tail
);
4167 emit_insn_before (nvptx_gen_wcast (pvar
, PM_write
, 0, &data
), tail
);
4168 /* This barrier is needed to avoid worker zero clobbering
4169 the broadcast buffer before all the other workers have
4170 had a chance to read this instance of it. */
4171 emit_insn_before (nvptx_wsync (true), tail
);
4174 extract_insn (tail
);
4175 rtx unsp
= gen_rtx_UNSPEC (BImode
, gen_rtvec (1, pvar
),
4177 validate_change (tail
, recog_data
.operand_loc
[0], unsp
, false);
4181 /* PAR is a parallel that is being skipped in its entirety according to
4182 MASK. Treat this as skipping a superblock starting at forked
4183 and ending at joining. */
4186 nvptx_skip_par (unsigned mask
, parallel
*par
)
/* The join block must have exactly one predecessor (asserted below), so
   the block just before it uniquely ends the region being skipped.  */
4188 basic_block tail
= par
->join_block
;
4189 gcc_assert (tail
->preds
->length () == 1);
4191 basic_block pre_tail
= (*tail
->preds
)[0]->src
;
4192 gcc_assert (pre_tail
->succs
->length () == 1);
/* Neuter the whole region from the forked block up to (and including)
   the predecessor of the join block.  */
4194 nvptx_single (mask
, par
->forked_block
, pre_tail
);
4197 /* If PAR has a single inner parallel and PAR itself only contains
4198 empty entry and exit blocks, swallow the inner PAR. */
4201 nvptx_optimize_inner (parallel
*par
)
4203 parallel
*inner
= par
->inner
;
4205 /* We mustn't be the outer dummy par. */
4209 /* We must have a single inner par. */
4210 if (!inner
|| inner
->next
)
4213 /* We must only contain 2 blocks ourselves -- the head and tail of
4215 if (par
->blocks
.length () != 2)
4218 /* We must be disjoint partitioning. As we only have vector and
4219 worker partitioning, this is sufficient to guarantee the pars
4220 have adjacent partitioning. */
4221 if ((par
->mask
& inner
->mask
) & (GOMP_DIM_MASK (GOMP_DIM_MAX
) - 1))
4222 /* This indicates malformed code generation. */
4225 /* The outer forked insn should be immediately followed by the inner
4227 rtx_insn
*forked
= par
->forked_insn
;
4228 rtx_insn
*fork
= BB_END (par
->forked_block
);
4230 if (NEXT_INSN (forked
) != fork
)
4232 gcc_checking_assert (recog_memoized (fork
) == CODE_FOR_nvptx_fork
);
4234 /* The outer joining insn must immediately follow the inner join
4236 rtx_insn
*joining
= par
->joining_insn
;
4237 rtx_insn
*join
= inner
->join_insn
;
4238 if (NEXT_INSN (join
) != joining
)
4241 /* Preconditions met. Swallow the inner par. */
4243 fprintf (dump_file
, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
4244 inner
->mask
, inner
->forked_block
->index
,
4245 inner
->join_block
->index
,
4246 par
->mask
, par
->forked_block
->index
, par
->join_block
->index
);
4248 par
->mask
|= inner
->mask
& (GOMP_DIM_MASK (GOMP_DIM_MAX
) - 1);
4250 par
->blocks
.reserve (inner
->blocks
.length ());
4251 while (inner
->blocks
.length ())
4252 par
->blocks
.quick_push (inner
->blocks
.pop ());
4254 par
->inner
= inner
->inner
;
4255 inner
->inner
= NULL
;
4260 /* Process the parallel PAR and all its contained
4261 parallels. We do everything but the neutering. Return mask of
4262 partitioned modes used within this parallel. */
4265 nvptx_process_pars (parallel
*par
)
4268 nvptx_optimize_inner (par
);
4270 unsigned inner_mask
= par
->mask
;
4272 /* Do the inner parallels first. */
4275 par
->inner_mask
= nvptx_process_pars (par
->inner
);
4276 inner_mask
|= par
->inner_mask
;
4279 if (par
->mask
& GOMP_DIM_MASK (GOMP_DIM_MAX
))
4280 /* No propagation needed for a call. */;
4281 else if (par
->mask
& GOMP_DIM_MASK (GOMP_DIM_WORKER
))
4283 nvptx_wpropagate (false, par
->forked_block
, par
->forked_insn
);
4284 nvptx_wpropagate (true, par
->forked_block
, par
->fork_insn
);
4285 /* Insert begin and end synchronizations. */
4286 emit_insn_before (nvptx_wsync (false), par
->forked_insn
);
4287 emit_insn_before (nvptx_wsync (true), par
->join_insn
);
4289 else if (par
->mask
& GOMP_DIM_MASK (GOMP_DIM_VECTOR
))
4290 nvptx_vpropagate (par
->forked_block
, par
->forked_insn
);
4292 /* Now do siblings. */
4294 inner_mask
|= nvptx_process_pars (par
->next
);
4298 /* Neuter the parallel described by PAR. We recurse in depth-first
4299 order. MODES are the partitioning of the execution and OUTER is
4300 the partitioning of the parallels we are contained in. */
4303 nvptx_neuter_pars (parallel
*par
, unsigned modes
, unsigned outer
)
4305 unsigned me
= (par
->mask
4306 & (GOMP_DIM_MASK (GOMP_DIM_WORKER
)
4307 | GOMP_DIM_MASK (GOMP_DIM_VECTOR
)));
4308 unsigned skip_mask
= 0, neuter_mask
= 0;
4311 nvptx_neuter_pars (par
->inner
, modes
, outer
| me
);
4313 for (unsigned mode
= GOMP_DIM_WORKER
; mode
<= GOMP_DIM_VECTOR
; mode
++)
4315 if ((outer
| me
) & GOMP_DIM_MASK (mode
))
4316 {} /* Mode is partitioned: no neutering. */
4317 else if (!(modes
& GOMP_DIM_MASK (mode
)))
4318 {} /* Mode is not used: nothing to do. */
4319 else if (par
->inner_mask
& GOMP_DIM_MASK (mode
)
4320 || !par
->forked_insn
)
4321 /* Partitioned in inner parallels, or we're not a partitioned
4322 at all: neuter individual blocks. */
4323 neuter_mask
|= GOMP_DIM_MASK (mode
);
4324 else if (!par
->parent
|| !par
->parent
->forked_insn
4325 || par
->parent
->inner_mask
& GOMP_DIM_MASK (mode
))
4326 /* Parent isn't a parallel or contains this paralleling: skip
4327 parallel at this level. */
4328 skip_mask
|= GOMP_DIM_MASK (mode
);
4330 {} /* Parent will skip this parallel itself. */
4339 /* Neuter whole SESE regions. */
4340 bb_pair_vec_t regions
;
4342 nvptx_find_sese (par
->blocks
, regions
);
4343 len
= regions
.length ();
4344 for (ix
= 0; ix
!= len
; ix
++)
4346 basic_block from
= regions
[ix
].first
;
4347 basic_block to
= regions
[ix
].second
;
4350 nvptx_single (neuter_mask
, from
, to
);
4357 /* Neuter each BB individually. */
4358 len
= par
->blocks
.length ();
4359 for (ix
= 0; ix
!= len
; ix
++)
4361 basic_block block
= par
->blocks
[ix
];
4363 nvptx_single (neuter_mask
, block
, block
);
4369 nvptx_skip_par (skip_mask
, par
);
4372 nvptx_neuter_pars (par
->next
, modes
, outer
);
4375 #if WORKAROUND_PTXJIT_BUG_2
4376 /* Variant of pc_set that only requires JUMP_P (INSN) if STRICT. This variant
4377 is needed in the nvptx target because the branches generated for
4378 partitioning are NONJUMP_INSN_P, not JUMP_P. */
4381 nvptx_pc_set (const rtx_insn
*insn
, bool strict
= true)
4384 if ((strict
&& !JUMP_P (insn
))
4385 || (!strict
&& !INSN_P (insn
)))
4387 pat
= PATTERN (insn
);
4389 /* The set is allowed to appear either as the insn pattern or
4390 the first set in a PARALLEL. */
4391 if (GET_CODE (pat
) == PARALLEL
)
4392 pat
= XVECEXP (pat
, 0, 0);
4393 if (GET_CODE (pat
) == SET
&& GET_CODE (SET_DEST (pat
)) == PC
)
4399 /* Variant of condjump_label that only requires JUMP_P (INSN) if STRICT. */
4402 nvptx_condjump_label (const rtx_insn
*insn
, bool strict
= true)
4404 rtx x
= nvptx_pc_set (insn
, strict
);
4409 if (GET_CODE (x
) == LABEL_REF
)
4411 if (GET_CODE (x
) != IF_THEN_ELSE
)
4413 if (XEXP (x
, 2) == pc_rtx
&& GET_CODE (XEXP (x
, 1)) == LABEL_REF
)
4415 if (XEXP (x
, 1) == pc_rtx
&& GET_CODE (XEXP (x
, 2)) == LABEL_REF
)
4420 /* Insert a dummy ptx insn when encountering a branch to a label with no ptx
4421 insn in between the branch and the label. This works around a JIT bug
4422 observed at driver version 384.111, at -O0 for sm_50. */
4425 prevent_branch_around_nothing (void)
4427 rtx_insn
*seen_label
= NULL
;
4428 for (rtx_insn
*insn
= get_insns (); insn
; insn
= NEXT_INSN (insn
))
4430 if (INSN_P (insn
) && condjump_p (insn
))
4432 seen_label
= label_ref_label (nvptx_condjump_label (insn
, false));
4436 if (seen_label
== NULL
)
4439 if (NOTE_P (insn
) || DEBUG_INSN_P (insn
))
4443 switch (recog_memoized (insn
))
4445 case CODE_FOR_nvptx_fork
:
4446 case CODE_FOR_nvptx_forked
:
4447 case CODE_FOR_nvptx_joining
:
4448 case CODE_FOR_nvptx_join
:
4455 if (LABEL_P (insn
) && insn
== seen_label
)
4456 emit_insn_before (gen_fake_nop (), insn
);
4463 /* PTX-specific reorganization
4464 - Split blocks at fork and join instructions
4465 - Compute live registers
4466 - Mark now-unused registers, so function begin doesn't declare
4468 - Insert state propagation when entering partitioned mode
4469 - Insert neutering instructions when in single mode
4470 - Replace subregs with suitable sequences.
4476 /* We are freeing block_for_insn in the toplev to keep compatibility
4477 with old MDEP_REORGS that are not CFG based. Recompute it now. */
4478 compute_bb_for_insn ();
4480 thread_prologue_and_epilogue_insns ();
4482 /* Split blocks and record interesting unspecs. */
4483 bb_insn_map_t bb_insn_map
;
4485 nvptx_split_blocks (&bb_insn_map
);
4487 /* Compute live regs */
4488 df_clear_flags (DF_LR_RUN_DCE
);
4489 df_set_flags (DF_NO_INSN_RESCAN
| DF_NO_HARD_REGS
);
4490 df_live_add_problem ();
4491 df_live_set_all_dirty ();
4493 regstat_init_n_sets_and_refs ();
4496 df_dump (dump_file
);
4498 /* Mark unused regs as unused. */
4499 int max_regs
= max_reg_num ();
4500 for (int i
= LAST_VIRTUAL_REGISTER
+ 1; i
< max_regs
; i
++)
4501 if (REG_N_SETS (i
) == 0 && REG_N_REFS (i
) == 0)
4502 regno_reg_rtx
[i
] = const0_rtx
;
4504 /* Determine launch dimensions of the function. If it is not an
4505 offloaded function (i.e. this is a regular compiler), the
4506 function has no neutering. */
4507 tree attr
= oacc_get_fn_attrib (current_function_decl
);
4510 /* If we determined this mask before RTL expansion, we could
4511 elide emission of some levels of forks and joins. */
4513 tree dims
= TREE_VALUE (attr
);
4516 for (ix
= 0; ix
!= GOMP_DIM_MAX
; ix
++, dims
= TREE_CHAIN (dims
))
4518 int size
= TREE_INT_CST_LOW (TREE_VALUE (dims
));
4519 tree allowed
= TREE_PURPOSE (dims
);
4521 if (size
!= 1 && !(allowed
&& integer_zerop (allowed
)))
4522 mask
|= GOMP_DIM_MASK (ix
);
4524 /* If there is worker neutering, there must be vector
4525 neutering. Otherwise the hardware will fail. */
4526 gcc_assert (!(mask
& GOMP_DIM_MASK (GOMP_DIM_WORKER
))
4527 || (mask
& GOMP_DIM_MASK (GOMP_DIM_VECTOR
)));
4529 /* Discover & process partitioned regions. */
4530 parallel
*pars
= nvptx_discover_pars (&bb_insn_map
);
4531 nvptx_process_pars (pars
);
4532 nvptx_neuter_pars (pars
, mask
, 0);
4536 /* Replace subregs. */
4537 nvptx_reorg_subreg ();
4539 if (TARGET_UNIFORM_SIMT
)
4540 nvptx_reorg_uniform_simt ();
4542 #if WORKAROUND_PTXJIT_BUG_2
4543 prevent_branch_around_nothing ();
4546 regstat_free_n_sets_and_refs ();
4548 df_finish_pass (true);
4551 /* Handle a "kernel" attribute; arguments as in
4552 struct attribute_spec.handler. */
4555 nvptx_handle_kernel_attribute (tree
*node
, tree name
, tree
ARG_UNUSED (args
),
4556 int ARG_UNUSED (flags
), bool *no_add_attrs
)
4560 if (TREE_CODE (decl
) != FUNCTION_DECL
)
4562 error ("%qE attribute only applies to functions", name
);
4563 *no_add_attrs
= true;
4565 else if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl
))))
4567 error ("%qE attribute requires a void return type", name
);
4568 *no_add_attrs
= true;
4574 /* Handle a "shared" attribute; arguments as in
4575 struct attribute_spec.handler. */
4578 nvptx_handle_shared_attribute (tree
*node
, tree name
, tree
ARG_UNUSED (args
),
4579 int ARG_UNUSED (flags
), bool *no_add_attrs
)
4583 if (TREE_CODE (decl
) != VAR_DECL
)
4585 error ("%qE attribute only applies to variables", name
);
4586 *no_add_attrs
= true;
4588 else if (!(TREE_PUBLIC (decl
) || TREE_STATIC (decl
)))
4590 error ("%qE attribute not allowed with auto storage class", name
);
4591 *no_add_attrs
= true;
4597 /* Table of valid machine attributes. */
4598 static const struct attribute_spec nvptx_attribute_table
[] =
4600 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
4601 affects_type_identity, handler, exclude } */
4602 { "kernel", 0, 0, true, false, false, false, nvptx_handle_kernel_attribute
,
4604 { "shared", 0, 0, true, false, false, false, nvptx_handle_shared_attribute
,
4606 { NULL
, 0, 0, false, false, false, false, NULL
, NULL
}
4609 /* Limit vector alignments to BIGGEST_ALIGNMENT. */
4611 static HOST_WIDE_INT
/* TYPE's natural alignment is taken to be its size in bits
   (tree_to_shwi on TYPE_SIZE), then capped at BIGGEST_ALIGNMENT.  */
4612 nvptx_vector_alignment (const_tree type
)
4614 HOST_WIDE_INT align
= tree_to_shwi (TYPE_SIZE (type
));
4616 return MIN (align
, BIGGEST_ALIGNMENT
);
4619 /* Indicate that INSN cannot be duplicated. */
/* The listed insn codes (shuffles, barriers, and the fork/join
   partitioning markers) must appear exactly once; duplicating them
   would break warp-level communication and region pairing.
   NOTE(review): the extraction dropped the return statements and the
   default case (embedded line numbers jump 4632 -> 4639) — presumably
   the listed cases return true and everything else false; confirm
   against the full source.  */
4622 nvptx_cannot_copy_insn_p (rtx_insn
*insn
)
4624 switch (recog_memoized (insn
))
4626 case CODE_FOR_nvptx_shufflesi
:
4627 case CODE_FOR_nvptx_shufflesf
:
4628 case CODE_FOR_nvptx_barsync
:
4629 case CODE_FOR_nvptx_fork
:
4630 case CODE_FOR_nvptx_forked
:
4631 case CODE_FOR_nvptx_joining
:
4632 case CODE_FOR_nvptx_join
:
4639 /* Section anchors do not work. Initialization for flag_section_anchor
4640 probes the existence of the anchoring target hooks and prevents
4641 anchoring if they don't exist. However, we may be being used with
4642 a host-side compiler that does support anchoring, and hence see
4643 the anchor flag set (as it's not recalculated). So provide an
4644 implementation denying anchoring. */
4647 nvptx_use_anchors_for_symbol_p (const_rtx
ARG_UNUSED (a
))
4652 /* Record a symbol for mkoffload to enter into the mapping table. */
4655 nvptx_record_offload_symbol (tree decl
)
4657 switch (TREE_CODE (decl
))
4660 fprintf (asm_out_file
, "//:VAR_MAP \"%s\"\n",
4661 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl
)));
4666 tree attr
= oacc_get_fn_attrib (decl
);
4667 /* OpenMP offloading does not set this attribute. */
4668 tree dims
= attr
? TREE_VALUE (attr
) : NULL_TREE
;
4670 fprintf (asm_out_file
, "//:FUNC_MAP \"%s\"",
4671 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl
)));
4673 for (; dims
; dims
= TREE_CHAIN (dims
))
4675 int size
= TREE_INT_CST_LOW (TREE_VALUE (dims
));
4677 gcc_assert (!TREE_PURPOSE (dims
));
4678 fprintf (asm_out_file
, ", %#x", size
);
4681 fprintf (asm_out_file
, "\n");
4690 /* Implement TARGET_ASM_FILE_START. Write the kinds of things ptxas expects
4691 at the start of a file. */
4694 nvptx_file_start (void)
/* Emit the PTX module preamble: PTX ISA version 3.1, target sm_30,
   and the address size (32 or 64) derived from Pmode's bit size.
   The BEGIN/END PREAMBLE comment markers delimit it for tools that
   post-process the assembly.  */
4696 fputs ("// BEGIN PREAMBLE\n", asm_out_file
);
4697 fputs ("\t.version\t3.1\n", asm_out_file
);
4698 fputs ("\t.target\tsm_30\n", asm_out_file
);
4699 fprintf (asm_out_file
, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode
));
4700 fputs ("// END PREAMBLE\n", asm_out_file
);
4703 /* Emit a declaration for a worker-level buffer in .shared memory. */
/* FILE is the output stream, SYM the symbol naming the buffer, ALIGN
   its byte alignment and SIZE its byte size.  Emits a variable marker
   followed by a ".shared .align A .u8 NAME[S];" declaration.
   NOTE(review): the extraction dropped the fprintf argument line
   (embedded numbers jump 4711 -> 4715) — presumably it passes
   align, name, size; confirm against the full source.  */
4706 write_worker_buffer (FILE *file
, rtx sym
, unsigned align
, unsigned size
)
4708 const char *name
= XSTR (sym
, 0);
4710 write_var_marker (file
, true, false, name
);
4711 fprintf (file
, ".shared .align %d .u8 %s[%d];\n",
4715 /* Write out the function declarations we've collected and declare storage
4716 for the broadcast buffer. */
4719 nvptx_file_end (void)
4721 hash_table
<tree_hasher
>::iterator iter
;
4723 FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab
, decl
, tree
, iter
)
4724 nvptx_record_fndecl (decl
);
4725 fputs (func_decls
.str().c_str(), asm_out_file
);
4727 if (worker_bcast_size
)
4728 write_worker_buffer (asm_out_file
, worker_bcast_sym
,
4729 worker_bcast_align
, worker_bcast_size
);
4731 if (worker_red_size
)
4732 write_worker_buffer (asm_out_file
, worker_red_sym
,
4733 worker_red_align
, worker_red_size
);
4735 if (need_softstack_decl
)
4737 write_var_marker (asm_out_file
, false, true, "__nvptx_stacks");
4738 /* 32 is the maximum number of warps in a block. Even though it's an
4739 external declaration, emit the array size explicitly; otherwise, it
4740 may fail at PTX JIT time if the definition is later in link order. */
4741 fprintf (asm_out_file
, ".extern .shared .u%d __nvptx_stacks[32];\n",
4744 if (need_unisimt_decl
)
4746 write_var_marker (asm_out_file
, false, true, "__nvptx_uni");
4747 fprintf (asm_out_file
, ".extern .shared .u32 __nvptx_uni[32];\n");
4751 /* Expander for the shuffle builtins. */
4754 nvptx_expand_shuffle (tree exp
, rtx target
, machine_mode mode
, int ignore
)
4759 rtx src
= expand_expr (CALL_EXPR_ARG (exp
, 0),
4760 NULL_RTX
, mode
, EXPAND_NORMAL
);
4762 src
= copy_to_mode_reg (mode
, src
);
4764 rtx idx
= expand_expr (CALL_EXPR_ARG (exp
, 1),
4765 NULL_RTX
, SImode
, EXPAND_NORMAL
);
4766 rtx op
= expand_expr (CALL_EXPR_ARG (exp
, 2),
4767 NULL_RTX
, SImode
, EXPAND_NORMAL
);
4769 if (!REG_P (idx
) && GET_CODE (idx
) != CONST_INT
)
4770 idx
= copy_to_mode_reg (SImode
, idx
);
4772 rtx pat
= nvptx_gen_shuffle (target
, src
, idx
,
4773 (nvptx_shuffle_kind
) INTVAL (op
));
4780 /* Worker reduction address expander. */
4783 nvptx_expand_worker_addr (tree exp
, rtx target
,
4784 machine_mode
ARG_UNUSED (mode
), int ignore
)
4789 unsigned align
= TREE_INT_CST_LOW (CALL_EXPR_ARG (exp
, 2));
4790 if (align
> worker_red_align
)
4791 worker_red_align
= align
;
4793 unsigned offset
= TREE_INT_CST_LOW (CALL_EXPR_ARG (exp
, 0));
4794 unsigned size
= TREE_INT_CST_LOW (CALL_EXPR_ARG (exp
, 1));
4795 if (size
+ offset
> worker_red_size
)
4796 worker_red_size
= size
+ offset
;
4798 rtx addr
= worker_red_sym
;
4801 addr
= gen_rtx_PLUS (Pmode
, addr
, GEN_INT (offset
));
4802 addr
= gen_rtx_CONST (Pmode
, addr
);
4805 emit_move_insn (target
, addr
);
4810 /* Expand the CMP_SWAP PTX builtins. We have our own versions that do
4811 not require taking the address of any object, other than the memory
4812 cell being operated on. */
4815 nvptx_expand_cmp_swap (tree exp
, rtx target
,
4816 machine_mode
ARG_UNUSED (m
), int ARG_UNUSED (ignore
))
4818 machine_mode mode
= TYPE_MODE (TREE_TYPE (exp
));
4821 target
= gen_reg_rtx (mode
);
4823 rtx mem
= expand_expr (CALL_EXPR_ARG (exp
, 0),
4824 NULL_RTX
, Pmode
, EXPAND_NORMAL
);
4825 rtx cmp
= expand_expr (CALL_EXPR_ARG (exp
, 1),
4826 NULL_RTX
, mode
, EXPAND_NORMAL
);
4827 rtx src
= expand_expr (CALL_EXPR_ARG (exp
, 2),
4828 NULL_RTX
, mode
, EXPAND_NORMAL
);
4831 mem
= gen_rtx_MEM (mode
, mem
);
4833 cmp
= copy_to_mode_reg (mode
, cmp
);
4835 src
= copy_to_mode_reg (mode
, src
);
4838 pat
= gen_atomic_compare_and_swapsi_1 (target
, mem
, cmp
, src
, const0_rtx
);
4840 pat
= gen_atomic_compare_and_swapdi_1 (target
, mem
, cmp
, src
, const0_rtx
);
4848 /* Codes for all the NVPTX builtins. */
4851 NVPTX_BUILTIN_SHUFFLE
,
4852 NVPTX_BUILTIN_SHUFFLELL
,
4853 NVPTX_BUILTIN_WORKER_ADDR
,
4854 NVPTX_BUILTIN_CMP_SWAP
,
4855 NVPTX_BUILTIN_CMP_SWAPLL
,
4859 static GTY(()) tree nvptx_builtin_decls
[NVPTX_BUILTIN_MAX
];
4861 /* Return the NVPTX builtin for CODE. */
4864 nvptx_builtin_decl (unsigned code
, bool ARG_UNUSED (initialize_p
))
4866 if (code
>= NVPTX_BUILTIN_MAX
)
4867 return error_mark_node
;
4869 return nvptx_builtin_decls
[code
];
4872 /* Set up all builtin functions for this target. */
4875 nvptx_init_builtins (void)
4877 #define DEF(ID, NAME, T) \
4878 (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID] \
4879 = add_builtin_function ("__builtin_nvptx_" NAME, \
4880 build_function_type_list T, \
4881 NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
4883 #define UINT unsigned_type_node
4884 #define LLUINT long_long_unsigned_type_node
4885 #define PTRVOID ptr_type_node
4887 DEF (SHUFFLE
, "shuffle", (UINT
, UINT
, UINT
, UINT
, NULL_TREE
));
4888 DEF (SHUFFLELL
, "shufflell", (LLUINT
, LLUINT
, UINT
, UINT
, NULL_TREE
));
4889 DEF (WORKER_ADDR
, "worker_addr",
4890 (PTRVOID
, ST
, UINT
, UINT
, NULL_TREE
));
4891 DEF (CMP_SWAP
, "cmp_swap", (UINT
, PTRVOID
, UINT
, UINT
, NULL_TREE
));
4892 DEF (CMP_SWAPLL
, "cmp_swapll", (LLUINT
, PTRVOID
, LLUINT
, LLUINT
, NULL_TREE
));
4901 /* Expand an expression EXP that calls a built-in function,
4902 with result going to TARGET if that's convenient
4903 (and in mode MODE if that's convenient).
4904 SUBTARGET may be used as the target for computing one of EXP's operands.
4905 IGNORE is nonzero if the value is to be ignored. */
4908 nvptx_expand_builtin (tree exp
, rtx target
, rtx
ARG_UNUSED (subtarget
),
4909 machine_mode mode
, int ignore
)
4911 tree fndecl
= TREE_OPERAND (CALL_EXPR_FN (exp
), 0);
4912 switch (DECL_FUNCTION_CODE (fndecl
))
4914 case NVPTX_BUILTIN_SHUFFLE
:
4915 case NVPTX_BUILTIN_SHUFFLELL
:
4916 return nvptx_expand_shuffle (exp
, target
, mode
, ignore
);
4918 case NVPTX_BUILTIN_WORKER_ADDR
:
4919 return nvptx_expand_worker_addr (exp
, target
, mode
, ignore
);
4921 case NVPTX_BUILTIN_CMP_SWAP
:
4922 case NVPTX_BUILTIN_CMP_SWAPLL
:
4923 return nvptx_expand_cmp_swap (exp
, target
, mode
, ignore
);
4925 default: gcc_unreachable ();
4929 /* Define dimension sizes for known hardware. */
4930 #define PTX_VECTOR_LENGTH 32
4931 #define PTX_WORKER_LENGTH 32
4932 #define PTX_GANG_DEFAULT 0 /* Defer to runtime. */
4934 /* Implement TARGET_SIMT_VF target hook: number of threads in a warp. */
4939 return PTX_VECTOR_LENGTH
;
4942 /* Validate compute dimensions of an OpenACC offload or routine, fill
4943 in non-unity defaults. FN_LEVEL indicates the level at which a
4944 routine might spawn a loop. It is negative for non-routines. If
4945 DECL is null, we are validating the default dimensions. */
4948 nvptx_goacc_validate_dims (tree decl
, int dims
[], int fn_level
)
4950 bool changed
= false;
4952 /* The vector size must be 32, unless this is a SEQ routine. */
4953 if (fn_level
<= GOMP_DIM_VECTOR
&& fn_level
>= -1
4954 && dims
[GOMP_DIM_VECTOR
] >= 0
4955 && dims
[GOMP_DIM_VECTOR
] != PTX_VECTOR_LENGTH
)
4957 if (fn_level
< 0 && dims
[GOMP_DIM_VECTOR
] >= 0)
4958 warning_at (decl
? DECL_SOURCE_LOCATION (decl
) : UNKNOWN_LOCATION
, 0,
4959 dims
[GOMP_DIM_VECTOR
]
4960 ? G_("using vector_length (%d), ignoring %d")
4961 : G_("using vector_length (%d), ignoring runtime setting"),
4962 PTX_VECTOR_LENGTH
, dims
[GOMP_DIM_VECTOR
]);
4963 dims
[GOMP_DIM_VECTOR
] = PTX_VECTOR_LENGTH
;
4967 /* Check the num workers is not too large. */
4968 if (dims
[GOMP_DIM_WORKER
] > PTX_WORKER_LENGTH
)
4970 warning_at (decl
? DECL_SOURCE_LOCATION (decl
) : UNKNOWN_LOCATION
, 0,
4971 "using num_workers (%d), ignoring %d",
4972 PTX_WORKER_LENGTH
, dims
[GOMP_DIM_WORKER
]);
4973 dims
[GOMP_DIM_WORKER
] = PTX_WORKER_LENGTH
;
4979 dims
[GOMP_DIM_VECTOR
] = PTX_VECTOR_LENGTH
;
4980 if (dims
[GOMP_DIM_WORKER
] < 0)
4981 dims
[GOMP_DIM_WORKER
] = PTX_WORKER_LENGTH
;
4982 if (dims
[GOMP_DIM_GANG
] < 0)
4983 dims
[GOMP_DIM_GANG
] = PTX_GANG_DEFAULT
;
4990 /* Return maximum dimension size, or zero for unbounded. */
4993 nvptx_dim_limit (int axis
)
4997 case GOMP_DIM_WORKER
:
4998 return PTX_WORKER_LENGTH
;
5000 case GOMP_DIM_VECTOR
:
5001 return PTX_VECTOR_LENGTH
;
5009 /* Determine whether fork & joins are needed. */
5012 nvptx_goacc_fork_join (gcall
*call
, const int dims
[],
5013 bool ARG_UNUSED (is_fork
))
5015 tree arg
= gimple_call_arg (call
, 2);
5016 unsigned axis
= TREE_INT_CST_LOW (arg
);
5018 /* We only care about worker and vector partitioning. */
5019 if (axis
< GOMP_DIM_WORKER
)
5022 /* If the size is 1, there's no partitioning. */
5023 if (dims
[axis
] == 1)
5029 /* Generate a PTX builtin function call that returns the address in
5030 the worker reduction buffer at OFFSET. TYPE is the type of the
5031 data at that location. */
5034 nvptx_get_worker_red_addr (tree type
, tree offset
)
5036 machine_mode mode
= TYPE_MODE (type
);
5037 tree fndecl
= nvptx_builtin_decl (NVPTX_BUILTIN_WORKER_ADDR
, true);
5038 tree size
= build_int_cst (unsigned_type_node
, GET_MODE_SIZE (mode
));
5039 tree align
= build_int_cst (unsigned_type_node
,
5040 GET_MODE_ALIGNMENT (mode
) / BITS_PER_UNIT
);
5041 tree call
= build_call_expr (fndecl
, 3, offset
, size
, align
);
5043 return fold_convert (build_pointer_type (type
), call
);
5046 /* Emit a SHFL.DOWN using index SHFL of VAR into DEST_VAR. This function
5047 will cast the variable if necessary. */
5050 nvptx_generate_vector_shuffle (location_t loc
,
5051 tree dest_var
, tree var
, unsigned shift
,
5054 unsigned fn
= NVPTX_BUILTIN_SHUFFLE
;
5055 tree_code code
= NOP_EXPR
;
5056 tree arg_type
= unsigned_type_node
;
5057 tree var_type
= TREE_TYPE (var
);
5058 tree dest_type
= var_type
;
5060 if (TREE_CODE (var_type
) == COMPLEX_TYPE
)
5061 var_type
= TREE_TYPE (var_type
);
5063 if (TREE_CODE (var_type
) == REAL_TYPE
)
5064 code
= VIEW_CONVERT_EXPR
;
5066 if (TYPE_SIZE (var_type
)
5067 == TYPE_SIZE (long_long_unsigned_type_node
))
5069 fn
= NVPTX_BUILTIN_SHUFFLELL
;
5070 arg_type
= long_long_unsigned_type_node
;
5073 tree call
= nvptx_builtin_decl (fn
, true);
5074 tree bits
= build_int_cst (unsigned_type_node
, shift
);
5075 tree kind
= build_int_cst (unsigned_type_node
, SHUFFLE_DOWN
);
5078 if (var_type
!= dest_type
)
5080 /* Do real and imaginary parts separately. */
5081 tree real
= fold_build1 (REALPART_EXPR
, var_type
, var
);
5082 real
= fold_build1 (code
, arg_type
, real
);
5083 real
= build_call_expr_loc (loc
, call
, 3, real
, bits
, kind
);
5084 real
= fold_build1 (code
, var_type
, real
);
5086 tree imag
= fold_build1 (IMAGPART_EXPR
, var_type
, var
);
5087 imag
= fold_build1 (code
, arg_type
, imag
);
5088 imag
= build_call_expr_loc (loc
, call
, 3, imag
, bits
, kind
);
5089 imag
= fold_build1 (code
, var_type
, imag
);
5091 expr
= fold_build2 (COMPLEX_EXPR
, dest_type
, real
, imag
);
5095 expr
= fold_build1 (code
, arg_type
, var
);
5096 expr
= build_call_expr_loc (loc
, call
, 3, expr
, bits
, kind
);
5097 expr
= fold_build1 (code
, dest_type
, expr
);
5100 gimplify_assign (dest_var
, expr
, seq
);
5103 /* Lazily generate the global lock var decl and return its address. */
5106 nvptx_global_lock_addr ()
5108 tree v
= global_lock_var
;
5112 tree name
= get_identifier ("__reduction_lock");
5113 tree type
= build_qualified_type (unsigned_type_node
,
5114 TYPE_QUAL_VOLATILE
);
5115 v
= build_decl (BUILTINS_LOCATION
, VAR_DECL
, name
, type
);
5116 global_lock_var
= v
;
5117 DECL_ARTIFICIAL (v
) = 1;
5118 DECL_EXTERNAL (v
) = 1;
5119 TREE_STATIC (v
) = 1;
5120 TREE_PUBLIC (v
) = 1;
5122 mark_addressable (v
);
5123 mark_decl_referenced (v
);
5126 return build_fold_addr_expr (v
);
5129 /* Insert code to locklessly update *PTR with *PTR OP VAR just before
5130 GSI. We use a lockless scheme for nearly all case, which looks
5132 actual = initval(OP);
5135 write = guess OP myval;
5136 actual = cmp&swap (ptr, guess, write)
5137 } while (actual bit-different-to guess);
5140 This relies on a cmp&swap instruction, which is available for 32-
5141 and 64-bit types. Larger types must use a locking scheme. */
5144 nvptx_lockless_update (location_t loc
, gimple_stmt_iterator
*gsi
,
5145 tree ptr
, tree var
, tree_code op
)
5147 unsigned fn
= NVPTX_BUILTIN_CMP_SWAP
;
5148 tree_code code
= NOP_EXPR
;
5149 tree arg_type
= unsigned_type_node
;
5150 tree var_type
= TREE_TYPE (var
);
5152 if (TREE_CODE (var_type
) == COMPLEX_TYPE
5153 || TREE_CODE (var_type
) == REAL_TYPE
)
5154 code
= VIEW_CONVERT_EXPR
;
5156 if (TYPE_SIZE (var_type
) == TYPE_SIZE (long_long_unsigned_type_node
))
5158 arg_type
= long_long_unsigned_type_node
;
5159 fn
= NVPTX_BUILTIN_CMP_SWAPLL
;
5162 tree swap_fn
= nvptx_builtin_decl (fn
, true);
5164 gimple_seq init_seq
= NULL
;
5165 tree init_var
= make_ssa_name (arg_type
);
5166 tree init_expr
= omp_reduction_init_op (loc
, op
, var_type
);
5167 init_expr
= fold_build1 (code
, arg_type
, init_expr
);
5168 gimplify_assign (init_var
, init_expr
, &init_seq
);
5169 gimple
*init_end
= gimple_seq_last (init_seq
);
5171 gsi_insert_seq_before (gsi
, init_seq
, GSI_SAME_STMT
);
5173 /* Split the block just after the init stmts. */
5174 basic_block pre_bb
= gsi_bb (*gsi
);
5175 edge pre_edge
= split_block (pre_bb
, init_end
);
5176 basic_block loop_bb
= pre_edge
->dest
;
5177 pre_bb
= pre_edge
->src
;
5178 /* Reset the iterator. */
5179 *gsi
= gsi_for_stmt (gsi_stmt (*gsi
));
5181 tree expect_var
= make_ssa_name (arg_type
);
5182 tree actual_var
= make_ssa_name (arg_type
);
5183 tree write_var
= make_ssa_name (arg_type
);
5185 /* Build and insert the reduction calculation. */
5186 gimple_seq red_seq
= NULL
;
5187 tree write_expr
= fold_build1 (code
, var_type
, expect_var
);
5188 write_expr
= fold_build2 (op
, var_type
, write_expr
, var
);
5189 write_expr
= fold_build1 (code
, arg_type
, write_expr
);
5190 gimplify_assign (write_var
, write_expr
, &red_seq
);
5192 gsi_insert_seq_before (gsi
, red_seq
, GSI_SAME_STMT
);
5194 /* Build & insert the cmp&swap sequence. */
5195 gimple_seq latch_seq
= NULL
;
5196 tree swap_expr
= build_call_expr_loc (loc
, swap_fn
, 3,
5197 ptr
, expect_var
, write_var
);
5198 gimplify_assign (actual_var
, swap_expr
, &latch_seq
);
5200 gcond
*cond
= gimple_build_cond (EQ_EXPR
, actual_var
, expect_var
,
5201 NULL_TREE
, NULL_TREE
);
5202 gimple_seq_add_stmt (&latch_seq
, cond
);
5204 gimple
*latch_end
= gimple_seq_last (latch_seq
);
5205 gsi_insert_seq_before (gsi
, latch_seq
, GSI_SAME_STMT
);
5207 /* Split the block just after the latch stmts. */
5208 edge post_edge
= split_block (loop_bb
, latch_end
);
5209 basic_block post_bb
= post_edge
->dest
;
5210 loop_bb
= post_edge
->src
;
5211 *gsi
= gsi_for_stmt (gsi_stmt (*gsi
));
5213 post_edge
->flags
^= EDGE_TRUE_VALUE
| EDGE_FALLTHRU
;
5214 post_edge
->probability
= profile_probability::even ();
5215 edge loop_edge
= make_edge (loop_bb
, loop_bb
, EDGE_FALSE_VALUE
);
5216 loop_edge
->probability
= profile_probability::even ();
5217 set_immediate_dominator (CDI_DOMINATORS
, loop_bb
, pre_bb
);
5218 set_immediate_dominator (CDI_DOMINATORS
, post_bb
, loop_bb
);
5220 gphi
*phi
= create_phi_node (expect_var
, loop_bb
);
5221 add_phi_arg (phi
, init_var
, pre_edge
, loc
);
5222 add_phi_arg (phi
, actual_var
, loop_edge
, loc
);
5224 loop
*loop
= alloc_loop ();
5225 loop
->header
= loop_bb
;
5226 loop
->latch
= loop_bb
;
5227 add_loop (loop
, loop_bb
->loop_father
);
5229 return fold_build1 (code
, var_type
, write_var
);
5232 /* Insert code to lockfully update *PTR with *PTR OP VAR just before
5233 GSI. This is necessary for types larger than 64 bits, where there
5234 is no cmp&swap instruction to implement a lockless scheme. We use
5235 a lock variable in global memory.
5237 while (cmp&swap (&lock_var, 0, 1))
5240 accum = accum OP var;
5242 cmp&swap (&lock_var, 1, 0);
5245 A lock in global memory is necessary to force execution engine
5246 descheduling and avoid resource starvation that can occur if the
5247 lock is in .shared memory. */
5250 nvptx_lockfull_update (location_t loc
, gimple_stmt_iterator
*gsi
,
5251 tree ptr
, tree var
, tree_code op
)
5253 tree var_type
= TREE_TYPE (var
);
5254 tree swap_fn
= nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP
, true);
5255 tree uns_unlocked
= build_int_cst (unsigned_type_node
, 0);
5256 tree uns_locked
= build_int_cst (unsigned_type_node
, 1);
5258 /* Split the block just before the gsi. Insert a gimple nop to make
5260 gimple
*nop
= gimple_build_nop ();
5261 gsi_insert_before (gsi
, nop
, GSI_SAME_STMT
);
5262 basic_block entry_bb
= gsi_bb (*gsi
);
5263 edge entry_edge
= split_block (entry_bb
, nop
);
5264 basic_block lock_bb
= entry_edge
->dest
;
5265 /* Reset the iterator. */
5266 *gsi
= gsi_for_stmt (gsi_stmt (*gsi
));
5268 /* Build and insert the locking sequence. */
5269 gimple_seq lock_seq
= NULL
;
5270 tree lock_var
= make_ssa_name (unsigned_type_node
);
5271 tree lock_expr
= nvptx_global_lock_addr ();
5272 lock_expr
= build_call_expr_loc (loc
, swap_fn
, 3, lock_expr
,
5273 uns_unlocked
, uns_locked
);
5274 gimplify_assign (lock_var
, lock_expr
, &lock_seq
);
5275 gcond
*cond
= gimple_build_cond (EQ_EXPR
, lock_var
, uns_unlocked
,
5276 NULL_TREE
, NULL_TREE
);
5277 gimple_seq_add_stmt (&lock_seq
, cond
);
5278 gimple
*lock_end
= gimple_seq_last (lock_seq
);
5279 gsi_insert_seq_before (gsi
, lock_seq
, GSI_SAME_STMT
);
5281 /* Split the block just after the lock sequence. */
5282 edge locked_edge
= split_block (lock_bb
, lock_end
);
5283 basic_block update_bb
= locked_edge
->dest
;
5284 lock_bb
= locked_edge
->src
;
5285 *gsi
= gsi_for_stmt (gsi_stmt (*gsi
));
5287 /* Create the lock loop ... */
5288 locked_edge
->flags
^= EDGE_TRUE_VALUE
| EDGE_FALLTHRU
;
5289 locked_edge
->probability
= profile_probability::even ();
5290 edge loop_edge
= make_edge (lock_bb
, lock_bb
, EDGE_FALSE_VALUE
);
5291 loop_edge
->probability
= profile_probability::even ();
5292 set_immediate_dominator (CDI_DOMINATORS
, lock_bb
, entry_bb
);
5293 set_immediate_dominator (CDI_DOMINATORS
, update_bb
, lock_bb
);
5295 /* ... and the loop structure. */
5296 loop
*lock_loop
= alloc_loop ();
5297 lock_loop
->header
= lock_bb
;
5298 lock_loop
->latch
= lock_bb
;
5299 lock_loop
->nb_iterations_estimate
= 1;
5300 lock_loop
->any_estimate
= true;
5301 add_loop (lock_loop
, entry_bb
->loop_father
);
5303 /* Build and insert the reduction calculation. */
5304 gimple_seq red_seq
= NULL
;
5305 tree acc_in
= make_ssa_name (var_type
);
5306 tree ref_in
= build_simple_mem_ref (ptr
);
5307 TREE_THIS_VOLATILE (ref_in
) = 1;
5308 gimplify_assign (acc_in
, ref_in
, &red_seq
);
5310 tree acc_out
= make_ssa_name (var_type
);
5311 tree update_expr
= fold_build2 (op
, var_type
, ref_in
, var
);
5312 gimplify_assign (acc_out
, update_expr
, &red_seq
);
5314 tree ref_out
= build_simple_mem_ref (ptr
);
5315 TREE_THIS_VOLATILE (ref_out
) = 1;
5316 gimplify_assign (ref_out
, acc_out
, &red_seq
);
5318 gsi_insert_seq_before (gsi
, red_seq
, GSI_SAME_STMT
);
5320 /* Build & insert the unlock sequence. */
5321 gimple_seq unlock_seq
= NULL
;
5322 tree unlock_expr
= nvptx_global_lock_addr ();
5323 unlock_expr
= build_call_expr_loc (loc
, swap_fn
, 3, unlock_expr
,
5324 uns_locked
, uns_unlocked
);
5325 gimplify_and_add (unlock_expr
, &unlock_seq
);
5326 gsi_insert_seq_before (gsi
, unlock_seq
, GSI_SAME_STMT
);
5331 /* Emit a sequence to update a reduction accumlator at *PTR with the
5332 value held in VAR using operator OP. Return the updated value.
5334 TODO: optimize for atomic ops and indepedent complex ops. */
5337 nvptx_reduction_update (location_t loc
, gimple_stmt_iterator
*gsi
,
5338 tree ptr
, tree var
, tree_code op
)
5340 tree type
= TREE_TYPE (var
);
5341 tree size
= TYPE_SIZE (type
);
5343 if (size
== TYPE_SIZE (unsigned_type_node
)
5344 || size
== TYPE_SIZE (long_long_unsigned_type_node
))
5345 return nvptx_lockless_update (loc
, gsi
, ptr
, var
, op
);
5347 return nvptx_lockfull_update (loc
, gsi
, ptr
, var
, op
);
5350 /* NVPTX implementation of GOACC_REDUCTION_SETUP. */
5353 nvptx_goacc_reduction_setup (gcall
*call
)
5355 gimple_stmt_iterator gsi
= gsi_for_stmt (call
);
5356 tree lhs
= gimple_call_lhs (call
);
5357 tree var
= gimple_call_arg (call
, 2);
5358 int level
= TREE_INT_CST_LOW (gimple_call_arg (call
, 3));
5359 gimple_seq seq
= NULL
;
5361 push_gimplify_context (true);
5363 if (level
!= GOMP_DIM_GANG
)
5365 /* Copy the receiver object. */
5366 tree ref_to_res
= gimple_call_arg (call
, 1);
5368 if (!integer_zerop (ref_to_res
))
5369 var
= build_simple_mem_ref (ref_to_res
);
5372 if (level
== GOMP_DIM_WORKER
)
5374 /* Store incoming value to worker reduction buffer. */
5375 tree offset
= gimple_call_arg (call
, 5);
5376 tree call
= nvptx_get_worker_red_addr (TREE_TYPE (var
), offset
);
5377 tree ptr
= make_ssa_name (TREE_TYPE (call
));
5379 gimplify_assign (ptr
, call
, &seq
);
5380 tree ref
= build_simple_mem_ref (ptr
);
5381 TREE_THIS_VOLATILE (ref
) = 1;
5382 gimplify_assign (ref
, var
, &seq
);
5386 gimplify_assign (lhs
, var
, &seq
);
5388 pop_gimplify_context (NULL
);
5389 gsi_replace_with_seq (&gsi
, seq
, true);
5392 /* NVPTX implementation of GOACC_REDUCTION_INIT. */
5395 nvptx_goacc_reduction_init (gcall
*call
)
5397 gimple_stmt_iterator gsi
= gsi_for_stmt (call
);
5398 tree lhs
= gimple_call_lhs (call
);
5399 tree var
= gimple_call_arg (call
, 2);
5400 int level
= TREE_INT_CST_LOW (gimple_call_arg (call
, 3));
5401 enum tree_code rcode
5402 = (enum tree_code
)TREE_INT_CST_LOW (gimple_call_arg (call
, 4));
5403 tree init
= omp_reduction_init_op (gimple_location (call
), rcode
,
5405 gimple_seq seq
= NULL
;
5407 push_gimplify_context (true);
5409 if (level
== GOMP_DIM_VECTOR
)
5411 /* Initialize vector-non-zeroes to INIT_VAL (OP). */
5412 tree tid
= make_ssa_name (integer_type_node
);
5413 tree dim_vector
= gimple_call_arg (call
, 3);
5414 gimple
*tid_call
= gimple_build_call_internal (IFN_GOACC_DIM_POS
, 1,
5416 gimple
*cond_stmt
= gimple_build_cond (NE_EXPR
, tid
, integer_zero_node
,
5417 NULL_TREE
, NULL_TREE
);
5419 gimple_call_set_lhs (tid_call
, tid
);
5420 gimple_seq_add_stmt (&seq
, tid_call
);
5421 gimple_seq_add_stmt (&seq
, cond_stmt
);
5423 /* Split the block just after the call. */
5424 edge init_edge
= split_block (gsi_bb (gsi
), call
);
5425 basic_block init_bb
= init_edge
->dest
;
5426 basic_block call_bb
= init_edge
->src
;
5428 /* Fixup flags from call_bb to init_bb. */
5429 init_edge
->flags
^= EDGE_FALLTHRU
| EDGE_TRUE_VALUE
;
5430 init_edge
->probability
= profile_probability::even ();
5432 /* Set the initialization stmts. */
5433 gimple_seq init_seq
= NULL
;
5434 tree init_var
= make_ssa_name (TREE_TYPE (var
));
5435 gimplify_assign (init_var
, init
, &init_seq
);
5436 gsi
= gsi_start_bb (init_bb
);
5437 gsi_insert_seq_before (&gsi
, init_seq
, GSI_SAME_STMT
);
5439 /* Split block just after the init stmt. */
5441 edge inited_edge
= split_block (gsi_bb (gsi
), gsi_stmt (gsi
));
5442 basic_block dst_bb
= inited_edge
->dest
;
5444 /* Create false edge from call_bb to dst_bb. */
5445 edge nop_edge
= make_edge (call_bb
, dst_bb
, EDGE_FALSE_VALUE
);
5446 nop_edge
->probability
= profile_probability::even ();
5448 /* Create phi node in dst block. */
5449 gphi
*phi
= create_phi_node (lhs
, dst_bb
);
5450 add_phi_arg (phi
, init_var
, inited_edge
, gimple_location (call
));
5451 add_phi_arg (phi
, var
, nop_edge
, gimple_location (call
));
5453 /* Reset dominator of dst bb. */
5454 set_immediate_dominator (CDI_DOMINATORS
, dst_bb
, call_bb
);
5456 /* Reset the gsi. */
5457 gsi
= gsi_for_stmt (call
);
5461 if (level
== GOMP_DIM_GANG
)
5463 /* If there's no receiver object, propagate the incoming VAR. */
5464 tree ref_to_res
= gimple_call_arg (call
, 1);
5465 if (integer_zerop (ref_to_res
))
5469 gimplify_assign (lhs
, init
, &seq
);
5472 pop_gimplify_context (NULL
);
5473 gsi_replace_with_seq (&gsi
, seq
, true);
5476 /* NVPTX implementation of GOACC_REDUCTION_FINI. */
5479 nvptx_goacc_reduction_fini (gcall
*call
)
5481 gimple_stmt_iterator gsi
= gsi_for_stmt (call
);
5482 tree lhs
= gimple_call_lhs (call
);
5483 tree ref_to_res
= gimple_call_arg (call
, 1);
5484 tree var
= gimple_call_arg (call
, 2);
5485 int level
= TREE_INT_CST_LOW (gimple_call_arg (call
, 3));
5487 = (enum tree_code
)TREE_INT_CST_LOW (gimple_call_arg (call
, 4));
5488 gimple_seq seq
= NULL
;
5489 tree r
= NULL_TREE
;;
5491 push_gimplify_context (true);
5493 if (level
== GOMP_DIM_VECTOR
)
5495 /* Emit binary shuffle tree. TODO. Emit this as an actual loop,
5496 but that requires a method of emitting a unified jump at the
5498 for (int shfl
= PTX_VECTOR_LENGTH
/ 2; shfl
> 0; shfl
= shfl
>> 1)
5500 tree other_var
= make_ssa_name (TREE_TYPE (var
));
5501 nvptx_generate_vector_shuffle (gimple_location (call
),
5502 other_var
, var
, shfl
, &seq
);
5504 r
= make_ssa_name (TREE_TYPE (var
));
5505 gimplify_assign (r
, fold_build2 (op
, TREE_TYPE (var
),
5506 var
, other_var
), &seq
);
5512 tree accum
= NULL_TREE
;
5514 if (level
== GOMP_DIM_WORKER
)
5516 /* Get reduction buffer address. */
5517 tree offset
= gimple_call_arg (call
, 5);
5518 tree call
= nvptx_get_worker_red_addr (TREE_TYPE (var
), offset
);
5519 tree ptr
= make_ssa_name (TREE_TYPE (call
));
5521 gimplify_assign (ptr
, call
, &seq
);
5524 else if (integer_zerop (ref_to_res
))
5531 /* UPDATE the accumulator. */
5532 gsi_insert_seq_before (&gsi
, seq
, GSI_SAME_STMT
);
5534 r
= nvptx_reduction_update (gimple_location (call
), &gsi
,
5540 gimplify_assign (lhs
, r
, &seq
);
5541 pop_gimplify_context (NULL
);
5543 gsi_replace_with_seq (&gsi
, seq
, true);
5546 /* NVPTX implementation of GOACC_REDUCTION_TEARDOWN. */
5549 nvptx_goacc_reduction_teardown (gcall
*call
)
5551 gimple_stmt_iterator gsi
= gsi_for_stmt (call
);
5552 tree lhs
= gimple_call_lhs (call
);
5553 tree var
= gimple_call_arg (call
, 2);
5554 int level
= TREE_INT_CST_LOW (gimple_call_arg (call
, 3));
5555 gimple_seq seq
= NULL
;
5557 push_gimplify_context (true);
5558 if (level
== GOMP_DIM_WORKER
)
5560 /* Read the worker reduction buffer. */
5561 tree offset
= gimple_call_arg (call
, 5);
5562 tree call
= nvptx_get_worker_red_addr(TREE_TYPE (var
), offset
);
5563 tree ptr
= make_ssa_name (TREE_TYPE (call
));
5565 gimplify_assign (ptr
, call
, &seq
);
5566 var
= build_simple_mem_ref (ptr
);
5567 TREE_THIS_VOLATILE (var
) = 1;
5570 if (level
!= GOMP_DIM_GANG
)
5572 /* Write to the receiver object. */
5573 tree ref_to_res
= gimple_call_arg (call
, 1);
5575 if (!integer_zerop (ref_to_res
))
5576 gimplify_assign (build_simple_mem_ref (ref_to_res
), var
, &seq
);
5580 gimplify_assign (lhs
, var
, &seq
);
5582 pop_gimplify_context (NULL
);
5584 gsi_replace_with_seq (&gsi
, seq
, true);
5587 /* NVPTX reduction expander. */
5590 nvptx_goacc_reduction (gcall
*call
)
5592 unsigned code
= (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call
, 0));
5596 case IFN_GOACC_REDUCTION_SETUP
:
5597 nvptx_goacc_reduction_setup (call
);
5600 case IFN_GOACC_REDUCTION_INIT
:
5601 nvptx_goacc_reduction_init (call
);
5604 case IFN_GOACC_REDUCTION_FINI
:
5605 nvptx_goacc_reduction_fini (call
);
5608 case IFN_GOACC_REDUCTION_TEARDOWN
:
5609 nvptx_goacc_reduction_teardown (call
);
5618 nvptx_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED
,
5619 rtx x ATTRIBUTE_UNUSED
)
5625 nvptx_vector_mode_supported (machine_mode mode
)
5627 return (mode
== V2SImode
5628 || mode
== V2DImode
);
5631 /* Return the preferred mode for vectorizing scalar MODE. */
5634 nvptx_preferred_simd_mode (scalar_mode mode
)
5644 return default_preferred_simd_mode (mode
);
5649 nvptx_data_alignment (const_tree type
, unsigned int basic_align
)
5651 if (TREE_CODE (type
) == INTEGER_TYPE
)
5653 unsigned HOST_WIDE_INT size
= tree_to_uhwi (TYPE_SIZE_UNIT (type
));
5654 if (size
== GET_MODE_SIZE (TImode
))
5655 return GET_MODE_BITSIZE (maybe_split_mode (TImode
));
5661 /* Implement TARGET_MODES_TIEABLE_P. */
5664 nvptx_modes_tieable_p (machine_mode
, machine_mode
)
5669 /* Implement TARGET_HARD_REGNO_NREGS. */
5672 nvptx_hard_regno_nregs (unsigned int, machine_mode
)
5677 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
5680 nvptx_can_change_mode_class (machine_mode
, machine_mode
, reg_class_t
)
5685 #undef TARGET_OPTION_OVERRIDE
5686 #define TARGET_OPTION_OVERRIDE nvptx_option_override
5688 #undef TARGET_ATTRIBUTE_TABLE
5689 #define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table
5692 #define TARGET_LRA_P hook_bool_void_false
5694 #undef TARGET_LEGITIMATE_ADDRESS_P
5695 #define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p
5697 #undef TARGET_PROMOTE_FUNCTION_MODE
5698 #define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode
5700 #undef TARGET_FUNCTION_ARG
5701 #define TARGET_FUNCTION_ARG nvptx_function_arg
5702 #undef TARGET_FUNCTION_INCOMING_ARG
5703 #define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
5704 #undef TARGET_FUNCTION_ARG_ADVANCE
5705 #define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
5706 #undef TARGET_FUNCTION_ARG_BOUNDARY
5707 #define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
5708 #undef TARGET_PASS_BY_REFERENCE
5709 #define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
5710 #undef TARGET_FUNCTION_VALUE_REGNO_P
5711 #define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
5712 #undef TARGET_FUNCTION_VALUE
5713 #define TARGET_FUNCTION_VALUE nvptx_function_value
5714 #undef TARGET_LIBCALL_VALUE
5715 #define TARGET_LIBCALL_VALUE nvptx_libcall_value
5716 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
5717 #define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
5718 #undef TARGET_GET_DRAP_RTX
5719 #define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
5720 #undef TARGET_SPLIT_COMPLEX_ARG
5721 #define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
5722 #undef TARGET_RETURN_IN_MEMORY
5723 #define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
5724 #undef TARGET_OMIT_STRUCT_RETURN_REG
5725 #define TARGET_OMIT_STRUCT_RETURN_REG true
5726 #undef TARGET_STRICT_ARGUMENT_NAMING
5727 #define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
5728 #undef TARGET_CALL_ARGS
5729 #define TARGET_CALL_ARGS nvptx_call_args
5730 #undef TARGET_END_CALL_ARGS
5731 #define TARGET_END_CALL_ARGS nvptx_end_call_args
5733 #undef TARGET_ASM_FILE_START
5734 #define TARGET_ASM_FILE_START nvptx_file_start
5735 #undef TARGET_ASM_FILE_END
5736 #define TARGET_ASM_FILE_END nvptx_file_end
5737 #undef TARGET_ASM_GLOBALIZE_LABEL
5738 #define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
5739 #undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
5740 #define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
5741 #undef TARGET_PRINT_OPERAND
5742 #define TARGET_PRINT_OPERAND nvptx_print_operand
5743 #undef TARGET_PRINT_OPERAND_ADDRESS
5744 #define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
5745 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
5746 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
5747 #undef TARGET_ASM_INTEGER
5748 #define TARGET_ASM_INTEGER nvptx_assemble_integer
5749 #undef TARGET_ASM_DECL_END
5750 #define TARGET_ASM_DECL_END nvptx_assemble_decl_end
5751 #undef TARGET_ASM_DECLARE_CONSTANT_NAME
5752 #define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
5753 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
5754 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
5755 #undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
5756 #define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true
5758 #undef TARGET_MACHINE_DEPENDENT_REORG
5759 #define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
5760 #undef TARGET_NO_REGISTER_ALLOCATION
5761 #define TARGET_NO_REGISTER_ALLOCATION true
5763 #undef TARGET_ENCODE_SECTION_INFO
5764 #define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info
5765 #undef TARGET_RECORD_OFFLOAD_SYMBOL
5766 #define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol
5768 #undef TARGET_VECTOR_ALIGNMENT
5769 #define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment
5771 #undef TARGET_CANNOT_COPY_INSN_P
5772 #define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p
5774 #undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
5775 #define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p
5777 #undef TARGET_INIT_BUILTINS
5778 #define TARGET_INIT_BUILTINS nvptx_init_builtins
5779 #undef TARGET_EXPAND_BUILTIN
5780 #define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
5781 #undef TARGET_BUILTIN_DECL
5782 #define TARGET_BUILTIN_DECL nvptx_builtin_decl
5784 #undef TARGET_SIMT_VF
5785 #define TARGET_SIMT_VF nvptx_simt_vf
5787 #undef TARGET_GOACC_VALIDATE_DIMS
5788 #define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims
5790 #undef TARGET_GOACC_DIM_LIMIT
5791 #define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit
5793 #undef TARGET_GOACC_FORK_JOIN
5794 #define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join
5796 #undef TARGET_GOACC_REDUCTION
5797 #define TARGET_GOACC_REDUCTION nvptx_goacc_reduction
5799 #undef TARGET_CANNOT_FORCE_CONST_MEM
5800 #define TARGET_CANNOT_FORCE_CONST_MEM nvptx_cannot_force_const_mem
5802 #undef TARGET_VECTOR_MODE_SUPPORTED_P
5803 #define TARGET_VECTOR_MODE_SUPPORTED_P nvptx_vector_mode_supported
5805 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
5806 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
5807 nvptx_preferred_simd_mode
5809 #undef TARGET_MODES_TIEABLE_P
5810 #define TARGET_MODES_TIEABLE_P nvptx_modes_tieable_p
5812 #undef TARGET_HARD_REGNO_NREGS
5813 #define TARGET_HARD_REGNO_NREGS nvptx_hard_regno_nregs
5815 #undef TARGET_CAN_CHANGE_MODE_CLASS
5816 #define TARGET_CAN_CHANGE_MODE_CLASS nvptx_can_change_mode_class
5818 struct gcc_target targetm
= TARGET_INITIALIZER
;
5820 #include "gt-nvptx.h"