/* Target code for NVPTX.
   Copyright (C) 2014-2019 Free Software Foundation, Inc.
   Contributed by Bernd Schmidt <bernds@codesourcery.com>

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published
   by the Free Software Foundation; either version 3, or (at your
   option) any later version.

   GCC is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
   License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */

#define IN_TARGET_CODE 1

#include "coretypes.h"
#include "diagnostic.h"
#include "insn-flags.h"
#include "insn-attr.h"
#include "tm-constrs.h"
#include "langhooks.h"
#include "stor-layout.h"
#include "omp-general.h"
#include "omp-offload.h"
#include "gomp-constants.h"
#include "internal-fn.h"
#include "gimple-iterator.h"
#include "stringpool.h"
#include "tree-ssa-operands.h"
#include "tree-ssanames.h"
#include "tree-phinodes.h"
#include "fold-const.h"

/* This file should be included last.  */
#include "target-def.h"

#define WORKAROUND_PTXJIT_BUG 1
#define WORKAROUND_PTXJIT_BUG_2 1
#define WORKAROUND_PTXJIT_BUG_3 1

#define PTX_WARP_SIZE 32
#define PTX_DEFAULT_VECTOR_LENGTH PTX_WARP_SIZE
#define PTX_MAX_VECTOR_LENGTH PTX_WARP_SIZE
#define PTX_WORKER_LENGTH 32
#define PTX_DEFAULT_RUNTIME_DIM 0 /* Defer to runtime.  */

/* The PTX concept CTA (Concurrent Thread Array) maps onto the CUDA concept
   of a thread block, which has had a maximum of 1024 threads since CUDA
   version 2.x.  */
#define PTX_CTA_SIZE 1024

/* The various PTX memory areas an object might reside in.  */
enum nvptx_data_area
{
  DATA_AREA_GENERIC,
  DATA_AREA_GLOBAL,
  DATA_AREA_SHARED,
  DATA_AREA_LOCAL,
  DATA_AREA_CONST,
  DATA_AREA_PARAM
};

/* We record the data area in the target symbol flags.  */
#define SYMBOL_DATA_AREA(SYM) \
  (nvptx_data_area)((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \
		    & 7)
#define SET_SYMBOL_DATA_AREA(SYM,AREA) \
  (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)
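
/* For example, after SET_SYMBOL_DATA_AREA (sym, DATA_AREA_SHARED), a later
   SYMBOL_DATA_AREA (sym) recovers DATA_AREA_SHARED from the
   machine-dependent bits of the symbol's flags.  */
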
/* Record the function decls we've written, and the libfuncs and function
   decls corresponding to them.  */
static std::stringstream func_decls;

struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
{
  static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
  static bool equal (rtx a, rtx b) { return a == b; }
};

static GTY((cache))
  hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;

struct tree_hasher : ggc_cache_ptr_hash<tree_node>
{
  static hashval_t hash (tree t) { return htab_hash_pointer (t); }
  static bool equal (tree a, tree b) { return a == b; }
};

static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;

/* Buffer needed to broadcast across workers and vectors.  This is
   used for both worker-neutering and worker broadcasting, and
   vector-neutering and broadcasting when vector_length > 32.  It is
   shared by all functions emitted.  The buffer is placed in shared
   memory.  It'd be nice if PTX supported common blocks, because then
   this could be shared across TUs (taking the largest size).  */
static unsigned oacc_bcast_size;
static unsigned oacc_bcast_partition;
static unsigned oacc_bcast_align;
static GTY(()) rtx oacc_bcast_sym;

/* Buffer needed for worker reductions.  This has to be distinct from
   the worker broadcast array, as both may be live concurrently.  */
static unsigned worker_red_size;
static unsigned worker_red_align;
static GTY(()) rtx worker_red_sym;

/* Buffer needed for vector reductions, when vector_length >
   PTX_WARP_SIZE.  This has to be distinct from the worker broadcast
   array, as both may be live concurrently.  */
static unsigned vector_red_size;
static unsigned vector_red_align;
static unsigned vector_red_partition;
static GTY(()) rtx vector_red_sym;

/* Global lock variable, needed for 128bit worker & gang reductions.  */
static GTY(()) tree global_lock_var;

/* True if any function references __nvptx_stacks.  */
static bool need_softstack_decl;

/* True if any function references __nvptx_uni.  */
static bool need_unisimt_decl;

static int nvptx_mach_max_workers ();

/* Allocate a new, cleared machine_function structure.  */

static struct machine_function *
nvptx_init_machine_status (void)
{
  struct machine_function *p = ggc_cleared_alloc<machine_function> ();
  p->return_mode = VOIDmode;
  return p;
}

/* Issue a diagnostic when option OPTNAME is enabled (as indicated by OPTVAL)
   and -fopenacc is also enabled.  */

static void
diagnose_openacc_conflict (bool optval, const char *optname)
{
  if (flag_openacc && optval)
    error ("option %s is not supported together with -fopenacc", optname);
}

/* Implement TARGET_OPTION_OVERRIDE.  */

static void
nvptx_option_override (void)
{
  init_machine_status = nvptx_init_machine_status;

  /* Set toplevel_reorder, unless explicitly disabled.  We need
     reordering so that we emit necessary assembler decls of
     undeclared variables.  */
  if (!global_options_set.x_flag_toplevel_reorder)
    flag_toplevel_reorder = 1;

  debug_nonbind_markers_p = 0;

  /* Set flag_no_common, unless explicitly disabled.  We fake common
     using .weak, and that's not entirely accurate, so avoid it
     altogether.  */
  if (!global_options_set.x_flag_no_common)
    flag_no_common = 1;

  /* The patch area requires nops, which we don't have.  */
  if (function_entry_patch_area_size > 0)
    sorry ("not generating patch area, nops not supported");

  /* Assumes that it will see only hard registers.  */
  flag_var_tracking = 0;

  if (nvptx_optimize < 0)
    nvptx_optimize = optimize > 0;

  declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  declared_libfuncs_htab
    = hash_table<declared_libfunc_hasher>::create_ggc (17);

  oacc_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__oacc_bcast");
  SET_SYMBOL_DATA_AREA (oacc_bcast_sym, DATA_AREA_SHARED);
  oacc_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
  oacc_bcast_partition = 0;

  worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
  SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
  worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;

  vector_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__vector_red");
  SET_SYMBOL_DATA_AREA (vector_red_sym, DATA_AREA_SHARED);
  vector_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
  vector_red_partition = 0;

  diagnose_openacc_conflict (TARGET_GOMP, "-mgomp");
  diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack");
  diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt");

  if (TARGET_GOMP)
    target_flags |= MASK_SOFT_STACK | MASK_UNIFORM_SIMT;
}

/* Return a ptx type for MODE.  If PROMOTE, then use .u32 for QImode to
   deal with ptx idiosyncrasies.  */

static const char *
nvptx_ptx_type_from_mode (machine_mode mode, bool promote)

/* Encode the PTX data area that DECL (which might not actually be a
   _DECL) should reside in.  */

static void
nvptx_encode_section_info (tree decl, rtx rtl, int first)
{
  default_encode_section_info (decl, rtl, first);
  if (first && MEM_P (rtl))
    {
      nvptx_data_area area = DATA_AREA_GENERIC;

      if (TREE_CONSTANT (decl))
	area = DATA_AREA_CONST;
      else if (TREE_CODE (decl) == VAR_DECL)
	{
	  if (lookup_attribute ("shared", DECL_ATTRIBUTES (decl)))
	    {
	      area = DATA_AREA_SHARED;
	      if (DECL_INITIAL (decl))
		error ("static initialization of variable %q+D in %<.shared%>"
		       " memory is not supported", decl);
	    }
	  else
	    area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;
	}

      SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area);
    }
}

/* Return the PTX name of the data area in which SYM should be
   placed.  The symbol must have already been processed by
   nvptx_encode_section_info, or equivalent.  */

static const char *
section_for_sym (rtx sym)
{
  nvptx_data_area area = SYMBOL_DATA_AREA (sym);
  /* Same order as nvptx_data_area enum.  */
  static char const *const areas[] =
    {"", ".global", ".shared", ".local", ".const", ".param"};

  return areas[area];
}

/* Similarly for a decl.  */

static const char *
section_for_decl (const_tree decl)
{
  return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree, decl)), 0));
}

343 /* Check NAME for special function names and redirect them by returning a
344 replacement. This applies to malloc, free and realloc, for which we
345 want to use libgcc wrappers, and call, which triggers a bug in
346 ptxas. We can't use TARGET_MANGLE_DECL_ASSEMBLER_NAME, as that's
347 not active in an offload compiler -- the names are all set by the
348 host-side compiler. */
351 nvptx_name_replacement (const char *name
)
353 if (strcmp (name
, "call") == 0)
354 return "__nvptx_call";
355 if (strcmp (name
, "malloc") == 0)
356 return "__nvptx_malloc";
357 if (strcmp (name
, "free") == 0)
358 return "__nvptx_free";
359 if (strcmp (name
, "realloc") == 0)
360 return "__nvptx_realloc";
364 /* If MODE should be treated as two registers of an inner mode, return
365 that inner mode. Otherwise return VOIDmode. */
368 maybe_split_mode (machine_mode mode
)
370 if (COMPLEX_MODE_P (mode
))
371 return GET_MODE_INNER (mode
);
379 /* Return true if mode should be treated as two registers. */
382 split_mode_p (machine_mode mode
)
384 return maybe_split_mode (mode
) != VOIDmode
;
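/* For example, a complex double (DCmode) value is handled as two DFmode
   registers, so split_mode_p is true for complex modes and false for plain
   SImode or DFmode values.  */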
387 /* Output a register, subreg, or register pair (with optional
388 enclosing braces). */
391 output_reg (FILE *file
, unsigned regno
, machine_mode inner_mode
,
392 int subreg_offset
= -1)
394 if (inner_mode
== VOIDmode
)
396 if (HARD_REGISTER_NUM_P (regno
))
397 fprintf (file
, "%s", reg_names
[regno
]);
399 fprintf (file
, "%%r%d", regno
);
401 else if (subreg_offset
>= 0)
403 output_reg (file
, regno
, VOIDmode
);
404 fprintf (file
, "$%d", subreg_offset
);
408 if (subreg_offset
== -1)
410 output_reg (file
, regno
, inner_mode
, GET_MODE_SIZE (inner_mode
));
412 output_reg (file
, regno
, inner_mode
, 0);
413 if (subreg_offset
== -1)
418 /* Emit forking instructions for MASK. */
421 nvptx_emit_forking (unsigned mask
, bool is_call
)
423 mask
&= (GOMP_DIM_MASK (GOMP_DIM_WORKER
)
424 | GOMP_DIM_MASK (GOMP_DIM_VECTOR
));
427 rtx op
= GEN_INT (mask
| (is_call
<< GOMP_DIM_MAX
));
429 /* Emit fork at all levels. This helps form SESE regions, as
430 it creates a block with a single successor before entering a
431 partitioned region. That is a good candidate for the end of
433 emit_insn (gen_nvptx_fork (op
));
434 emit_insn (gen_nvptx_forked (op
));
438 /* Emit joining instructions for MASK. */
441 nvptx_emit_joining (unsigned mask
, bool is_call
)
443 mask
&= (GOMP_DIM_MASK (GOMP_DIM_WORKER
)
444 | GOMP_DIM_MASK (GOMP_DIM_VECTOR
));
447 rtx op
= GEN_INT (mask
| (is_call
<< GOMP_DIM_MAX
));
449 /* Emit joining for all non-call pars to ensure there's a single
450 predecessor for the block the join insn ends up in. This is
451 needed for skipping entire loops. */
452 emit_insn (gen_nvptx_joining (op
));
453 emit_insn (gen_nvptx_join (op
));
458 /* Determine whether MODE and TYPE (possibly NULL) should be passed or
459 returned in memory. Integer and floating types supported by the
460 machine are passed in registers, everything else is passed in
461 memory. Complex types are split. */
464 pass_in_memory (machine_mode mode
, const_tree type
, bool for_return
)
468 if (AGGREGATE_TYPE_P (type
))
470 if (TREE_CODE (type
) == VECTOR_TYPE
)
474 if (!for_return
&& COMPLEX_MODE_P (mode
))
475 /* Complex types are passed as two underlying args. */
476 mode
= GET_MODE_INNER (mode
);
478 if (GET_MODE_CLASS (mode
) != MODE_INT
479 && GET_MODE_CLASS (mode
) != MODE_FLOAT
)
482 if (GET_MODE_SIZE (mode
) > UNITS_PER_WORD
)
488 /* A non-memory argument of mode MODE is being passed, determine the mode it
489 should be promoted to. This is also used for determining return
493 promote_arg (machine_mode mode
, bool prototyped
)
495 if (!prototyped
&& mode
== SFmode
)
496 /* K&R float promotion for unprototyped functions. */
498 else if (GET_MODE_SIZE (mode
) < GET_MODE_SIZE (SImode
))
504 /* A non-memory return type of MODE is being returned. Determine the
505 mode it should be promoted to. */
508 promote_return (machine_mode mode
)
510 return promote_arg (mode
, true);
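/* For example, a QImode or HImode argument is widened to at least SImode by
   promote_arg, and an SFmode argument of an unprototyped function follows
   the K&R float promotion; promote_return applies the same rules to return
   values, treating them as prototyped.  */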
513 /* Implement TARGET_FUNCTION_ARG. */
516 nvptx_function_arg (cumulative_args_t
ARG_UNUSED (cum_v
), machine_mode mode
,
517 const_tree
, bool named
)
519 if (mode
== VOIDmode
|| !named
)
522 return gen_reg_rtx (mode
);
525 /* Implement TARGET_FUNCTION_INCOMING_ARG. */
528 nvptx_function_incoming_arg (cumulative_args_t cum_v
, machine_mode mode
,
529 const_tree
, bool named
)
531 CUMULATIVE_ARGS
*cum
= get_cumulative_args (cum_v
);
533 if (mode
== VOIDmode
|| !named
)
536 /* No need to deal with split modes here, the only case that can
537 happen is complex modes and those are dealt with by
538 TARGET_SPLIT_COMPLEX_ARG. */
539 return gen_rtx_UNSPEC (mode
,
540 gen_rtvec (1, GEN_INT (cum
->count
)),
544 /* Implement TARGET_FUNCTION_ARG_ADVANCE. */
547 nvptx_function_arg_advance (cumulative_args_t cum_v
,
548 machine_mode
ARG_UNUSED (mode
),
549 const_tree
ARG_UNUSED (type
),
550 bool ARG_UNUSED (named
))
552 CUMULATIVE_ARGS
*cum
= get_cumulative_args (cum_v
);
557 /* Implement TARGET_FUNCTION_ARG_BOUNDARY.
559 For nvptx, this is only used for variadic args. The type has already
560 been promoted and/or converted to invisible reference. */
563 nvptx_function_arg_boundary (machine_mode mode
, const_tree
ARG_UNUSED (type
))
565 return GET_MODE_ALIGNMENT (mode
);
568 /* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.
570 For nvptx, we know how to handle functions declared as stdarg: by
571 passing an extra pointer to the unnamed arguments. However, the
572 Fortran frontend can produce a different situation, where a
573 function pointer is declared with no arguments, but the actual
574 function and calls to it take more arguments. In that case, we
575 want to ensure the call matches the definition of the function. */
578 nvptx_strict_argument_naming (cumulative_args_t cum_v
)
580 CUMULATIVE_ARGS
*cum
= get_cumulative_args (cum_v
);
582 return cum
->fntype
== NULL_TREE
|| stdarg_p (cum
->fntype
);
585 /* Implement TARGET_LIBCALL_VALUE. */
588 nvptx_libcall_value (machine_mode mode
, const_rtx
)
590 if (!cfun
|| !cfun
->machine
->doing_call
)
591 /* Pretend to return in a hard reg for early uses before pseudos can be
593 return gen_rtx_REG (mode
, NVPTX_RETURN_REGNUM
);
595 return gen_reg_rtx (mode
);
598 /* TARGET_FUNCTION_VALUE implementation. Returns an RTX representing the place
599 where function FUNC returns or receives a value of data type TYPE. */
602 nvptx_function_value (const_tree type
, const_tree
ARG_UNUSED (func
),
605 machine_mode mode
= promote_return (TYPE_MODE (type
));
610 cfun
->machine
->return_mode
= mode
;
611 return gen_rtx_REG (mode
, NVPTX_RETURN_REGNUM
);
614 return nvptx_libcall_value (mode
, NULL_RTX
);
617 /* Implement TARGET_FUNCTION_VALUE_REGNO_P. */
620 nvptx_function_value_regno_p (const unsigned int regno
)
622 return regno
== NVPTX_RETURN_REGNUM
;
625 /* Types with a mode other than those supported by the machine are passed by
626 reference in memory. */
629 nvptx_pass_by_reference (cumulative_args_t
ARG_UNUSED (cum
),
630 machine_mode mode
, const_tree type
,
631 bool ARG_UNUSED (named
))
633 return pass_in_memory (mode
, type
, false);
636 /* Implement TARGET_RETURN_IN_MEMORY. */
639 nvptx_return_in_memory (const_tree type
, const_tree
)
641 return pass_in_memory (TYPE_MODE (type
), type
, true);
644 /* Implement TARGET_PROMOTE_FUNCTION_MODE. */
647 nvptx_promote_function_mode (const_tree type
, machine_mode mode
,
648 int *ARG_UNUSED (punsignedp
),
649 const_tree funtype
, int for_return
)
651 return promote_arg (mode
, for_return
|| !type
|| TYPE_ARG_TYPES (funtype
));
654 /* Helper for write_arg. Emit a single PTX argument of MODE, either
655 in a prototype, or as copy in a function prologue. ARGNO is the
656 index of this argument in the PTX function. FOR_REG is negative,
657 if we're emitting the PTX prototype. It is zero if we're copying
658 to an argument register and it is greater than zero if we're
659 copying to a specific hard register. */
662 write_arg_mode (std::stringstream
&s
, int for_reg
, int argno
,
665 const char *ptx_type
= nvptx_ptx_type_from_mode (mode
, false);
669 /* Writing PTX prototype. */
670 s
<< (argno
? ", " : " (");
671 s
<< ".param" << ptx_type
<< " %in_ar" << argno
;
675 s
<< "\t.reg" << ptx_type
<< " ";
677 s
<< reg_names
[for_reg
];
683 s
<< "\tld.param" << ptx_type
<< " ";
685 s
<< reg_names
[for_reg
];
688 s
<< ", [%in_ar" << argno
<< "];\n";
694 /* Process function parameter TYPE to emit one or more PTX
695 arguments. S, FOR_REG and ARGNO as for write_arg_mode. PROTOTYPED
696 is true, if this is a prototyped function, rather than an old-style
697 C declaration. Returns the next argument number to use.
699 The promotion behavior here must match the regular GCC function
700 parameter marshalling machinery. */
703 write_arg_type (std::stringstream
&s
, int for_reg
, int argno
,
704 tree type
, bool prototyped
)
706 machine_mode mode
= TYPE_MODE (type
);
708 if (mode
== VOIDmode
)
711 if (pass_in_memory (mode
, type
, false))
715 bool split
= TREE_CODE (type
) == COMPLEX_TYPE
;
719 /* Complex types are sent as two separate args. */
720 type
= TREE_TYPE (type
);
721 mode
= TYPE_MODE (type
);
725 mode
= promote_arg (mode
, prototyped
);
727 argno
= write_arg_mode (s
, for_reg
, argno
, mode
);
730 return write_arg_mode (s
, for_reg
, argno
, mode
);
733 /* Emit a PTX return as a prototype or function prologue declaration
737 write_return_mode (std::stringstream
&s
, bool for_proto
, machine_mode mode
)
739 const char *ptx_type
= nvptx_ptx_type_from_mode (mode
, false);
740 const char *pfx
= "\t.reg";
741 const char *sfx
= ";\n";
744 pfx
= "(.param", sfx
= "_out) ";
746 s
<< pfx
<< ptx_type
<< " " << reg_names
[NVPTX_RETURN_REGNUM
] << sfx
;
749 /* Process a function return TYPE to emit a PTX return as a prototype
750 or function prologue declaration. Returns true if return is via an
751 additional pointer parameter. The promotion behavior here must
752 match the regular GCC function return marshalling. */
755 write_return_type (std::stringstream
&s
, bool for_proto
, tree type
)
757 machine_mode mode
= TYPE_MODE (type
);
759 if (mode
== VOIDmode
)
762 bool return_in_mem
= pass_in_memory (mode
, type
, true);
767 return return_in_mem
;
769 /* Named return values can cause us to return a pointer as well
770 as expect an argument for the return location. This is
771 optimization-level specific, so no caller can make use of
772 this data, but more importantly for us, we must ensure it
773 doesn't change the PTX prototype. */
774 mode
= (machine_mode
) cfun
->machine
->return_mode
;
776 if (mode
== VOIDmode
)
777 return return_in_mem
;
779 /* Clear return_mode to inhibit copy of retval to non-existent
781 cfun
->machine
->return_mode
= VOIDmode
;
784 mode
= promote_return (mode
);
786 write_return_mode (s
, for_proto
, mode
);
788 return return_in_mem
;
791 /* Look for attributes in ATTRS that would indicate we must write a function
792 as a .entry kernel rather than a .func. Return true if one is found. */
795 write_as_kernel (tree attrs
)
797 return (lookup_attribute ("kernel", attrs
) != NULL_TREE
798 || (lookup_attribute ("omp target entrypoint", attrs
) != NULL_TREE
799 && lookup_attribute ("oacc function", attrs
) != NULL_TREE
));
800 /* For OpenMP target regions, the corresponding kernel entry is emitted from
801 write_omp_entry as a separate function. */
804 /* Emit a linker marker for a function decl or defn. */
807 write_fn_marker (std::stringstream
&s
, bool is_defn
, bool globalize
,
813 s
<< " FUNCTION " << (is_defn
? "DEF: " : "DECL: ");
817 /* Emit a linker marker for a variable decl or defn. */
820 write_var_marker (FILE *file
, bool is_defn
, bool globalize
, const char *name
)
822 fprintf (file
, "\n// BEGIN%s VAR %s: ",
823 globalize
? " GLOBAL" : "",
824 is_defn
? "DEF" : "DECL");
825 assemble_name_raw (file
, name
);
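/* For example, a public variable definition named "foo" is preceded by
   "// BEGIN GLOBAL VAR DEF: foo"; write_fn_marker above emits the analogous
   FUNCTION "DEF: "/"DECL: " markers.  These are the linker markers referred
   to in the comments above.  */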
829 /* Write a .func or .kernel declaration or definition along with
830 a helper comment for use by ld. S is the stream to write to, DECL
831 the decl for the function with name NAME. For definitions, emit
832 a declaration too. */
835 write_fn_proto (std::stringstream
&s
, bool is_defn
,
836 const char *name
, const_tree decl
)
839 /* Emit a declaration. The PTX assembler gets upset without it. */
840 name
= write_fn_proto (s
, false, name
, decl
);
843 /* Avoid repeating the name replacement. */
844 name
= nvptx_name_replacement (name
);
849 write_fn_marker (s
, is_defn
, TREE_PUBLIC (decl
), name
);
851 /* PTX declaration. */
852 if (DECL_EXTERNAL (decl
))
854 else if (TREE_PUBLIC (decl
))
855 s
<< (DECL_WEAK (decl
) ? ".weak " : ".visible ");
856 s
<< (write_as_kernel (DECL_ATTRIBUTES (decl
)) ? ".entry " : ".func ");
858 tree fntype
= TREE_TYPE (decl
);
859 tree result_type
= TREE_TYPE (fntype
);
861 /* atomic_compare_exchange_$n builtins have an exceptional calling
863 int not_atomic_weak_arg
= -1;
864 if (DECL_BUILT_IN_CLASS (decl
) == BUILT_IN_NORMAL
)
865 switch (DECL_FUNCTION_CODE (decl
))
867 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_1
:
868 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_2
:
869 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_4
:
870 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_8
:
871 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_16
:
872 /* These atomics skip the 'weak' parm in an actual library
873 call. We must skip it in the prototype too. */
874 not_atomic_weak_arg
= 3;
881 /* Declare the result. */
882 bool return_in_mem
= write_return_type (s
, true, result_type
);
888 /* Emit argument list. */
890 argno
= write_arg_type (s
, -1, argno
, ptr_type_node
, true);
893 NULL in TYPE_ARG_TYPES, for old-style functions
894 NULL in DECL_ARGUMENTS, for builtin functions without another
896 So we have to pick the best one we have. */
897 tree args
= TYPE_ARG_TYPES (fntype
);
898 bool prototyped
= true;
901 args
= DECL_ARGUMENTS (decl
);
905 for (; args
; args
= TREE_CHAIN (args
), not_atomic_weak_arg
--)
907 tree type
= prototyped
? TREE_VALUE (args
) : TREE_TYPE (args
);
909 if (not_atomic_weak_arg
)
910 argno
= write_arg_type (s
, -1, argno
, type
, prototyped
);
912 gcc_assert (type
== boolean_type_node
);
915 if (stdarg_p (fntype
))
916 argno
= write_arg_type (s
, -1, argno
, ptr_type_node
, true);
918 if (DECL_STATIC_CHAIN (decl
))
919 argno
= write_arg_type (s
, -1, argno
, ptr_type_node
, true);
921 if (!argno
&& strcmp (name
, "main") == 0)
923 argno
= write_arg_type (s
, -1, argno
, integer_type_node
, true);
924 argno
= write_arg_type (s
, -1, argno
, ptr_type_node
, true);
930 s
<< (is_defn
? "\n" : ";\n");
935 /* Construct a function declaration from a call insn. This can be
936 necessary for two reasons - either we have an indirect call which
937 requires a .callprototype declaration, or we have a libcall
938 generated by emit_library_call for which no decl exists. */
941 write_fn_proto_from_insn (std::stringstream
&s
, const char *name
,
946 s
<< "\t.callprototype ";
951 name
= nvptx_name_replacement (name
);
952 write_fn_marker (s
, false, true, name
);
953 s
<< "\t.extern .func ";
956 if (result
!= NULL_RTX
)
957 write_return_mode (s
, true, GET_MODE (result
));
961 int arg_end
= XVECLEN (pat
, 0);
962 for (int i
= 1; i
< arg_end
; i
++)
964 /* We don't have to deal with mode splitting & promotion here,
965 as that was already done when generating the call
967 machine_mode mode
= GET_MODE (XEXP (XVECEXP (pat
, 0, i
), 0));
969 write_arg_mode (s
, -1, i
- 1, mode
);
976 /* DECL is an external FUNCTION_DECL, make sure it's in the fndecl hash
977 table and write a ptx prototype. These are emitted at end of
981 nvptx_record_fndecl (tree decl
)
983 tree
*slot
= declared_fndecls_htab
->find_slot (decl
, INSERT
);
987 const char *name
= get_fnname_from_decl (decl
);
988 write_fn_proto (func_decls
, false, name
, decl
);
992 /* Record a libcall or unprototyped external function. CALLEE is the
993 SYMBOL_REF. Insert into the libfunc hash table and emit a ptx
994 declaration for it. */
997 nvptx_record_libfunc (rtx callee
, rtx retval
, rtx pat
)
999 rtx
*slot
= declared_libfuncs_htab
->find_slot (callee
, INSERT
);
1004 const char *name
= XSTR (callee
, 0);
1005 write_fn_proto_from_insn (func_decls
, name
, retval
, pat
);
1009 /* DECL is an external FUNCTION_DECL, that we're referencing. If it
1010 is prototyped, record it now. Otherwise record it as needed at end
1011 of compilation, when we might have more information about it. */
1014 nvptx_record_needed_fndecl (tree decl
)
1016 if (TYPE_ARG_TYPES (TREE_TYPE (decl
)) == NULL_TREE
)
1018 tree
*slot
= needed_fndecls_htab
->find_slot (decl
, INSERT
);
1023 nvptx_record_fndecl (decl
);
1026 /* SYM is a SYMBOL_REF. If it refers to an external function, record
1030 nvptx_maybe_record_fnsym (rtx sym
)
1032 tree decl
= SYMBOL_REF_DECL (sym
);
1034 if (decl
&& TREE_CODE (decl
) == FUNCTION_DECL
&& DECL_EXTERNAL (decl
))
1035 nvptx_record_needed_fndecl (decl
);
1038 /* Emit a local array to hold some part of a conventional stack frame
1039 and initialize REGNO to point to it. If the size is zero, it'll
1040 never be valid to dereference, so we can simply initialize to
1044 init_frame (FILE *file
, int regno
, unsigned align
, unsigned size
)
1047 fprintf (file
, "\t.local .align %d .b8 %s_ar[%u];\n",
1048 align
, reg_names
[regno
], size
);
1049 fprintf (file
, "\t.reg.u%d %s;\n",
1050 POINTER_SIZE
, reg_names
[regno
]);
1051 fprintf (file
, (size
? "\tcvta.local.u%d %s, %s_ar;\n"
1052 : "\tmov.u%d %s, 0;\n"),
1053 POINTER_SIZE
, reg_names
[regno
], reg_names
[regno
]);
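/* As an illustration, a 16-byte frame emits something like
       .local .align 8 .b8 %frame_ar[16];
       .reg.u64 %frame;
       cvta.local.u64 %frame, %frame_ar;
   assuming 64-bit pointers and that REGNO prints as %frame, while a
   zero-sized frame is instead initialized with "mov.u64 %frame, 0;" so it
   is never a dereferenceable address.  */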
1056 /* Emit soft stack frame setup sequence. */
1059 init_softstack_frame (FILE *file
, unsigned alignment
, HOST_WIDE_INT size
)
1061 /* Maintain 64-bit stack alignment. */
1062 unsigned keep_align
= BIGGEST_ALIGNMENT
/ BITS_PER_UNIT
;
1063 size
= ROUND_UP (size
, keep_align
);
1064 int bits
= POINTER_SIZE
;
1065 const char *reg_stack
= reg_names
[STACK_POINTER_REGNUM
];
1066 const char *reg_frame
= reg_names
[FRAME_POINTER_REGNUM
];
1067 const char *reg_sspslot
= reg_names
[SOFTSTACK_SLOT_REGNUM
];
1068 const char *reg_sspprev
= reg_names
[SOFTSTACK_PREV_REGNUM
];
1069 fprintf (file
, "\t.reg.u%d %s;\n", bits
, reg_stack
);
1070 fprintf (file
, "\t.reg.u%d %s;\n", bits
, reg_frame
);
1071 fprintf (file
, "\t.reg.u%d %s;\n", bits
, reg_sspslot
);
1072 fprintf (file
, "\t.reg.u%d %s;\n", bits
, reg_sspprev
);
1073 fprintf (file
, "\t{\n");
1074 fprintf (file
, "\t\t.reg.u32 %%fstmp0;\n");
1075 fprintf (file
, "\t\t.reg.u%d %%fstmp1;\n", bits
);
1076 fprintf (file
, "\t\t.reg.u%d %%fstmp2;\n", bits
);
1077 fprintf (file
, "\t\tmov.u32 %%fstmp0, %%tid.y;\n");
1078 fprintf (file
, "\t\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n",
1079 bits
== 64 ? ".wide" : ".lo", bits
/ 8);
1080 fprintf (file
, "\t\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits
);
1082 /* Initialize %sspslot = &__nvptx_stacks[tid.y]. */
1083 fprintf (file
, "\t\tadd.u%d %s, %%fstmp2, %%fstmp1;\n", bits
, reg_sspslot
);
1085 /* Initialize %sspprev = __nvptx_stacks[tid.y]. */
1086 fprintf (file
, "\t\tld.shared.u%d %s, [%s];\n",
1087 bits
, reg_sspprev
, reg_sspslot
);
1089 /* Initialize %frame = %sspprev - size. */
1090 fprintf (file
, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC
";\n",
1091 bits
, reg_frame
, reg_sspprev
, size
);
1093 /* Apply alignment, if larger than 64. */
1094 if (alignment
> keep_align
)
1095 fprintf (file
, "\t\tand.b%d %s, %s, %d;\n",
1096 bits
, reg_frame
, reg_frame
, -alignment
);
1098 size
= crtl
->outgoing_args_size
;
1099 gcc_assert (size
% keep_align
== 0);
1101 /* Initialize %stack. */
1102 fprintf (file
, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC
";\n",
1103 bits
, reg_stack
, reg_frame
, size
);
1106 fprintf (file
, "\t\tst.shared.u%d [%s], %s;\n",
1107 bits
, reg_sspslot
, reg_stack
);
1108 fprintf (file
, "\t}\n");
1109 cfun
->machine
->has_softstack
= true;
1110 need_softstack_decl
= true;
1113 /* Emit code to initialize the REGNO predicate register to indicate
1114 whether we are not lane zero on the NAME axis. */
1117 nvptx_init_axis_predicate (FILE *file
, int regno
, const char *name
)
1119 fprintf (file
, "\t{\n");
1120 fprintf (file
, "\t\t.reg.u32\t%%%s;\n", name
);
1121 if (strcmp (name
, "x") == 0 && cfun
->machine
->red_partition
)
1123 fprintf (file
, "\t\t.reg.u64\t%%t_red;\n");
1124 fprintf (file
, "\t\t.reg.u64\t%%y64;\n");
1126 fprintf (file
, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name
, name
);
1127 fprintf (file
, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno
, name
);
1128 if (strcmp (name
, "x") == 0 && cfun
->machine
->red_partition
)
1130 fprintf (file
, "\t\tcvt.u64.u32\t%%y64, %%tid.y;\n");
1131 fprintf (file
, "\t\tcvta.shared.u64\t%%t_red, __vector_red;\n");
1132 fprintf (file
, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_red; "
1133 "// vector reduction buffer\n",
1134 REGNO (cfun
->machine
->red_partition
),
1135 vector_red_partition
);
1137 /* Verify vector_red_size. */
1138 gcc_assert (vector_red_partition
* nvptx_mach_max_workers ()
1139 <= vector_red_size
);
1140 fprintf (file
, "\t}\n");
1143 /* Emit code to initialize OpenACC worker broadcast and synchronization
1147 nvptx_init_oacc_workers (FILE *file
)
1149 fprintf (file
, "\t{\n");
1150 fprintf (file
, "\t\t.reg.u32\t%%tidy;\n");
1151 if (cfun
->machine
->bcast_partition
)
1153 fprintf (file
, "\t\t.reg.u64\t%%t_bcast;\n");
1154 fprintf (file
, "\t\t.reg.u64\t%%y64;\n");
1156 fprintf (file
, "\t\tmov.u32\t\t%%tidy, %%tid.y;\n");
1157 if (cfun
->machine
->bcast_partition
)
1159 fprintf (file
, "\t\tcvt.u64.u32\t%%y64, %%tidy;\n");
1160 fprintf (file
, "\t\tadd.u64\t\t%%y64, %%y64, 1; // vector ID\n");
1161 fprintf (file
, "\t\tcvta.shared.u64\t%%t_bcast, __oacc_bcast;\n");
1162 fprintf (file
, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_bcast; "
1163 "// vector broadcast offset\n",
1164 REGNO (cfun
->machine
->bcast_partition
),
1165 oacc_bcast_partition
);
1167 /* Verify oacc_bcast_size. */
1168 gcc_assert (oacc_bcast_partition
* (nvptx_mach_max_workers () + 1)
1169 <= oacc_bcast_size
);
1170 if (cfun
->machine
->sync_bar
)
1171 fprintf (file
, "\t\tadd.u32\t\t%%r%d, %%tidy, 1; "
1172 "// vector synchronization barrier\n",
1173 REGNO (cfun
->machine
->sync_bar
));
1174 fprintf (file
, "\t}\n");
1177 /* Emit code to initialize predicate and master lane index registers for
1178 -muniform-simt code generation variant. */
1181 nvptx_init_unisimt_predicate (FILE *file
)
1183 cfun
->machine
->unisimt_location
= gen_reg_rtx (Pmode
);
1184 int loc
= REGNO (cfun
->machine
->unisimt_location
);
1185 int bits
= POINTER_SIZE
;
1186 fprintf (file
, "\t.reg.u%d %%r%d;\n", bits
, loc
);
1187 fprintf (file
, "\t{\n");
1188 fprintf (file
, "\t\t.reg.u32 %%ustmp0;\n");
1189 fprintf (file
, "\t\t.reg.u%d %%ustmp1;\n", bits
);
1190 fprintf (file
, "\t\tmov.u32 %%ustmp0, %%tid.y;\n");
1191 fprintf (file
, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n",
1192 bits
== 64 ? ".wide" : ".lo");
1193 fprintf (file
, "\t\tmov.u%d %%r%d, __nvptx_uni;\n", bits
, loc
);
1194 fprintf (file
, "\t\tadd.u%d %%r%d, %%r%d, %%ustmp1;\n", bits
, loc
, loc
);
1195 if (cfun
->machine
->unisimt_predicate
)
1197 int master
= REGNO (cfun
->machine
->unisimt_master
);
1198 int pred
= REGNO (cfun
->machine
->unisimt_predicate
);
1199 fprintf (file
, "\t\tld.shared.u32 %%r%d, [%%r%d];\n", master
, loc
);
1200 fprintf (file
, "\t\tmov.u32 %%ustmp0, %%laneid;\n");
1201 /* Compute 'master lane index' as 'laneid & __nvptx_uni[tid.y]'. */
1202 fprintf (file
, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master
, master
);
1203 /* Compute predicate as 'tid.x == master'. */
1204 fprintf (file
, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred
, master
);
1206 fprintf (file
, "\t}\n");
1207 need_unisimt_decl
= true;
1210 /* Emit kernel NAME for function ORIG outlined for an OpenMP 'target' region:
1212 extern void gomp_nvptx_main (void (*fn)(void*), void *fnarg);
1213 void __attribute__((kernel)) NAME (void *arg, char *stack, size_t stacksize)
1215 __nvptx_stacks[tid.y] = stack + stacksize * (ctaid.x * ntid.y + tid.y + 1);
1216 __nvptx_uni[tid.y] = 0;
1217 gomp_nvptx_main (ORIG, arg);
1219 ORIG itself should not be emitted as a PTX .entry function. */
1222 write_omp_entry (FILE *file
, const char *name
, const char *orig
)
1224 static bool gomp_nvptx_main_declared
;
1225 if (!gomp_nvptx_main_declared
)
1227 gomp_nvptx_main_declared
= true;
1228 write_fn_marker (func_decls
, false, true, "gomp_nvptx_main");
1229 func_decls
<< ".extern .func gomp_nvptx_main (.param.u" << POINTER_SIZE
1230 << " %in_ar1, .param.u" << POINTER_SIZE
<< " %in_ar2);\n";
1232 /* PR79332. Single out this string; it confuses gcc.pot generation. */
1233 #define NTID_Y "%ntid.y"
1234 #define ENTRY_TEMPLATE(PS, PS_BYTES, MAD_PS_32) "\
1235 (.param.u" PS " %arg, .param.u" PS " %stack, .param.u" PS " %sz)\n\
1238 .reg.u" PS " %R<4>;\n\
1239 mov.u32 %r0, %tid.y;\n\
1240 mov.u32 %r1, " NTID_Y ";\n\
1241 mov.u32 %r2, %ctaid.x;\n\
1242 cvt.u" PS ".u32 %R1, %r0;\n\
1243 " MAD_PS_32 " %R1, %r1, %r2, %R1;\n\
1244 mov.u" PS " %R0, __nvptx_stacks;\n\
1245 " MAD_PS_32 " %R0, %r0, " PS_BYTES ", %R0;\n\
1246 ld.param.u" PS " %R2, [%stack];\n\
1247 ld.param.u" PS " %R3, [%sz];\n\
1248 add.u" PS " %R2, %R2, %R3;\n\
1249 mad.lo.u" PS " %R2, %R1, %R3, %R2;\n\
1250 st.shared.u" PS " [%R0], %R2;\n\
1251 mov.u" PS " %R0, __nvptx_uni;\n\
1252 " MAD_PS_32 " %R0, %r0, 4, %R0;\n\
1254 st.shared.u32 [%R0], %r0;\n\
1255 mov.u" PS " %R0, \0;\n\
1256 ld.param.u" PS " %R1, [%arg];\n\
1258 .param.u" PS " %P<2>;\n\
1259 st.param.u" PS " [%P0], %R0;\n\
1260 st.param.u" PS " [%P1], %R1;\n\
1261 call.uni gomp_nvptx_main, (%P0, %P1);\n\
1265 static const char entry64
[] = ENTRY_TEMPLATE ("64", "8", "mad.wide.u32");
1266 static const char entry32
[] = ENTRY_TEMPLATE ("32", "4", "mad.lo.u32 ");
1267 #undef ENTRY_TEMPLATE
1269 const char *entry_1
= TARGET_ABI64
? entry64
: entry32
;
1270 /* Position ENTRY_2 after the embedded nul using strlen of the prefix. */
1271 const char *entry_2
= entry_1
+ strlen (entry64
) + 1;
1272 fprintf (file
, ".visible .entry %s%s%s%s", name
, entry_1
, orig
, entry_2
);
1273 need_softstack_decl
= need_unisimt_decl
= true;
1276 /* Implement ASM_DECLARE_FUNCTION_NAME. Writes the start of a ptx
1277 function, including local var decls and copies from the arguments to
1281 nvptx_declare_function_name (FILE *file
, const char *name
, const_tree decl
)
1283 tree fntype
= TREE_TYPE (decl
);
1284 tree result_type
= TREE_TYPE (fntype
);
1287 if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl
))
1288 && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl
)))
1290 char *buf
= (char *) alloca (strlen (name
) + sizeof ("$impl"));
1291 sprintf (buf
, "%s$impl", name
);
1292 write_omp_entry (file
, name
, buf
);
1295 /* We construct the initial part of the function into a string
1296 stream, in order to share the prototype writing code. */
1297 std::stringstream s
;
1298 write_fn_proto (s
, true, name
, decl
);
1301 bool return_in_mem
= write_return_type (s
, false, result_type
);
1303 argno
= write_arg_type (s
, 0, argno
, ptr_type_node
, true);
1305 /* Declare and initialize incoming arguments. */
1306 tree args
= TYPE_ARG_TYPES (fntype
);
1307 bool prototyped
= true;
1310 args
= DECL_ARGUMENTS (decl
);
1314 for (; args
!= NULL_TREE
; args
= TREE_CHAIN (args
))
1316 tree type
= prototyped
? TREE_VALUE (args
) : TREE_TYPE (args
);
1318 argno
= write_arg_type (s
, 0, argno
, type
, prototyped
);
1321 if (stdarg_p (fntype
))
1322 argno
= write_arg_type (s
, ARG_POINTER_REGNUM
, argno
, ptr_type_node
,
1325 if (DECL_STATIC_CHAIN (decl
) || cfun
->machine
->has_chain
)
1326 write_arg_type (s
, STATIC_CHAIN_REGNUM
,
1327 DECL_STATIC_CHAIN (decl
) ? argno
: -1, ptr_type_node
,
1330 fprintf (file
, "%s", s
.str().c_str());
1332 /* Usually 'crtl->is_leaf' is computed during register allocator
1333 initialization (which is not done on NVPTX) or for pressure-sensitive
1334 optimizations. Initialize it here, except if already set. */
1336 crtl
->is_leaf
= leaf_function_p ();
1338 HOST_WIDE_INT sz
= get_frame_size ();
1339 bool need_frameptr
= sz
|| cfun
->machine
->has_chain
;
1340 int alignment
= crtl
->stack_alignment_needed
/ BITS_PER_UNIT
;
1341 if (!TARGET_SOFT_STACK
)
1343 /* Declare a local var for outgoing varargs. */
1344 if (cfun
->machine
->has_varadic
)
1345 init_frame (file
, STACK_POINTER_REGNUM
,
1346 UNITS_PER_WORD
, crtl
->outgoing_args_size
);
1348 /* Declare a local variable for the frame. Force its size to be
1349 DImode-compatible. */
1351 init_frame (file
, FRAME_POINTER_REGNUM
, alignment
,
1352 ROUND_UP (sz
, GET_MODE_SIZE (DImode
)));
1354 else if (need_frameptr
|| cfun
->machine
->has_varadic
|| cfun
->calls_alloca
1355 || (cfun
->machine
->has_simtreg
&& !crtl
->is_leaf
))
1356 init_softstack_frame (file
, alignment
, sz
);
1358 if (cfun
->machine
->has_simtreg
)
1360 unsigned HOST_WIDE_INT
&simtsz
= cfun
->machine
->simt_stack_size
;
1361 unsigned HOST_WIDE_INT
&align
= cfun
->machine
->simt_stack_align
;
1362 align
= MAX (align
, GET_MODE_SIZE (DImode
));
1363 if (!crtl
->is_leaf
|| cfun
->calls_alloca
)
1364 simtsz
= HOST_WIDE_INT_M1U
;
1365 if (simtsz
== HOST_WIDE_INT_M1U
)
1366 simtsz
= nvptx_softstack_size
;
1367 if (cfun
->machine
->has_softstack
)
1368 simtsz
+= POINTER_SIZE
/ 8;
1369 simtsz
= ROUND_UP (simtsz
, GET_MODE_SIZE (DImode
));
1370 if (align
> GET_MODE_SIZE (DImode
))
1371 simtsz
+= align
- GET_MODE_SIZE (DImode
);
1373 fprintf (file
, "\t.local.align 8 .b8 %%simtstack_ar["
1374 HOST_WIDE_INT_PRINT_DEC
"];\n", simtsz
);
1377 /* Restore the vector reduction partition register, if necessary.
1378 FIXME: Find out when and why this is necessary, and fix it. */
1379 if (cfun
->machine
->red_partition
)
1380 regno_reg_rtx
[REGNO (cfun
->machine
->red_partition
)]
1381 = cfun
->machine
->red_partition
;
1383 /* Declare the pseudos we have as ptx registers. */
1384 int maxregs
= max_reg_num ();
1385 for (int i
= LAST_VIRTUAL_REGISTER
+ 1; i
< maxregs
; i
++)
1387 if (regno_reg_rtx
[i
] != const0_rtx
)
1389 machine_mode mode
= PSEUDO_REGNO_MODE (i
);
1390 machine_mode split
= maybe_split_mode (mode
);
1392 if (split_mode_p (mode
))
1394 fprintf (file
, "\t.reg%s ", nvptx_ptx_type_from_mode (mode
, true));
1395 output_reg (file
, i
, split
, -2);
1396 fprintf (file
, ";\n");
1400 /* Emit axis predicates. */
1401 if (cfun
->machine
->axis_predicate
[0])
1402 nvptx_init_axis_predicate (file
,
1403 REGNO (cfun
->machine
->axis_predicate
[0]), "y");
1404 if (cfun
->machine
->axis_predicate
[1])
1405 nvptx_init_axis_predicate (file
,
1406 REGNO (cfun
->machine
->axis_predicate
[1]), "x");
1407 if (cfun
->machine
->unisimt_predicate
1408 || (cfun
->machine
->has_simtreg
&& !crtl
->is_leaf
))
1409 nvptx_init_unisimt_predicate (file
);
1410 if (cfun
->machine
->bcast_partition
|| cfun
->machine
->sync_bar
)
1411 nvptx_init_oacc_workers (file
);
1414 /* Output code for switching uniform-simt state. ENTERING indicates whether
1415 we are entering or leaving non-uniform execution region. */
1418 nvptx_output_unisimt_switch (FILE *file
, bool entering
)
1420 if (crtl
->is_leaf
&& !cfun
->machine
->unisimt_predicate
)
1422 fprintf (file
, "\t{\n");
1423 fprintf (file
, "\t\t.reg.u32 %%ustmp2;\n");
1424 fprintf (file
, "\t\tmov.u32 %%ustmp2, %d;\n", entering
? -1 : 0);
1427 int loc
= REGNO (cfun
->machine
->unisimt_location
);
1428 fprintf (file
, "\t\tst.shared.u32 [%%r%d], %%ustmp2;\n", loc
);
1430 if (cfun
->machine
->unisimt_predicate
)
1432 int master
= REGNO (cfun
->machine
->unisimt_master
);
1433 int pred
= REGNO (cfun
->machine
->unisimt_predicate
);
1434 fprintf (file
, "\t\tmov.u32 %%ustmp2, %%laneid;\n");
1435 fprintf (file
, "\t\tmov.u32 %%r%d, %s;\n",
1436 master
, entering
? "%ustmp2" : "0");
1437 fprintf (file
, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp2;\n", pred
, master
);
1439 fprintf (file
, "\t}\n");
1442 /* Output code for allocating per-lane storage and switching soft-stack pointer.
1443 ENTERING indicates whether we are entering or leaving non-uniform execution.
1444 PTR is the register pointing to allocated storage, it is assigned to on
1445 entering and used to restore state on leaving. SIZE and ALIGN are used only
1449 nvptx_output_softstack_switch (FILE *file
, bool entering
,
1450 rtx ptr
, rtx size
, rtx align
)
1452 gcc_assert (REG_P (ptr
) && !HARD_REGISTER_P (ptr
));
1453 if (crtl
->is_leaf
&& !cfun
->machine
->simt_stack_size
)
1455 int bits
= POINTER_SIZE
, regno
= REGNO (ptr
);
1456 fprintf (file
, "\t{\n");
1459 fprintf (file
, "\t\tcvta.local.u%d %%r%d, %%simtstack_ar + "
1460 HOST_WIDE_INT_PRINT_DEC
";\n", bits
, regno
,
1461 cfun
->machine
->simt_stack_size
);
1462 fprintf (file
, "\t\tsub.u%d %%r%d, %%r%d, ", bits
, regno
, regno
);
1463 if (CONST_INT_P (size
))
1464 fprintf (file
, HOST_WIDE_INT_PRINT_DEC
,
1465 ROUND_UP (UINTVAL (size
), GET_MODE_SIZE (DImode
)));
1467 output_reg (file
, REGNO (size
), VOIDmode
);
1468 fputs (";\n", file
);
1469 if (!CONST_INT_P (size
) || UINTVAL (align
) > GET_MODE_SIZE (DImode
))
1471 "\t\tand.u%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC
";\n",
1472 bits
, regno
, regno
, UINTVAL (align
));
1474 if (cfun
->machine
->has_softstack
)
1476 const char *reg_stack
= reg_names
[STACK_POINTER_REGNUM
];
1479 fprintf (file
, "\t\tst.u%d [%%r%d + -%d], %s;\n",
1480 bits
, regno
, bits
/ 8, reg_stack
);
1481 fprintf (file
, "\t\tsub.u%d %s, %%r%d, %d;\n",
1482 bits
, reg_stack
, regno
, bits
/ 8);
1486 fprintf (file
, "\t\tld.u%d %s, [%%r%d + -%d];\n",
1487 bits
, reg_stack
, regno
, bits
/ 8);
1489 nvptx_output_set_softstack (REGNO (stack_pointer_rtx
));
1491 fprintf (file
, "\t}\n");
1494 /* Output code to enter non-uniform execution region. DEST is a register
1495 to hold a per-lane allocation given by SIZE and ALIGN. */
1498 nvptx_output_simt_enter (rtx dest
, rtx size
, rtx align
)
1500 nvptx_output_unisimt_switch (asm_out_file
, true);
1501 nvptx_output_softstack_switch (asm_out_file
, true, dest
, size
, align
);
1505 /* Output code to leave non-uniform execution region. SRC is the register
1506 holding per-lane storage previously allocated by omp_simt_enter insn. */
1509 nvptx_output_simt_exit (rtx src
)
1511 nvptx_output_unisimt_switch (asm_out_file
, false);
1512 nvptx_output_softstack_switch (asm_out_file
, false, src
, NULL_RTX
, NULL_RTX
);
1516 /* Output instruction that sets soft stack pointer in shared memory to the
1517 value in register given by SRC_REGNO. */
1520 nvptx_output_set_softstack (unsigned src_regno
)
1522 if (cfun
->machine
->has_softstack
&& !crtl
->is_leaf
)
1524 fprintf (asm_out_file
, "\tst.shared.u%d\t[%s], ",
1525 POINTER_SIZE
, reg_names
[SOFTSTACK_SLOT_REGNUM
]);
1526 output_reg (asm_out_file
, src_regno
, VOIDmode
);
1527 fprintf (asm_out_file
, ";\n");
1531 /* Output a return instruction. Also copy the return value to its outgoing
1535 nvptx_output_return (void)
1537 machine_mode mode
= (machine_mode
)cfun
->machine
->return_mode
;
1539 if (mode
!= VOIDmode
)
1540 fprintf (asm_out_file
, "\tst.param%s\t[%s_out], %s;\n",
1541 nvptx_ptx_type_from_mode (mode
, false),
1542 reg_names
[NVPTX_RETURN_REGNUM
],
1543 reg_names
[NVPTX_RETURN_REGNUM
]);
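/* For instance, a function whose return_mode is SImode emits roughly
   "st.param.u32 [%value_out], %value;" here, assuming the return register
   prints as %value.  */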
1548 /* Terminate a function by writing a closing brace to FILE. */
1551 nvptx_function_end (FILE *file
)
1553 fprintf (file
, "}\n");
1556 /* Decide whether we can make a sibling call to a function. For ptx, we
1560 nvptx_function_ok_for_sibcall (tree
, tree
)
1565 /* Return Dynamic ReAlignment Pointer RTX. For PTX there isn't any. */
1568 nvptx_get_drap_rtx (void)
1570 if (TARGET_SOFT_STACK
&& stack_realign_drap
)
1571 return arg_pointer_rtx
;
1575 /* Implement the TARGET_CALL_ARGS hook. Record information about one
1576 argument to the next call. */
1579 nvptx_call_args (rtx arg
, tree fntype
)
1581 if (!cfun
->machine
->doing_call
)
1583 cfun
->machine
->doing_call
= true;
1584 cfun
->machine
->is_varadic
= false;
1585 cfun
->machine
->num_args
= 0;
1587 if (fntype
&& stdarg_p (fntype
))
1589 cfun
->machine
->is_varadic
= true;
1590 cfun
->machine
->has_varadic
= true;
1591 cfun
->machine
->num_args
++;
1595 if (REG_P (arg
) && arg
!= pc_rtx
)
1597 cfun
->machine
->num_args
++;
1598 cfun
->machine
->call_args
= alloc_EXPR_LIST (VOIDmode
, arg
,
1599 cfun
->machine
->call_args
);
1603 /* Implement the corresponding END_CALL_ARGS hook. Clear and free the
1604 information we recorded. */
1607 nvptx_end_call_args (void)
1609 cfun
->machine
->doing_call
= false;
1610 free_EXPR_LIST_list (&cfun
->machine
->call_args
);
1613 /* Emit the sequence for a call to ADDRESS, setting RETVAL. Keep
1614 track of whether calls involving static chains or varargs were seen
1615 in the current function.
1616 For libcalls, maintain a hash table of decls we have seen, and
1617 record a function decl for later when encountering a new one. */
1620 nvptx_expand_call (rtx retval
, rtx address
)
1622 rtx callee
= XEXP (address
, 0);
1623 rtx varargs
= NULL_RTX
;
1624 unsigned parallel
= 0;
1626 if (!call_insn_operand (callee
, Pmode
))
1628 callee
= force_reg (Pmode
, callee
);
1629 address
= change_address (address
, QImode
, callee
);
1632 if (GET_CODE (callee
) == SYMBOL_REF
)
1634 tree decl
= SYMBOL_REF_DECL (callee
);
1635 if (decl
!= NULL_TREE
)
1637 if (DECL_STATIC_CHAIN (decl
))
1638 cfun
->machine
->has_chain
= true;
1640 tree attr
= oacc_get_fn_attrib (decl
);
1643 tree dims
= TREE_VALUE (attr
);
1645 parallel
= GOMP_DIM_MASK (GOMP_DIM_MAX
) - 1;
1646 for (int ix
= 0; ix
!= GOMP_DIM_MAX
; ix
++)
1648 if (TREE_PURPOSE (dims
)
1649 && !integer_zerop (TREE_PURPOSE (dims
)))
1651 /* Not on this axis. */
1652 parallel
^= GOMP_DIM_MASK (ix
);
1653 dims
= TREE_CHAIN (dims
);
1659 unsigned nargs
= cfun
->machine
->num_args
;
1660 if (cfun
->machine
->is_varadic
)
1662 varargs
= gen_reg_rtx (Pmode
);
1663 emit_move_insn (varargs
, stack_pointer_rtx
);
1666 rtvec vec
= rtvec_alloc (nargs
+ 1);
1667 rtx pat
= gen_rtx_PARALLEL (VOIDmode
, vec
);
1670 rtx call
= gen_rtx_CALL (VOIDmode
, address
, const0_rtx
);
1671 rtx tmp_retval
= retval
;
1674 if (!nvptx_register_operand (retval
, GET_MODE (retval
)))
1675 tmp_retval
= gen_reg_rtx (GET_MODE (retval
));
1676 call
= gen_rtx_SET (tmp_retval
, call
);
1678 XVECEXP (pat
, 0, vec_pos
++) = call
;
1680 /* Construct the call insn, including a USE for each argument pseudo
1681 register. These will be used when printing the insn. */
1682 for (rtx arg
= cfun
->machine
->call_args
; arg
; arg
= XEXP (arg
, 1))
1683 XVECEXP (pat
, 0, vec_pos
++) = gen_rtx_USE (VOIDmode
, XEXP (arg
, 0));
1686 XVECEXP (pat
, 0, vec_pos
++) = gen_rtx_USE (VOIDmode
, varargs
);
1688 gcc_assert (vec_pos
= XVECLEN (pat
, 0));
1690 nvptx_emit_forking (parallel
, true);
1691 emit_call_insn (pat
);
1692 nvptx_emit_joining (parallel
, true);
1694 if (tmp_retval
!= retval
)
1695 emit_move_insn (retval
, tmp_retval
);
1698 /* Emit a comparison COMPARE, and return the new test to be used in the
1702 nvptx_expand_compare (rtx compare
)
1704 rtx pred
= gen_reg_rtx (BImode
);
1705 rtx cmp
= gen_rtx_fmt_ee (GET_CODE (compare
), BImode
,
1706 XEXP (compare
, 0), XEXP (compare
, 1));
1707 emit_insn (gen_rtx_SET (pred
, cmp
));
1708 return gen_rtx_NE (BImode
, pred
, const0_rtx
);
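/* For example, a greater-than comparison is emitted as a BImode predicate
   register P set from (gt x y), and the test handed back to the caller is
   (ne P 0).  */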
1711 /* Expand the oacc fork & join primitive into ptx-required unspecs. */
1714 nvptx_expand_oacc_fork (unsigned mode
)
1716 nvptx_emit_forking (GOMP_DIM_MASK (mode
), false);
1720 nvptx_expand_oacc_join (unsigned mode
)
1722 nvptx_emit_joining (GOMP_DIM_MASK (mode
), false);
1725 /* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
1729 nvptx_gen_unpack (rtx dst0
, rtx dst1
, rtx src
)
1733 switch (GET_MODE (src
))
1736 res
= gen_unpackdisi2 (dst0
, dst1
, src
);
1739 res
= gen_unpackdfsi2 (dst0
, dst1
, src
);
1741 default: gcc_unreachable ();
1746 /* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
1750 nvptx_gen_pack (rtx dst
, rtx src0
, rtx src1
)
1754 switch (GET_MODE (dst
))
1757 res
= gen_packsidi2 (dst
, src0
, src1
);
1760 res
= gen_packsidf2 (dst
, src0
, src1
);
1762 default: gcc_unreachable ();
1767 /* Generate an instruction or sequence to broadcast register REG
1768 across the vectors of a single warp. */
1771 nvptx_gen_shuffle (rtx dst
, rtx src
, rtx idx
, nvptx_shuffle_kind kind
)
1775 switch (GET_MODE (dst
))
1778 res
= gen_nvptx_shufflesi (dst
, src
, idx
, GEN_INT (kind
));
1781 res
= gen_nvptx_shufflesf (dst
, src
, idx
, GEN_INT (kind
));
1786 rtx tmp0
= gen_reg_rtx (SImode
);
1787 rtx tmp1
= gen_reg_rtx (SImode
);
1790 emit_insn (nvptx_gen_unpack (tmp0
, tmp1
, src
));
1791 emit_insn (nvptx_gen_shuffle (tmp0
, tmp0
, idx
, kind
));
1792 emit_insn (nvptx_gen_shuffle (tmp1
, tmp1
, idx
, kind
));
1793 emit_insn (nvptx_gen_pack (dst
, tmp0
, tmp1
));
1800 rtx tmp
= gen_reg_rtx (SImode
);
1803 emit_insn (gen_sel_truesi (tmp
, src
, GEN_INT (1), const0_rtx
));
1804 emit_insn (nvptx_gen_shuffle (tmp
, tmp
, idx
, kind
));
1805 emit_insn (gen_rtx_SET (dst
, gen_rtx_NE (BImode
, tmp
, const0_rtx
)));
1813 rtx tmp
= gen_reg_rtx (SImode
);
1816 emit_insn (gen_rtx_SET (tmp
, gen_rtx_fmt_e (ZERO_EXTEND
, SImode
, src
)));
1817 emit_insn (nvptx_gen_shuffle (tmp
, tmp
, idx
, kind
));
1818 emit_insn (gen_rtx_SET (dst
, gen_rtx_fmt_e (TRUNCATE
, GET_MODE (dst
),
1831 /* Generate an instruction or sequence to broadcast register REG
1832 across the vectors of a single warp. */
1835 nvptx_gen_warp_bcast (rtx reg
)
1837 return nvptx_gen_shuffle (reg
, reg
, const0_rtx
, SHUFFLE_IDX
);
1840 /* Structure used when generating a worker-level spill or fill. */
1842 struct broadcast_data_t
1844 rtx base
; /* Register holding base addr of buffer. */
1845 rtx ptr
; /* Iteration var, if needed. */
1846 unsigned offset
; /* Offset into worker buffer. */
1849 /* Direction of the spill/fill and looping setup/teardown indicator. */
1855 PM_loop_begin
= 1 << 2,
1856 PM_loop_end
= 1 << 3,
1858 PM_read_write
= PM_read
| PM_write
1861 /* Generate instruction(s) to spill or fill register REG to/from the
1862 worker broadcast array. PM indicates what is to be done, REP
1863 how many loop iterations will be executed (0 for not a loop). */
1866 nvptx_gen_shared_bcast (rtx reg
, propagate_mask pm
, unsigned rep
,
1867 broadcast_data_t
*data
, bool vector
)
1870 machine_mode mode
= GET_MODE (reg
);
1876 rtx tmp
= gen_reg_rtx (SImode
);
1880 emit_insn (gen_sel_truesi (tmp
, reg
, GEN_INT (1), const0_rtx
));
1881 emit_insn (nvptx_gen_shared_bcast (tmp
, pm
, rep
, data
, vector
));
1883 emit_insn (gen_rtx_SET (reg
, gen_rtx_NE (BImode
, tmp
, const0_rtx
)));
1891 rtx addr
= data
->ptr
;
1895 unsigned align
= GET_MODE_ALIGNMENT (mode
) / BITS_PER_UNIT
;
1897 oacc_bcast_align
= MAX (oacc_bcast_align
, align
);
1898 data
->offset
= ROUND_UP (data
->offset
, align
);
1900 gcc_assert (data
->base
!= NULL
);
1902 addr
= gen_rtx_PLUS (Pmode
, addr
, GEN_INT (data
->offset
));
1905 addr
= gen_rtx_MEM (mode
, addr
);
1907 res
= gen_rtx_SET (addr
, reg
);
1908 else if (pm
== PM_write
)
1909 res
= gen_rtx_SET (reg
, addr
);
1915 /* We're using a ptr, increment it. */
1919 emit_insn (gen_adddi3 (data
->ptr
, data
->ptr
,
1920 GEN_INT (GET_MODE_SIZE (GET_MODE (reg
)))));
1926 data
->offset
+= rep
* GET_MODE_SIZE (GET_MODE (reg
));
1933 /* Returns true if X is a valid address for use in a memory reference. */
1936 nvptx_legitimate_address_p (machine_mode
, rtx x
, bool)
1938 enum rtx_code code
= GET_CODE (x
);
1946 if (REG_P (XEXP (x
, 0)) && CONST_INT_P (XEXP (x
, 1)))
1960 /* Machinery to output constant initializers. When beginning an
1961 initializer, we decide on a fragment size (which is visible in ptx
1962 in the type used), and then all initializer data is buffered until
1963 a fragment is filled and ready to be written out. */
1967 unsigned HOST_WIDE_INT mask
; /* Mask for storing fragment. */
1968 unsigned HOST_WIDE_INT val
; /* Current fragment value. */
1969 unsigned HOST_WIDE_INT remaining
; /* Remaining bytes to be written
1971 unsigned size
; /* Fragment size to accumulate. */
1972 unsigned offset
; /* Offset within current fragment. */
1973 bool started
; /* Whether we've output any initializer. */
1976 /* The current fragment is full, write it out. SYM may provide a
1977 symbolic reference we should output, in which case the fragment
1978 value is the addend. */
1981 output_init_frag (rtx sym
)
1983 fprintf (asm_out_file
, init_frag
.started
? ", " : " = { ");
1984 unsigned HOST_WIDE_INT val
= init_frag
.val
;
1986 init_frag
.started
= true;
1988 init_frag
.offset
= 0;
1989 init_frag
.remaining
--;
1993 bool function
= (SYMBOL_REF_DECL (sym
)
1994 && (TREE_CODE (SYMBOL_REF_DECL (sym
)) == FUNCTION_DECL
));
1996 fprintf (asm_out_file
, "generic(");
1997 output_address (VOIDmode
, sym
);
1999 fprintf (asm_out_file
, ")");
2001 fprintf (asm_out_file
, " + ");
2005 fprintf (asm_out_file
, HOST_WIDE_INT_PRINT_DEC
, val
);
2008 /* Add value VAL of size SIZE to the data we're emitting, and keep
2009 writing out chunks as they fill up. */
2012 nvptx_assemble_value (unsigned HOST_WIDE_INT val
, unsigned size
)
2014 val
&= ((unsigned HOST_WIDE_INT
)2 << (size
* BITS_PER_UNIT
- 1)) - 1;
2016 for (unsigned part
= 0; size
; size
-= part
)
2018 val
>>= part
* BITS_PER_UNIT
;
2019 part
= init_frag
.size
- init_frag
.offset
;
2020 part
= MIN (part
, size
);
2022 unsigned HOST_WIDE_INT partial
2023 = val
<< (init_frag
.offset
* BITS_PER_UNIT
);
2024 init_frag
.val
|= partial
& init_frag
.mask
;
2025 init_frag
.offset
+= part
;
2027 if (init_frag
.offset
== init_frag
.size
)
2028 output_init_frag (NULL
);
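/* For example, with a 4-byte fragment size, two consecutive 2-byte values
   are packed into one fragment (the second shifted left by 16 bits), and
   the combined value is only printed by output_init_frag once the fragment
   fills up or the initializer ends.  */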
2032 /* Target hook for assembling integer object X of size SIZE. */
2035 nvptx_assemble_integer (rtx x
, unsigned int size
, int ARG_UNUSED (aligned_p
))
2037 HOST_WIDE_INT val
= 0;
2039 switch (GET_CODE (x
))
2042 /* Let the generic machinery figure it out, usually for a
2047 nvptx_assemble_value (INTVAL (x
), size
);
2052 gcc_assert (GET_CODE (x
) == PLUS
);
2053 val
= INTVAL (XEXP (x
, 1));
2055 gcc_assert (GET_CODE (x
) == SYMBOL_REF
);
2059 gcc_assert (size
== init_frag
.size
);
2060 if (init_frag
.offset
)
2061 sorry ("cannot emit unaligned pointers in ptx assembly");
2063 nvptx_maybe_record_fnsym (x
);
2064 init_frag
.val
= val
;
2065 output_init_frag (x
);
2072 /* Output SIZE zero bytes. We ignore the FILE argument since the
2073 functions we're calling to perform the output just use
2077 nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size
)
2079 /* Finish the current fragment, if it's started. */
2080 if (init_frag
.offset
)
2082 unsigned part
= init_frag
.size
- init_frag
.offset
;
2083 part
= MIN (part
, (unsigned)size
);
2085 nvptx_assemble_value (0, part
);
2088 /* If this skip doesn't terminate the initializer, write as many
2089 remaining pieces as possible directly. */
2090 if (size
< init_frag
.remaining
* init_frag
.size
)
2092 while (size
>= init_frag
.size
)
2094 size
-= init_frag
.size
;
2095 output_init_frag (NULL_RTX
);
2098 nvptx_assemble_value (0, size
);
2102 /* Output a string STR with length SIZE. As in nvptx_output_skip we
2103 ignore the FILE arg. */
2106 nvptx_output_ascii (FILE *, const char *str
, unsigned HOST_WIDE_INT size
)
2108 for (unsigned HOST_WIDE_INT i
= 0; i
< size
; i
++)
2109 nvptx_assemble_value (str
[i
], 1);
2112 /* Return true if TYPE is a record type where the last field is an array without
2116 flexible_array_member_type_p (const_tree type
)
2118 if (TREE_CODE (type
) != RECORD_TYPE
)
2121 const_tree last_field
= NULL_TREE
;
2122 for (const_tree f
= TYPE_FIELDS (type
); f
; f
= TREE_CHAIN (f
))
2128 const_tree last_field_type
= TREE_TYPE (last_field
);
2129 if (TREE_CODE (last_field_type
) != ARRAY_TYPE
)
2132 return (! TYPE_DOMAIN (last_field_type
)
2133 || ! TYPE_MAX_VALUE (TYPE_DOMAIN (last_field_type
)));
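/* For example, "struct buf { int len; char data[]; }" ends in an array with
   no upper bound and matches, whereas a struct whose last field is
   "char data[4]" does not.  */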
2136 /* Emit a PTX variable decl and prepare for emission of its
2137 initializer. NAME is the symbol name and SECTION the PTX data
2138 area. The type is TYPE, object size SIZE and alignment is ALIGN.
2139 The caller has already emitted any indentation and linkage
2140 specifier. It is responsible for any initializer, terminating ;
2141 and newline. SIZE is in bytes, ALIGN is in bits -- confusingly
2142 this is the opposite way round that PTX wants them! */
2145 nvptx_assemble_decl_begin (FILE *file
, const char *name
, const char *section
,
2146 const_tree type
, HOST_WIDE_INT size
, unsigned align
,
2147 bool undefined
= false)
2149 bool atype
= (TREE_CODE (type
) == ARRAY_TYPE
)
2150 && (TYPE_DOMAIN (type
) == NULL_TREE
);
2152 if (undefined
&& flexible_array_member_type_p (type
))
2158 while (TREE_CODE (type
) == ARRAY_TYPE
)
2159 type
= TREE_TYPE (type
);
2161 if (TREE_CODE (type
) == VECTOR_TYPE
2162 || TREE_CODE (type
) == COMPLEX_TYPE
)
2163 /* Neither vector nor complex types can contain the other. */
2164 type
= TREE_TYPE (type
);
2166 unsigned elt_size
= int_size_in_bytes (type
);
2168 /* Largest mode we're prepared to accept. For BLKmode types we
2169 don't know if it'll contain pointer constants, so have to choose
2170 pointer size, otherwise we can choose DImode. */
2171 machine_mode elt_mode
= TYPE_MODE (type
) == BLKmode
? Pmode
: DImode
;
2173 elt_size
|= GET_MODE_SIZE (elt_mode
);
2174 elt_size
&= -elt_size
; /* Extract LSB set. */
2176 init_frag
.size
= elt_size
;
2177 /* Avoid undefined shift behavior by using '2'. */
2178 init_frag
.mask
= ((unsigned HOST_WIDE_INT
)2
2179 << (elt_size
* BITS_PER_UNIT
- 1)) - 1;
2181 init_frag
.offset
= 0;
2182 init_frag
.started
= false;
2183 /* Size might not be a multiple of elt size, if there's an
2184 initialized trailing struct array with smaller type than
2186 init_frag
.remaining
= (size
+ elt_size
- 1) / elt_size
;
2188 fprintf (file
, "%s .align %d .u%d ",
2189 section
, align
/ BITS_PER_UNIT
,
2190 elt_size
* BITS_PER_UNIT
);
2191 assemble_name (file
, name
);
2194 /* We make everything an array, to simplify any initialization
2196 fprintf (file
, "[" HOST_WIDE_INT_PRINT_DEC
"]", init_frag
.remaining
);
2198 fprintf (file
, "[]");
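/* For illustration only (the exact strings depend on the data area and
   alignment passed in): for a 10-element int array in the global data
   area the code above prints a declaration along the lines of

     .global .align 4 .u32 arr[10]

   i.e. the section string, the alignment in bytes, the element size in
   bits, the name, and finally the element count -- every object is
   emitted as an array of integer words, which the initializer machinery
   above then fills.  */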
/* Called when the initializer for a decl has been completely output through
   combinations of the three functions above.  */

nvptx_assemble_decl_end (void)
if (init_frag.offset)
/* This can happen with a packed struct with trailing array member.  */
nvptx_assemble_value (0, init_frag.size - init_frag.offset);
fprintf (asm_out_file, init_frag.started ? " };\n" : ";\n");
/* Output an uninitialized common or file-scope variable.  */

nvptx_output_aligned_decl (FILE *file, const char *name,
			   const_tree decl, HOST_WIDE_INT size, unsigned align)
write_var_marker (file, true, TREE_PUBLIC (decl), name);
/* If this is public, it is common.  The nearest thing we have to
   common is weak.  */
fprintf (file, "\t%s", TREE_PUBLIC (decl) ? ".weak " : "");
nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
			   TREE_TYPE (decl), size, align);
nvptx_assemble_decl_end ();
/* Implement TARGET_ASM_DECLARE_CONSTANT_NAME.  Begin the process of
   writing a constant variable EXP with NAME and SIZE and its
   initializer to FILE.  */

nvptx_asm_declare_constant_name (FILE *file, const char *name,
				 const_tree exp, HOST_WIDE_INT obj_size)
write_var_marker (file, true, false, name);
fprintf (file, "\t");
tree type = TREE_TYPE (exp);
nvptx_assemble_decl_begin (file, name, ".const", type, obj_size,
/* Implement the ASM_DECLARE_OBJECT_NAME macro.  Used to start writing
   a variable DECL with NAME to FILE.  */

nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
write_var_marker (file, true, TREE_PUBLIC (decl), name);
fprintf (file, "\t%s", (!TREE_PUBLIC (decl) ? ""
			: DECL_WEAK (decl) ? ".weak " : ".visible "));
tree type = TREE_TYPE (decl);
HOST_WIDE_INT obj_size = tree_to_shwi (DECL_SIZE_UNIT (decl));
nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
			   type, obj_size, DECL_ALIGN (decl));
/* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing.  */

nvptx_globalize_label (FILE *, const char *)
/* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL.  Write an extern
   declaration only for variable DECL with NAME to FILE.  */

nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
/* The middle end can place constant pool decls into the varpool as
   undefined.  Until that is fixed, catch the problem here.  */
if (DECL_IN_CONSTANT_POOL (decl))
/* We support weak definitions, and hence have the right
   ASM_WEAKEN_DECL definition.  Diagnose the problem here.  */
if (DECL_WEAK (decl))
error_at (DECL_SOURCE_LOCATION (decl),
	  "PTX does not support weak declarations"
	  " (only weak definitions)");
write_var_marker (file, false, TREE_PUBLIC (decl), name);
fprintf (file, "\t.extern ");
tree size = DECL_SIZE_UNIT (decl);
nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
			   TREE_TYPE (decl), size ? tree_to_shwi (size) : 0,
			   DECL_ALIGN (decl), true);
nvptx_assemble_decl_end ();
/* Output a pattern for a move instruction.  */

nvptx_output_mov_insn (rtx dst, rtx src)
machine_mode dst_mode = GET_MODE (dst);
machine_mode dst_inner = (GET_CODE (dst) == SUBREG
			  ? GET_MODE (XEXP (dst, 0)) : dst_mode);
machine_mode src_inner = (GET_CODE (src) == SUBREG
			  ? GET_MODE (XEXP (src, 0)) : dst_mode);
if (GET_CODE (sym) == CONST)
sym = XEXP (XEXP (sym, 0), 0);
if (SYMBOL_REF_P (sym))
if (SYMBOL_DATA_AREA (sym) != DATA_AREA_GENERIC)
return "%.\tcvta%D1%t0\t%0, %1;";
nvptx_maybe_record_fnsym (sym);
if (src_inner == dst_inner)
return "%.\tmov%t0\t%0, %1;";
if (CONSTANT_P (src))
return (GET_MODE_CLASS (dst_inner) == MODE_INT
	&& GET_MODE_CLASS (src_inner) != MODE_FLOAT
	? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");
if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner))
if (GET_MODE_BITSIZE (dst_mode) == 128
    && GET_MODE_BITSIZE (GET_MODE (src)) == 128)
/* mov.b128 is not supported.  */
if (dst_inner == V2DImode && src_inner == TImode)
return "%.\tmov.u64\t%0.x, %L1;\n\t%.\tmov.u64\t%0.y, %H1;";
else if (dst_inner == TImode && src_inner == V2DImode)
return "%.\tmov.u64\t%L0, %1.x;\n\t%.\tmov.u64\t%H0, %1.y;";
return "%.\tmov.b%T0\t%0, %1;";
return "%.\tcvt%t0%t1\t%0, %1;";
static void nvptx_print_operand (FILE *, rtx, int);
/* Output INSN, which is a call to CALLEE with result RESULT.  For ptx, this
   involves writing .param declarations and in/out copies into them.  For
   indirect calls, also write the .callprototype.  */

nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
bool needs_tgt = register_operand (callee, Pmode);
rtx pat = PATTERN (insn);
if (GET_CODE (pat) == COND_EXEC)
pat = COND_EXEC_CODE (pat);
int arg_end = XVECLEN (pat, 0);
tree decl = NULL_TREE;
fprintf (asm_out_file, "\t{\n");
fprintf (asm_out_file, "\t\t.param%s %s_in;\n",
	 nvptx_ptx_type_from_mode (GET_MODE (result), false),
	 reg_names[NVPTX_RETURN_REGNUM]);
/* Ensure we have a ptx declaration in the output if necessary.  */
if (GET_CODE (callee) == SYMBOL_REF)
decl = SYMBOL_REF_DECL (callee);
    || (DECL_EXTERNAL (decl) && !TYPE_ARG_TYPES (TREE_TYPE (decl))))
nvptx_record_libfunc (callee, result, pat);
else if (DECL_EXTERNAL (decl))
nvptx_record_fndecl (decl);
ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
ASM_OUTPUT_LABEL (asm_out_file, buf);
std::stringstream s;
write_fn_proto_from_insn (s, NULL, result, pat);
fputs (s.str().c_str(), asm_out_file);
for (int argno = 1; argno < arg_end; argno++)
rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
machine_mode mode = GET_MODE (t);
const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
/* Mode splitting has already been done.  */
fprintf (asm_out_file, "\t\t.param%s %%out_arg%d;\n"
	 "\t\tst.param%s [%%out_arg%d], ",
	 ptx_type, argno, ptx_type, argno);
output_reg (asm_out_file, REGNO (t), VOIDmode);
fprintf (asm_out_file, ";\n");
/* The '.' stands for the call's predicate, if any.  */
nvptx_print_operand (asm_out_file, NULL_RTX, '.');
fprintf (asm_out_file, "\t\tcall ");
if (result != NULL_RTX)
fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]);
const char *name = get_fnname_from_decl (decl);
name = nvptx_name_replacement (name);
assemble_name (asm_out_file, name);
output_address (VOIDmode, callee);
const char *open = "(";
for (int argno = 1; argno < arg_end; argno++)
fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
if (decl && DECL_STATIC_CHAIN (decl))
fprintf (asm_out_file, ", %s%s", open, reg_names[STATIC_CHAIN_REGNUM]);
fprintf (asm_out_file, ")");
fprintf (asm_out_file, ", ");
assemble_name (asm_out_file, buf);
fprintf (asm_out_file, ";\n");
if (find_reg_note (insn, REG_NORETURN, NULL))
/* No-return functions confuse the PTX JIT, as it doesn't realize
   the flow control barrier they imply.  It can seg fault if it
   encounters what looks like an unexitable loop.  Emit a trailing
   trap and exit, which it does grok.  */
fprintf (asm_out_file, "\t\ttrap; // (noreturn)\n");
fprintf (asm_out_file, "\t\texit; // (noreturn)\n");
static char rval[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];
/* We must escape the '%' that starts RETURN_REGNUM.  */
sprintf (rval, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
	 reg_names[NVPTX_RETURN_REGNUM]);
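/* A hedged sketch of the output the routine above produces for a direct
   call with a return value and two SImode arguments (register numbers and
   the callee name are invented for illustration):

	{
		.param.u32 %value_in;
		.param.u32 %out_arg1;
		st.param.u32 [%out_arg1], %r23;
		.param.u32 %out_arg2;
		st.param.u32 [%out_arg2], %r24;
		call (%value_in), foo, (%out_arg1, %out_arg2);
		ld.param.u32 %r25, [%value_in];
	}
*/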
/* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P.  */

nvptx_print_operand_punct_valid_p (unsigned char c)
return c == '.' || c == '#';
/* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE.  */

nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
if (GET_CODE (x) == CONST)
switch (GET_CODE (x))
output_address (VOIDmode, XEXP (x, 0));
fprintf (file, "+");
output_address (VOIDmode, off);
output_addr_const (file, x);
gcc_assert (GET_CODE (x) != MEM);
nvptx_print_operand (file, x, 0);
/* Write assembly language output for the address ADDR to FILE.  */

nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
nvptx_print_address_operand (file, addr, mode);
/* Print an operand, X, to FILE, with an optional modifier in CODE.

   . -- print the predicate for the instruction or an empty string for an
	unconditional one
   # -- print a rounding mode for the instruction
   A -- print a data area for a MEM
   c -- print an opcode suffix for a comparison operator, including a type code
   D -- print a data area for a MEM operand
   S -- print a shuffle kind specified by CONST_INT
   t -- print a type opcode suffix, promoting QImode to 32 bits
   T -- print a type size in bits
   u -- print a type opcode suffix without promotions.  */

nvptx_print_operand (FILE *file, rtx x, int code)
x = current_insn_predicate;
if (GET_CODE (x) == EQ)
output_reg (file, REGNO (XEXP (x, 0)), VOIDmode);
else if (code == '#')
fputs (".rn", file);
enum rtx_code x_code = GET_CODE (x);
machine_mode mode = GET_MODE (x);
if (GET_CODE (x) == CONST)
if (GET_CODE (x) == PLUS)
if (GET_CODE (x) == SYMBOL_REF)
fputs (section_for_sym (x), file);
if (x_code == SUBREG)
machine_mode inner_mode = GET_MODE (SUBREG_REG (x));
if (VECTOR_MODE_P (inner_mode)
    && (GET_MODE_SIZE (mode)
	<= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
mode = GET_MODE_INNER (inner_mode);
else if (split_mode_p (inner_mode))
mode = maybe_split_mode (inner_mode);
fprintf (file, "%s", nvptx_ptx_type_from_mode (mode, code == 't'));
rtx inner_x = SUBREG_REG (x);
machine_mode inner_mode = GET_MODE (inner_x);
machine_mode split = maybe_split_mode (inner_mode);
output_reg (file, REGNO (inner_x), split,
	    ? GET_MODE_SIZE (inner_mode) / 2
nvptx_shuffle_kind kind = (nvptx_shuffle_kind) UINTVAL (x);
/* Same order as nvptx_shuffle_kind.  */
static const char *const kinds[] = {".up", ".down", ".bfly", ".idx"};
fputs (kinds[kind], file);
fprintf (file, "%d", GET_MODE_BITSIZE (mode));
fprintf (file, "@");
fprintf (file, "@!");
mode = GET_MODE (XEXP (x, 0));
fputs (".eq", file);
if (FLOAT_MODE_P (mode))
fputs (".neu", file);
fputs (".ne", file);
fputs (".le", file);
fputs (".ge", file);
fputs (".lt", file);
fputs (".gt", file);
fputs (".ne", file);
fputs (".equ", file);
fputs (".leu", file);
fputs (".geu", file);
fputs (".ltu", file);
fputs (".gtu", file);
fputs (".nan", file);
fputs (".num", file);
if (FLOAT_MODE_P (mode)
    || x_code == EQ || x_code == NE
    || x_code == GEU || x_code == GTU
    || x_code == LEU || x_code == LTU)
fputs (nvptx_ptx_type_from_mode (mode, true), file);
fprintf (file, ".s%d", GET_MODE_BITSIZE (mode));
rtx inner_x = SUBREG_REG (x);
machine_mode inner_mode = GET_MODE (inner_x);
machine_mode split = maybe_split_mode (inner_mode);
if (VECTOR_MODE_P (inner_mode)
    && (GET_MODE_SIZE (mode)
	<= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
output_reg (file, REGNO (inner_x), VOIDmode);
fprintf (file, ".%s", SUBREG_BYTE (x) == 0 ? "x" : "y");
else if (split_mode_p (inner_mode)
	 && (GET_MODE_SIZE (inner_mode) == GET_MODE_SIZE (mode)))
output_reg (file, REGNO (inner_x), split);
output_reg (file, REGNO (inner_x), split, SUBREG_BYTE (x));
output_reg (file, REGNO (x), maybe_split_mode (mode));
nvptx_print_address_operand (file, XEXP (x, 0), mode);
output_addr_const (file, x);
/* We could use output_addr_const, but that can print things like
   "x-8", which breaks ptxas.  Need to ensure it is output as ...  */
nvptx_print_address_operand (file, x, VOIDmode);
real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), mode);
vals[0] &= 0xffffffff;
vals[1] &= 0xffffffff;
fprintf (file, "0f%08lx", vals[0]);
fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
unsigned n = CONST_VECTOR_NUNITS (x);
fprintf (file, "{ ");
for (unsigned i = 0; i < n; ++i)
fprintf (file, ", ");
rtx elem = CONST_VECTOR_ELT (x, i);
output_addr_const (file, elem);
fprintf (file, " }");
output_addr_const (file, x);
/* Record replacement regs used to deal with subreg operands.  */

rtx replacement[MAX_RECOG_OPERANDS];

/* Allocate or reuse a replacement in R and return the rtx.  */

get_replacement (struct reg_replace *r)
if (r->n_allocated == r->n_in_use)
r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
return r->replacement[r->n_in_use++];
/* Clean up subreg operands.  In ptx assembly, everything is typed, and
   the presence of subregs would break the rules for most instructions.
   Replace them with a suitable new register of the right size, plus
   conversion copyin/copyout instructions.  */

nvptx_reorg_subreg (void)
struct reg_replace qiregs, hiregs, siregs, diregs;
rtx_insn *insn, *next;
qiregs.n_allocated = 0;
hiregs.n_allocated = 0;
siregs.n_allocated = 0;
diregs.n_allocated = 0;
qiregs.mode = QImode;
hiregs.mode = HImode;
siregs.mode = SImode;
diregs.mode = DImode;
for (insn = get_insns (); insn; insn = next)
next = NEXT_INSN (insn);
if (!NONDEBUG_INSN_P (insn)
    || asm_noperands (PATTERN (insn)) >= 0
    || GET_CODE (PATTERN (insn)) == USE
    || GET_CODE (PATTERN (insn)) == CLOBBER)
qiregs.n_in_use = 0;
hiregs.n_in_use = 0;
siregs.n_in_use = 0;
diregs.n_in_use = 0;
extract_insn (insn);
enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);
for (int i = 0; i < recog_data.n_operands; i++)
rtx op = recog_data.operand[i];
if (GET_CODE (op) != SUBREG)
rtx inner = SUBREG_REG (op);
machine_mode outer_mode = GET_MODE (op);
machine_mode inner_mode = GET_MODE (inner);
    && (GET_MODE_PRECISION (inner_mode)
	>= GET_MODE_PRECISION (outer_mode)))
gcc_assert (SCALAR_INT_MODE_P (outer_mode));
struct reg_replace *r = (outer_mode == QImode ? &qiregs
			 : outer_mode == HImode ? &hiregs
			 : outer_mode == SImode ? &siregs
rtx new_reg = get_replacement (r);
if (recog_data.operand_type[i] != OP_OUT)
if (GET_MODE_PRECISION (inner_mode)
    < GET_MODE_PRECISION (outer_mode))
rtx pat = gen_rtx_SET (new_reg,
		       gen_rtx_fmt_e (code, outer_mode, inner));
emit_insn_before (pat, insn);
if (recog_data.operand_type[i] != OP_IN)
if (GET_MODE_PRECISION (inner_mode)
    < GET_MODE_PRECISION (outer_mode))
rtx pat = gen_rtx_SET (inner,
		       gen_rtx_fmt_e (code, inner_mode, new_reg));
emit_insn_after (pat, insn);
validate_change (insn, recog_data.operand_loc[i], new_reg, false);
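/* Conceptually (a sketch, not emitted verbatim): an input operand such as
     (subreg:QI (reg:SI 100) 0)
   is replaced by a fresh QImode register, with a conversion copied in
   before the insn,
     (set (reg:QI 200) (truncate:QI (reg:SI 100)))
   while an output operand gets the inverse conversion copied out after
   the insn, so the instruction itself only ever sees plain registers of
   the mode it expects.  */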
/* Return a SImode "master lane index" register for uniform-simt, allocating on
   first use.  */

nvptx_get_unisimt_master ()
rtx &master = cfun->machine->unisimt_master;
return master ? master : master = gen_reg_rtx (SImode);

/* Return a BImode "predicate" register for uniform-simt, similar to above.  */

nvptx_get_unisimt_predicate ()
rtx &pred = cfun->machine->unisimt_predicate;
return pred ? pred : pred = gen_reg_rtx (BImode);
/* Return true if given call insn references one of the functions provided by
   the CUDA runtime: malloc, free, vprintf.  */

nvptx_call_insn_is_syscall_p (rtx_insn *insn)
rtx pat = PATTERN (insn);
gcc_checking_assert (GET_CODE (pat) == PARALLEL);
pat = XVECEXP (pat, 0, 0);
if (GET_CODE (pat) == SET)
pat = SET_SRC (pat);
gcc_checking_assert (GET_CODE (pat) == CALL
		     && GET_CODE (XEXP (pat, 0)) == MEM);
rtx addr = XEXP (XEXP (pat, 0), 0);
if (GET_CODE (addr) != SYMBOL_REF)
const char *name = XSTR (addr, 0);
/* Ordinary malloc/free are redirected to __nvptx_{malloc,free}, so only the
   references with forced assembler name refer to PTX syscalls.  For vprintf,
   accept both normal and forced-assembler-name references.  */
return (!strcmp (name, "vprintf") || !strcmp (name, "*vprintf")
	|| !strcmp (name, "*malloc")
	|| !strcmp (name, "*free"));
/* If SET subexpression of INSN sets a register, emit a shuffle instruction to
   propagate its value from lane MASTER to current lane.  */

nvptx_unisimt_handle_set (rtx set, rtx_insn *insn, rtx master)
if (GET_CODE (set) == SET && REG_P (reg = SET_DEST (set)))
emit_insn_after (nvptx_gen_shuffle (reg, reg, master, SHUFFLE_IDX), insn);

/* Adjust code for uniform-simt code generation variant by making atomics and
   "syscalls" conditionally executed, and inserting shuffle-based propagation
   for registers being set.  */

nvptx_reorg_uniform_simt ()
rtx_insn *insn, *next;
for (insn = get_insns (); insn; insn = next)
next = NEXT_INSN (insn);
if (!(CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn))
    && !(NONJUMP_INSN_P (insn)
	 && GET_CODE (PATTERN (insn)) == PARALLEL
	 && get_attr_atomic (insn)))
rtx pat = PATTERN (insn);
rtx master = nvptx_get_unisimt_master ();
for (int i = 0; i < XVECLEN (pat, 0); i++)
nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master);
rtx pred = nvptx_get_unisimt_predicate ();
pred = gen_rtx_NE (BImode, pred, const0_rtx);
pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat);
validate_change (insn, &PATTERN (insn), pat, false);
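/* Sketch of the resulting shape (illustrative only): an atomic or
   "syscall" pattern P becomes

     (cond_exec (ne %unisimt_pred (const_int 0)) P)

   so that only the master lane executes it, and each register set by P is
   followed by a shfl.idx broadcast from the master lane so the other lanes
   in the warp observe the same value.  */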
/* Offloading function attributes.  */

struct offload_attrs

/* Define entries for cfun->machine->axis_dim.  */

#define MACH_VECTOR_LENGTH 0
#define MACH_MAX_WORKERS 1

static void populate_offload_attrs (offload_attrs *oa);

init_axis_dim (void)
populate_offload_attrs (&oa);
if (oa.num_workers == 0)
max_workers = PTX_CTA_SIZE / oa.vector_length;
max_workers = oa.num_workers;
cfun->machine->axis_dim[MACH_VECTOR_LENGTH] = oa.vector_length;
cfun->machine->axis_dim[MACH_MAX_WORKERS] = max_workers;
cfun->machine->axis_dim_init_p = true;

static int ATTRIBUTE_UNUSED
nvptx_mach_max_workers ()
if (!cfun->machine->axis_dim_init_p)
return cfun->machine->axis_dim[MACH_MAX_WORKERS];

static int ATTRIBUTE_UNUSED
nvptx_mach_vector_length ()
if (!cfun->machine->axis_dim_init_p)
return cfun->machine->axis_dim[MACH_VECTOR_LENGTH];
/* Loop structure of the function.  The entire function is described as
   a NULL loop.  */

/* Parent parallel.  */
/* Next sibling parallel.  */
/* First child parallel.  */
/* Partitioning mask of the parallel.  */
/* Partitioning used within inner parallels.  */
unsigned inner_mask;

/* Location of parallel forked and join.  The forked is the first
   block in the parallel and the join is the first block after the
   parallel.  */
basic_block forked_block;
basic_block join_block;
rtx_insn *forked_insn;
rtx_insn *join_insn;
rtx_insn *fork_insn;
rtx_insn *joining_insn;

/* Basic blocks in this parallel, but not in child parallels.  The
   FORKED and JOINING blocks are in the partition.  The FORK and JOIN
   blocks are not.  */
auto_vec<basic_block> blocks;

parallel (parallel *parent, unsigned mode);

/* Constructor links the new parallel into its parent's chain of
   children.  */

parallel::parallel (parallel *parent_, unsigned mask_)
:parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
forked_block = join_block = 0;
forked_insn = join_insn = 0;
fork_insn = joining_insn = 0;
next = parent->inner;
parent->inner = this;

parallel::~parallel ()
/* Map of basic blocks to insns.  */
typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;

/* A tuple of an insn of interest and the BB in which it resides.  */
typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
typedef auto_vec<insn_bb_t> insn_bb_vec_t;

/* Split basic blocks such that each forked and join unspec is at the
   start of its basic block.  Thus afterwards each block will have a
   single partitioning mode.  We also do the same for return insns, as
   they are executed by every thread.  Return the partitioning mode of
   the function as a whole.  Populate MAP with head and tail blocks.
   We also clear the BB visited flag, which is used when finding
   partitions.  */

nvptx_split_blocks (bb_insn_map_t *map)
insn_bb_vec_t worklist;
/* Locate all the reorg instructions of interest.  */
FOR_ALL_BB_FN (block, cfun)
bool seen_insn = false;
/* Clear visited flag, for use by parallel locator.  */
block->flags &= ~BB_VISITED;
FOR_BB_INSNS (block, insn)
switch (recog_memoized (insn))
case CODE_FOR_nvptx_forked:
case CODE_FOR_nvptx_join:
case CODE_FOR_return:
/* We also need to split just before return insns, as
   that insn needs executing by all threads, but the
   block it is in probably does not.  */
/* We've found an instruction that must be at the start of
   a block, but isn't.  Add it to the worklist.  */
worklist.safe_push (insn_bb_t (insn, block));
/* It was already the first instruction.  Just add it to
   the map.  */
map->get_or_insert (block) = insn;
/* Split blocks on the worklist.  */
basic_block remap = 0;
for (ix = 0; worklist.iterate (ix, &elt); ix++)
if (remap != elt->second)
block = elt->second;
/* Split block before insn.  The insn is in the new block.  */
edge e = split_block (block, PREV_INSN (elt->first));
map->get_or_insert (block) = elt->first;
/* Return true if MASK contains parallelism that requires shared
   memory to broadcast.  */

nvptx_needs_shared_bcast (unsigned mask)
bool worker = mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);
bool large_vector = (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
  && nvptx_mach_vector_length () != PTX_WARP_SIZE;
return worker || large_vector;
/* BLOCK is a basic block containing a head or tail instruction.
   Locate the associated prehead or pretail instruction, which must be
   in the single predecessor block.  */

nvptx_discover_pre (basic_block block, int expected)
gcc_assert (block->preds->length () == 1);
basic_block pre_block = (*block->preds)[0]->src;
for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
     pre_insn = PREV_INSN (pre_insn))
gcc_assert (pre_insn != BB_HEAD (pre_block));
gcc_assert (recog_memoized (pre_insn) == expected);
/* Dump this parallel and all its inner parallels.  */

nvptx_dump_pars (parallel *par, unsigned depth)
fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
	 par->forked_block ? par->forked_block->index : -1,
	 par->join_block ? par->join_block->index : -1);
fprintf (dump_file, " blocks:");
for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
fprintf (dump_file, " %d", block->index);
fprintf (dump_file, "\n");
nvptx_dump_pars (par->inner, depth + 1);
nvptx_dump_pars (par->next, depth);
/* If BLOCK contains a fork/join marker, process it to create or
   terminate a loop structure.  Add this block to the current loop,
   and then walk successor blocks.  */

nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
if (block->flags & BB_VISITED)
block->flags |= BB_VISITED;
if (rtx_insn **endp = map->get (block))
rtx_insn *end = *endp;
/* This is a block head or tail, or return instruction.  */
switch (recog_memoized (end))
case CODE_FOR_return:
/* Return instructions are in their own block, and we
   don't need to do anything more.  */
case CODE_FOR_nvptx_forked:
/* Loop head, create a new inner loop and add it into
   our parent's child list.  */
unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
par = new parallel (par, mask);
par->forked_block = block;
par->forked_insn = end;
if (nvptx_needs_shared_bcast (mask))
  = nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
case CODE_FOR_nvptx_join:
/* A loop tail.  Finish the current loop and return to
   parent.  */
unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
gcc_assert (par->mask == mask);
par->join_block = block;
par->join_insn = end;
if (nvptx_needs_shared_bcast (mask))
  = nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
/* Add this block onto the current loop's list of blocks.  */
par->blocks.safe_push (block);
/* This must be the entry block.  Create a NULL parallel.  */
par = new parallel (0, 0);
/* Walk successor blocks.  */
FOR_EACH_EDGE (e, ei, block->succs)
nvptx_find_par (map, par, e->dest);
/* DFS walk the CFG looking for fork & join markers.  Construct
   loop structures as we go.  MAP is a mapping of basic blocks
   to head & tail markers, discovered when splitting blocks.  This
   speeds up the discovery.  We rely on the BB visited flag having
   been cleared when splitting blocks.  */

nvptx_discover_pars (bb_insn_map_t *map)
/* Mark exit blocks as visited.  */
block = EXIT_BLOCK_PTR_FOR_FN (cfun);
block->flags |= BB_VISITED;
/* And entry block as not.  */
block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
block->flags &= ~BB_VISITED;
parallel *par = nvptx_find_par (map, 0, block);
fprintf (dump_file, "\nLoops\n");
nvptx_dump_pars (par, 0);
fprintf (dump_file, "\n");
/* Analyse a group of BBs within a partitioned region and create N
   Single-Entry-Single-Exit regions.  Some of those regions will be
   trivial ones consisting of a single BB.  The blocks of a
   partitioned region might form a set of disjoint graphs -- because
   the region encloses a differently partitioned sub region.

   We use the linear time algorithm described in 'Finding Regions Fast:
   Single Entry Single Exit and control Regions in Linear Time'
   Johnson, Pearson & Pingali.  That algorithm deals with complete
   CFGs, where a back edge is inserted from END to START, and thus the
   problem becomes one of finding equivalent loops.

   In this case we have a partial CFG.  We complete it by redirecting
   any incoming edge to the graph to be from an arbitrary external BB,
   and similarly redirecting any outgoing edge to be to that BB.
   Thus we end up with a closed graph.

   The algorithm works by building a spanning tree of an undirected
   graph and keeping track of back edges from nodes further from the
   root in the tree to nodes nearer to the root in the tree.  In the
   description below, the root is up and the tree grows downwards.

   We avoid having to deal with degenerate back-edges to the same
   block, by splitting each BB into 3 -- one for input edges, one for
   the node itself and one for the output edges.  Such back edges are
   referred to as 'Brackets'.  Cycle equivalent nodes will have the
   same set of brackets.

   Determining bracket equivalency is done by maintaining a list of
   brackets in such a manner that the list length and final bracket
   uniquely identify the set.

   We use coloring to mark all BBs with cycle equivalency with the
   same color.  This is the output of the 'Finding Regions Fast'
   algorithm.  Notice it doesn't actually find the set of nodes within
   a particular region, just unordered sets of nodes that are the
   entries and exits of SESE regions.

   After determining cycle equivalency, we need to find the minimal
   set of SESE regions.  Do this with a DFS coloring walk of the
   complete graph.  We're either 'looking' or 'coloring'.  When
   looking, and we're in the subgraph, we start coloring the color of
   the current node, and remember that node as the start of the
   current color's SESE region.  Every time we go to a new node, we
   decrement the count of nodes with that color.  If it reaches zero,
   we remember that node as the end of the current color's SESE region
   and return to 'looking'.  Otherwise we color the node the current
   color.

   This way we end up with coloring the inside of non-trivial SESE
   regions with the color of that region.  */
/* A pair of BBs.  We use this to represent SESE regions.  */
typedef std::pair<basic_block, basic_block> bb_pair_t;
typedef auto_vec<bb_pair_t> bb_pair_vec_t;

/* A node in the undirected CFG.  The discriminator SECOND indicates just
   above or just below the BB indicated by FIRST.  */
typedef std::pair<basic_block, int> pseudo_node_t;

/* A bracket indicates an edge towards the root of the spanning tree of the
   undirected graph.  Each bracket has a color, determined
   from the current set of brackets.  */

pseudo_node_t back;  /* Back target.  */

/* Current color and size of set.  */

bracket (pseudo_node_t back_)
: back (back_), color (~0u), size (~0u)

unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
color = color_counts.length ();
color_counts.quick_push (0);
color_counts[color]++;

typedef auto_vec<bracket> bracket_vec_t;
/* Basic block info for finding SESE regions.  */

int node;    /* Node number in spanning tree.  */
int parent;  /* Parent node number.  */

/* The algorithm splits each node A into Ai, A', Ao.  The incoming
   edges arrive at pseudo-node Ai and the outgoing edges leave at
   pseudo-node Ao.  We have to remember which way we arrived at a
   particular node when generating the spanning tree.  dir > 0 means
   we arrived at Ai, dir < 0 means we arrived at Ao.  */

/* Lowest numbered pseudo-node reached via a backedge from this
   node, or any descendant.  */

int color;  /* Cycle-equivalence color.  */

/* Stack of brackets for this node.  */
bracket_vec_t brackets;

bb_sese (unsigned node_, unsigned p, int dir_)
:node (node_), parent (p), dir (dir_)

/* Push a bracket ending at BACK.  */
void push (const pseudo_node_t &back)
fprintf (dump_file, "Pushing backedge %d:%+d\n",
	 back.first ? back.first->index : 0, back.second);
brackets.safe_push (bracket (back));

void append (bb_sese *child);
void remove (const pseudo_node_t &);

/* Set node's color.  */
void set_color (auto_vec<unsigned> &color_counts)
color = brackets.last ().get_color (color_counts, brackets.length ());
bb_sese::~bb_sese ()

/* Destructively append CHILD's brackets.  */

bb_sese::append (bb_sese *child)
if (int len = child->brackets.length ())
for (ix = 0; ix < len; ix++)
const pseudo_node_t &pseudo = child->brackets[ix].back;
fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
	 child->node, pseudo.first ? pseudo.first->index : 0,
if (!brackets.length ())
std::swap (brackets, child->brackets);
brackets.reserve (len);
for (ix = 0; ix < len; ix++)
brackets.quick_push (child->brackets[ix]);
/* Remove brackets that terminate at PSEUDO.  */

bb_sese::remove (const pseudo_node_t &pseudo)
unsigned removed = 0;
int len = brackets.length ();
for (int ix = 0; ix < len; ix++)
if (brackets[ix].back == pseudo)
fprintf (dump_file, "Removing backedge %d:%+d\n",
	 pseudo.first ? pseudo.first->index : 0, pseudo.second);
brackets[ix - removed] = brackets[ix];
/* Accessors for BB's aux pointer.  */
#define BB_SET_SESE(B, S) ((B)->aux = (S))
#define BB_GET_SESE(B) ((bb_sese *)(B)->aux)

/* DFS walk creating SESE data structures.  Only cover nodes with
   BB_VISITED set.  Append discovered blocks to LIST.  We number in
   increments of 3 so that the above and below pseudo nodes can be
   implicitly numbered too.  */

nvptx_sese_number (int n, int p, int dir, basic_block b,
		   auto_vec<basic_block> *list)
if (BB_GET_SESE (b))
fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
	 b->index, n, p, dir);
BB_SET_SESE (b, new bb_sese (n, p, dir));
list->quick_push (b);
/* First walk the nodes on the 'other side' of this node, then walk
   the nodes on the same side.  */
for (unsigned ix = 2; ix; ix--)
vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
size_t offset = (dir > 0 ? offsetof (edge_def, dest)
		 : offsetof (edge_def, src));
FOR_EACH_EDGE (e, ei, edges)
basic_block target = *(basic_block *)((char *)e + offset);
if (target->flags & BB_VISITED)
n = nvptx_sese_number (n, p, dir, target, list);
/* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
   EDGES are the outgoing edges and OFFSET is the offset to the src
   or dst block on the edges.  */

nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
		   vec<edge, va_gc> *edges, size_t offset)
int hi_back = depth;
pseudo_node_t node_back (0, depth);
int hi_child = depth;
pseudo_node_t node_child (0, depth);
basic_block child = NULL;
unsigned num_children = 0;
int usd = -dir * sese->dir;
fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
	 me->index, sese->node, dir);
/* This is the above pseudo-child.  It has the BB itself as an
   additional child node.  */
node_child = sese->high;
hi_child = node_child.second;
if (node_child.first)
hi_child += BB_GET_SESE (node_child.first)->node;
/* Examine each edge.
   - if it is a child (a) append its bracket list and (b) record
     whether it is the child with the highest reaching bracket.
   - if it is an edge to ancestor, record whether it's the highest
     reaching backlink.  */
FOR_EACH_EDGE (e, ei, edges)
basic_block target = *(basic_block *)((char *)e + offset);
if (bb_sese *t_sese = BB_GET_SESE (target))
if (t_sese->parent == sese->node && !(t_sese->dir + usd))
/* Child node.  Append its bracket list.  */
sese->append (t_sese);
/* Compare its hi value.  */
int t_hi = t_sese->high.second;
if (basic_block child_hi_block = t_sese->high.first)
t_hi += BB_GET_SESE (child_hi_block)->node;
if (hi_child > t_hi)
node_child = t_sese->high;
else if (t_sese->node < sese->node + dir
	 && !(dir < 0 && sese->parent == t_sese->node))
/* Non-parental ancestor node -- a backlink.  */
int d = usd * t_sese->dir;
int back = t_sese->node + d;
node_back = pseudo_node_t (target, d);
{ /* Fallen off graph, backlink to entry node.  */
node_back = pseudo_node_t (0, 0);
/* Remove any brackets that terminate at this pseudo node.  */
sese->remove (pseudo_node_t (me, dir));
/* Now push any backlinks from this pseudo node.  */
FOR_EACH_EDGE (e, ei, edges)
basic_block target = *(basic_block *)((char *)e + offset);
if (bb_sese *t_sese = BB_GET_SESE (target))
if (t_sese->node < sese->node + dir
    && !(dir < 0 && sese->parent == t_sese->node))
/* Non-parental ancestor node - backedge from me.  */
sese->push (pseudo_node_t (target, usd * t_sese->dir));
/* Back edge to entry node.  */
sese->push (pseudo_node_t (0, 0));
/* If this node leads directly or indirectly to a no-return region of
   the graph, then fake a backedge to entry node.  */
if (!sese->brackets.length () || !edges || !edges->length ())
node_back = pseudo_node_t (0, 0);
sese->push (node_back);
/* Record the highest reaching backedge from us or a descendant.  */
sese->high = hi_back < hi_child ? node_back : node_child;
if (num_children > 1)
/* There is more than one child -- this is a Y shaped piece of
   spanning tree.  We have to insert a fake backedge from this
   node to the highest ancestor reached by not-the-highest
   reaching child.  Note that there may be multiple children
   with backedges to the same highest node.  That's ok and we
   insert the edge to that highest node.  */
if (dir < 0 && child)
node_child = sese->high;
hi_child = node_child.second;
if (node_child.first)
hi_child += BB_GET_SESE (node_child.first)->node;
FOR_EACH_EDGE (e, ei, edges)
basic_block target = *(basic_block *)((char *)e + offset);
if (target == child)
/* Ignore the highest child.  */
bb_sese *t_sese = BB_GET_SESE (target);
if (t_sese->parent != sese->node)
/* Compare its hi value.  */
int t_hi = t_sese->high.second;
if (basic_block child_hi_block = t_sese->high.first)
t_hi += BB_GET_SESE (child_hi_block)->node;
if (hi_child > t_hi)
node_child = t_sese->high;
sese->push (node_child);
/* DFS walk of BB graph.  Color node BLOCK according to COLORING then
   proceed to successors.  Set SESE entry and exit nodes of
   REGIONS.  */

nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
		  basic_block block, int coloring)
bb_sese *sese = BB_GET_SESE (block);
if (block->flags & BB_VISITED)
/* If we've already encountered this block, either we must not
   be coloring, or it must have been colored the current color.  */
gcc_assert (coloring < 0 || (sese && coloring == sese->color));
block->flags |= BB_VISITED;
/* Start coloring a region.  */
regions[sese->color].first = block;
coloring = sese->color;
if (!--color_counts[sese->color] && sese->color == coloring)
/* Found final block of SESE region.  */
regions[sese->color].second = block;
/* Color the node, so we can assert on revisiting the node
   that the graph is indeed SESE.  */
sese->color = coloring;
/* Fallen off the subgraph, we cannot be coloring.  */
gcc_assert (coloring < 0);
/* Walk each successor block.  */
if (block->succs && block->succs->length ())
FOR_EACH_EDGE (e, ei, block->succs)
nvptx_sese_color (color_counts, regions, e->dest, coloring);
gcc_assert (coloring < 0);
/* Find minimal set of SESE regions covering BLOCKS.  REGIONS might
   end up with NULL entries in it.  */

nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
/* First clear each BB of the whole function.  */
FOR_ALL_BB_FN (block, cfun)
block->flags &= ~BB_VISITED;
BB_SET_SESE (block, 0);
/* Mark blocks in the function that are in this graph.  */
for (ix = 0; blocks.iterate (ix, &block); ix++)
block->flags |= BB_VISITED;
/* Counts of nodes assigned to each color.  There cannot be more
   colors than blocks (and hopefully there will be fewer).  */
auto_vec<unsigned> color_counts;
color_counts.reserve (blocks.length ());
/* Worklist of nodes in the spanning tree.  Again, there cannot be
   more nodes in the tree than blocks (there will be fewer if the
   CFG of blocks is disjoint).  */
auto_vec<basic_block> spanlist;
spanlist.reserve (blocks.length ());
/* Make sure every block has its cycle class determined.  */
for (ix = 0; blocks.iterate (ix, &block); ix++)
if (BB_GET_SESE (block))
/* We already met this block in an earlier graph solve.  */
fprintf (dump_file, "Searching graph starting at %d\n", block->index);
/* Number the nodes reachable from block in initial DFS order.  */
int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);
/* Now walk in reverse DFS order to find cycle equivalents.  */
while (spanlist.length ())
block = spanlist.pop ();
bb_sese *sese = BB_GET_SESE (block);
/* Do the pseudo node below.  */
nvptx_sese_pseudo (block, sese, depth, +1,
		   sese->dir > 0 ? block->succs : block->preds,
		   (sese->dir > 0 ? offsetof (edge_def, dest)
		    : offsetof (edge_def, src)));
sese->set_color (color_counts);
/* Do the pseudo node above.  */
nvptx_sese_pseudo (block, sese, depth, -1,
		   sese->dir < 0 ? block->succs : block->preds,
		   (sese->dir < 0 ? offsetof (edge_def, dest)
		    : offsetof (edge_def, src)));
fprintf (dump_file, "\n");
const char *comma = "";
fprintf (dump_file, "Found %d cycle equivalents\n",
	 color_counts.length ());
for (ix = 0; color_counts.iterate (ix, &count); ix++)
fprintf (dump_file, "%s%d[%d]={", comma, ix, count);
for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
if (BB_GET_SESE (block)->color == ix)
block->flags |= BB_VISITED;
fprintf (dump_file, "%s%d", comma, block->index);
fprintf (dump_file, "}");
fprintf (dump_file, "\n");
/* Now we've colored every block in the subgraph.  We now need to
   determine the minimal set of SESE regions that cover that
   subgraph.  Do this with a DFS walk of the complete function.
   During the walk we're either 'looking' or 'coloring'.  When we
   reach the last node of a particular color, we stop coloring and
   return to looking.  */
/* There cannot be more SESE regions than colors.  */
regions.reserve (color_counts.length ());
for (ix = color_counts.length (); ix--;)
regions.quick_push (bb_pair_t (0, 0));
for (ix = 0; blocks.iterate (ix, &block); ix++)
block->flags &= ~BB_VISITED;
nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);
const char *comma = "";
int len = regions.length ();
fprintf (dump_file, "SESE regions:");
for (ix = 0; ix != len; ix++)
basic_block from = regions[ix].first;
basic_block to = regions[ix].second;
fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
fprintf (dump_file, "->%d", to->index);
int color = BB_GET_SESE (from)->color;
/* Print the blocks within the region (excluding ends).  */
FOR_EACH_BB_FN (block, cfun)
bb_sese *sese = BB_GET_SESE (block);
if (sese && sese->color == color
    && block != from && block != to)
fprintf (dump_file, ".%d", block->index);
fprintf (dump_file, "}");
fprintf (dump_file, "\n\n");
for (ix = 0; blocks.iterate (ix, &block); ix++)
delete BB_GET_SESE (block);
/* Propagate live state at the start of a partitioned region.  IS_CALL
   indicates whether the propagation is for a (partitioned) call
   instruction.  BLOCK provides the live register information, and
   might not contain INSN.  Propagation is inserted just after INSN.  RW
   indicates whether we are reading and/or writing state.  This
   separation is needed for worker-level propagation where we
   essentially do a spill & fill.  FN is the underlying worker
   function to generate the propagation instructions for a single
   register.  DATA is user data.

   Returns true if we didn't emit any instructions.

   We propagate the live register set for non-calls and the entire
   frame for calls and non-calls.  We could do better by (a)
   propagating just the live set that is used within the partitioned
   regions and (b) only propagating stack entries that are used.  The
   latter might be quite hard to determine.  */

typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *, bool);

nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
		 propagate_mask rw, propagator_fn fn, void *data, bool vector)
bitmap live = DF_LIVE_IN (block);
bitmap_iterator iterator;
/* Copy the frame array.  */
HOST_WIDE_INT fs = get_frame_size ();
rtx tmp = gen_reg_rtx (DImode);
rtx ptr = gen_reg_rtx (Pmode);
rtx pred = NULL_RTX;
rtx_code_label *label = NULL;
/* The frame size might not be DImode compatible, but the frame
   array's declaration will be.  So it's ok to round up here.  */
fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode);
/* Detect single iteration loop.  */
emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
idx = gen_reg_rtx (SImode);
pred = gen_reg_rtx (BImode);
label = gen_label_rtx ();
emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
/* Allow worker function to initialize anything needed.  */
rtx init = fn (tmp, PM_loop_begin, fs, data, vector);
LABEL_NUSES (label)++;
emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
emit_insn (fn (tmp, rw, fs, data, vector));
emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
emit_insn (gen_br_true_uni (pred, label));
rtx fini = fn (tmp, PM_loop_end, fs, data, vector);
emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
rtx cpy = get_insns ();
insn = emit_insn_after (cpy, insn);
/* Copy live registers.  */
EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
rtx reg = regno_reg_rtx[ix];
if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
rtx bcast = fn (reg, rw, 0, data, vector);
insn = emit_insn_after (bcast, insn);
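/* For reference, the frame-copying sequence built above amounts to a small
   counted loop (pseudo code, purely illustrative):

     ptr = frame_pointer;
     idx = number of DImode words in the frame;
   loop:
     idx -= 1;
     tmp = [ptr];            // or [ptr] = tmp when writing state back
     emit FN (tmp, ...);     // per-word propagation supplied by the caller
     ptr += 8;
     if (idx != 0) goto loop;

   followed by one FN-generated broadcast per live pseudo register.  */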
/* Worker for nvptx_warp_propagate.  */

warp_prop_gen (rtx reg, propagate_mask pm,
	       unsigned ARG_UNUSED (count), void *ARG_UNUSED (data),
	       bool ARG_UNUSED (vector))
if (!(pm & PM_read_write))
return nvptx_gen_warp_bcast (reg);

/* Propagate state that is live at start of BLOCK across the vectors
   of a single warp.  Propagation is inserted just after INSN.
   IS_CALL and return as for nvptx_propagate.  */

nvptx_warp_propagate (bool is_call, basic_block block, rtx_insn *insn)
return nvptx_propagate (is_call, block, insn, PM_read_write,
			warp_prop_gen, 0, false);
/* Worker for nvptx_shared_propagate.  */

shared_prop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_,
broadcast_data_t *data = (broadcast_data_t *)data_;
if (pm & PM_loop_begin)
/* Starting a loop, initialize pointer.  */
unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
oacc_bcast_align = MAX (oacc_bcast_align, align);
data->offset = ROUND_UP (data->offset, align);
data->ptr = gen_reg_rtx (Pmode);
return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
else if (pm & PM_loop_end)
rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
data->ptr = NULL_RTX;
return nvptx_gen_shared_bcast (reg, pm, rep, data, vector);
/* Spill or fill live state that is live at start of BLOCK.  PRE_P
   indicates if this is just before partitioned mode (do spill), or
   just after it starts (do fill).  Sequence is inserted just after
   INSN.  IS_CALL and return as for nvptx_propagate.  */

nvptx_shared_propagate (bool pre_p, bool is_call, basic_block block,
			rtx_insn *insn, bool vector)
broadcast_data_t data;
data.base = gen_reg_rtx (Pmode);
data.ptr = NULL_RTX;
bool empty = nvptx_propagate (is_call, block, insn,
			      pre_p ? PM_read : PM_write, shared_prop_gen,
gcc_assert (empty == !data.offset);
rtx bcast_sym = oacc_bcast_sym;
/* Stuff was emitted, initialize the base pointer now.  */
if (vector && nvptx_mach_max_workers () > 1)
if (!cfun->machine->bcast_partition)
/* It would be nice to place this register in
   DATA_AREA_SHARED.  */
cfun->machine->bcast_partition = gen_reg_rtx (DImode);
if (!cfun->machine->sync_bar)
cfun->machine->sync_bar = gen_reg_rtx (SImode);
bcast_sym = cfun->machine->bcast_partition;
rtx init = gen_rtx_SET (data.base, bcast_sym);
emit_insn_after (init, insn);
unsigned int psize = ROUND_UP (data.offset, oacc_bcast_align);
unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
		     ? nvptx_mach_max_workers () + 1
oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);
/* Emit a CTA-level synchronization barrier.  LOCK is the barrier number,
   which is an integer or a register.  THREADS is the number of threads
   controlled by the barrier.  */

nvptx_cta_sync (rtx lock, int threads)
return gen_nvptx_barsync (lock, GEN_INT (threads));
#if WORKAROUND_PTXJIT_BUG
/* Return first real insn in BB, or return NULL_RTX if BB does not contain
   real insns.  */

bb_first_real_insn (basic_block bb)
/* Find first insn of from block.  */
FOR_BB_INSNS (bb, insn)
/* Return true if INSN needs neutering.  */

needs_neutering_p (rtx_insn *insn)
switch (recog_memoized (insn))
case CODE_FOR_nvptx_fork:
case CODE_FOR_nvptx_forked:
case CODE_FOR_nvptx_joining:
case CODE_FOR_nvptx_join:
case CODE_FOR_nvptx_barsync:
/* Verify position of VECTOR_{JUMP,LABEL} and WORKER_{JUMP,LABEL} in FROM.  */

verify_neutering_jumps (basic_block from,
			rtx_insn *vector_jump, rtx_insn *worker_jump,
			rtx_insn *vector_label, rtx_insn *worker_label)
basic_block bb = from;
rtx_insn *insn = BB_HEAD (bb);
bool seen_worker_jump = false;
bool seen_vector_jump = false;
bool seen_worker_label = false;
bool seen_vector_label = false;
bool worker_neutered = false;
bool vector_neutered = false;
if (insn == worker_jump)
seen_worker_jump = true;
worker_neutered = true;
gcc_assert (!vector_neutered);
else if (insn == vector_jump)
seen_vector_jump = true;
vector_neutered = true;
else if (insn == worker_label)
seen_worker_label = true;
gcc_assert (worker_neutered);
worker_neutered = false;
else if (insn == vector_label)
seen_vector_label = true;
gcc_assert (vector_neutered);
vector_neutered = false;
else if (INSN_P (insn))
switch (recog_memoized (insn))
case CODE_FOR_nvptx_barsync:
gcc_assert (!vector_neutered && !worker_neutered);
if (insn != BB_END (bb))
insn = NEXT_INSN (insn);
else if (JUMP_P (insn) && single_succ_p (bb)
	 && !seen_vector_jump && !seen_worker_jump)
bb = single_succ (bb);
insn = BB_HEAD (bb);
gcc_assert (!(vector_jump && !seen_vector_jump));
gcc_assert (!(worker_jump && !seen_worker_jump));
if (seen_vector_label || seen_worker_label)
gcc_assert (!(vector_label && !seen_vector_label));
gcc_assert (!(worker_label && !seen_worker_label));
/* Verify position of VECTOR_LABEL and WORKER_LABEL in TO.  */

verify_neutering_labels (basic_block to, rtx_insn *vector_label,
			 rtx_insn *worker_label)
basic_block bb = to;
rtx_insn *insn = BB_END (bb);
bool seen_worker_label = false;
bool seen_vector_label = false;
if (insn == worker_label)
seen_worker_label = true;
gcc_assert (!seen_vector_label);
else if (insn == vector_label)
seen_vector_label = true;
else if (INSN_P (insn))
switch (recog_memoized (insn))
case CODE_FOR_nvptx_barsync:
gcc_assert (!seen_vector_label && !seen_worker_label);
if (insn != BB_HEAD (bb))
insn = PREV_INSN (insn);
gcc_assert (!(vector_label && !seen_vector_label));
gcc_assert (!(worker_label && !seen_worker_label));
4306 /* Single neutering according to MASK. FROM is the incoming block and
4307 TO is the outgoing block. These may be the same block. Insert at
4310 if (tid.<axis>) goto end.
4312 and insert before ending branch of TO (if there is such an insn):
4315 <possibly-broadcast-cond>
4318 We currently only use differnt FROM and TO when skipping an entire
4319 loop. We could do more if we detected superblocks. */
4322 nvptx_single (unsigned mask
, basic_block from
, basic_block to
)
4324 rtx_insn
*head
= BB_HEAD (from
);
4325 rtx_insn
*tail
= BB_END (to
);
4326 unsigned skip_mask
= mask
;
4330 /* Find first insn of from block. */
4331 while (head
!= BB_END (from
) && !needs_neutering_p (head
))
4332 head
= NEXT_INSN (head
);
4337 if (!(JUMP_P (head
) && single_succ_p (from
)))
4340 basic_block jump_target
= single_succ (from
);
4341 if (!single_pred_p (jump_target
))
4345 head
= BB_HEAD (from
);
4348 /* Find last insn of to block */
4349 rtx_insn
*limit
= from
== to
? head
: BB_HEAD (to
);
4350 while (tail
!= limit
&& !INSN_P (tail
) && !LABEL_P (tail
))
4351 tail
= PREV_INSN (tail
);
4353 /* Detect if tail is a branch. */
4354 rtx tail_branch
= NULL_RTX
;
4355 rtx cond_branch
= NULL_RTX
;
4356 if (tail
&& INSN_P (tail
))
4358 tail_branch
= PATTERN (tail
);
4359 if (GET_CODE (tail_branch
) != SET
|| SET_DEST (tail_branch
) != pc_rtx
)
4360 tail_branch
= NULL_RTX
;
4363 cond_branch
= SET_SRC (tail_branch
);
4364 if (GET_CODE (cond_branch
) != IF_THEN_ELSE
)
4365 cond_branch
= NULL_RTX
;
4371 /* If this is empty, do nothing. */
4372 if (!head
|| !needs_neutering_p (head
))
4377 /* If we're only doing vector single, there's no need to
4378 emit skip code because we'll not insert anything. */
4379 if (!(mask
& GOMP_DIM_MASK (GOMP_DIM_VECTOR
)))
4382 else if (tail_branch
)
4383 /* Block with only unconditional branch. Nothing to do. */
4387 /* Insert the vector test inside the worker test. */
4389 rtx_insn
*before
= tail
;
4390 rtx_insn
*neuter_start
= NULL
;
4391 rtx_insn
*worker_label
= NULL
, *vector_label
= NULL
;
4392 rtx_insn
*worker_jump
= NULL
, *vector_jump
= NULL
;
4393 for (mode
= GOMP_DIM_WORKER
; mode
<= GOMP_DIM_VECTOR
; mode
++)
4394 if (GOMP_DIM_MASK (mode
) & skip_mask
)
4396 rtx_code_label
*label
= gen_label_rtx ();
4397 rtx pred
= cfun
->machine
->axis_predicate
[mode
- GOMP_DIM_WORKER
];
4398 rtx_insn
**mode_jump
= mode
== GOMP_DIM_VECTOR
? &vector_jump
: &worker_jump
;
4399 rtx_insn
**mode_label
= mode
== GOMP_DIM_VECTOR
? &vector_label
: &worker_label
;
4403 pred
= gen_reg_rtx (BImode
);
4404 cfun
->machine
->axis_predicate
[mode
- GOMP_DIM_WORKER
] = pred
;
4408 if (mode
== GOMP_DIM_VECTOR
)
4409 br
= gen_br_true (pred
, label
);
4411 br
= gen_br_true_uni (pred
, label
);
4413 neuter_start
= emit_insn_after (br
, neuter_start
);
4415 neuter_start
= emit_insn_before (br
, head
);
4416 *mode_jump
= neuter_start
;
4418 LABEL_NUSES (label
)++;
4419 rtx_insn
*label_insn
;
4422 label_insn
= emit_label_before (label
, before
);
4423 before
= label_insn
;
4427 label_insn
= emit_label_after (label
, tail
);
4428 if ((mode
== GOMP_DIM_VECTOR
|| mode
== GOMP_DIM_WORKER
)
4429 && CALL_P (tail
) && find_reg_note (tail
, REG_NORETURN
, NULL
))
4430 emit_insn_after (gen_exit (), label_insn
);
4433 if (mode
== GOMP_DIM_VECTOR
)
4434 vector_label
= label_insn
;
4436 worker_label
= label_insn
;
4439 /* Now deal with propagating the branch condition. */
4442 rtx pvar
= XEXP (XEXP (cond_branch
, 0), 0);
4444 if (GOMP_DIM_MASK (GOMP_DIM_VECTOR
) == mask
4445 && nvptx_mach_vector_length () == PTX_WARP_SIZE
)
4447 /* Vector mode only, do a shuffle. */
4448 #if WORKAROUND_PTXJIT_BUG
4449 /* The branch condition %rcond is propagated like this:
4454 setp.ne.u32 %rnotvzero,%x,0;
4457 @%rnotvzero bra Lskip;
4458 setp.<op>.<type> %rcond,op1,op2;
4460 selp.u32 %rcondu32,1,0,%rcond;
4461 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4462 setp.ne.u32 %rcond,%rcondu32,0;
4464 There seems to be a bug in the ptx JIT compiler (observed at driver
4465 version 381.22, at -O1 and higher for sm_61), that drops the shfl
4466 unless %rcond is initialized to something before 'bra Lskip'. The
4467 bug is not observed with ptxas from cuda 8.0.61.
4469 It is true that the code is non-trivial: at Lskip, %rcond is
4470 uninitialized in threads 1-31, and after the selp the same holds
4471 for %rcondu32. But shfl propagates the defined value in thread 0
4472 to threads 1-31, so after the shfl %rcondu32 is defined in threads
4473 0-31, and after the setp.ne %rcond is defined in threads 0-31.
4475 There is nothing in the PTX spec to suggest that this is wrong, or
4476 to explain why the extra initialization is needed. So, we classify
4477 it as a JIT bug, and the extra initialization as workaround:
4482 setp.ne.u32 %rnotvzero,%x,0;
4485 +.reg .pred %rcond2;
4486 +setp.eq.u32 %rcond2, 1, 0;
4488 @%rnotvzero bra Lskip;
4489 setp.<op>.<type> %rcond,op1,op2;
4490 +mov.pred %rcond2, %rcond;
4492 +mov.pred %rcond, %rcond2;
4493 selp.u32 %rcondu32,1,0,%rcond;
4494 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4495 setp.ne.u32 %rcond,%rcondu32,0;
4497 rtx_insn
*label
= PREV_INSN (tail
);
4498 gcc_assert (label
&& LABEL_P (label
));
4499 rtx tmp
= gen_reg_rtx (BImode
);
4500 emit_insn_before (gen_movbi (tmp
, const0_rtx
),
4501 bb_first_real_insn (from
));
4502 emit_insn_before (gen_rtx_SET (tmp
, pvar
), label
);
4503 emit_insn_before (gen_rtx_SET (pvar
, tmp
), tail
);
4505 emit_insn_before (nvptx_gen_warp_bcast (pvar
), tail
);
4509 /* Includes worker mode, do spill & fill. By construction
4510 we should never have worker mode only. */
4511 broadcast_data_t data
;
4512 unsigned size
= GET_MODE_SIZE (SImode
);
4513 bool vector
= (GOMP_DIM_MASK (GOMP_DIM_VECTOR
) == mask
) != 0;
4514 bool worker
= (GOMP_DIM_MASK (GOMP_DIM_WORKER
) == mask
) != 0;
4515 rtx barrier
= GEN_INT (0);
4518 data
.base
= oacc_bcast_sym
;
4521 bool use_partitioning_p
= (vector
&& !worker
4522 && nvptx_mach_max_workers () > 1
4523 && cfun
->machine
->bcast_partition
);
4524 if (use_partitioning_p
)
4526 data
.base
= cfun
->machine
->bcast_partition
;
4527 barrier
= cfun
->machine
->sync_bar
;
4528 threads
= nvptx_mach_vector_length ();
4530 gcc_assert (data
.base
!= NULL
);
4531 gcc_assert (barrier
);
4533 unsigned int psize
= ROUND_UP (size
, oacc_bcast_align
);
4534 unsigned int pnum
= (nvptx_mach_vector_length () > PTX_WARP_SIZE
4535 ? nvptx_mach_max_workers () + 1
4538 oacc_bcast_partition
= MAX (oacc_bcast_partition
, psize
);
4539 oacc_bcast_size
= MAX (oacc_bcast_size
, psize
* pnum
);
4542 emit_insn_before (nvptx_gen_shared_bcast (pvar
, PM_read
, 0, &data
,
4546 /* Barrier so other workers can see the write. */
4547 emit_insn_before (nvptx_cta_sync (barrier
, threads
), tail
);
4549 emit_insn_before (nvptx_gen_shared_bcast (pvar
, PM_write
, 0, &data
,
4552 /* This barrier is needed to avoid worker zero clobbering
4553 the broadcast buffer before all the other workers have
4554 had a chance to read this instance of it. */
4555 emit_insn_before (nvptx_cta_sync (barrier
, threads
), tail
);
4558 extract_insn (tail
);
4559 rtx unsp
= gen_rtx_UNSPEC (BImode
, gen_rtvec (1, pvar
),
4561 validate_change (tail
, recog_data
.operand_loc
[0], unsp
, false);
4564 bool seen_label
= verify_neutering_jumps (from
, vector_jump
, worker_jump
,
4565 vector_label
, worker_label
);
4567 verify_neutering_labels (to
, vector_label
, worker_label
);
4570 /* PAR is a parallel that is being skipped in its entirety according to
4571 MASK. Treat this as skipping a superblock starting at forked
4572 and ending at joining. */
4575 nvptx_skip_par (unsigned mask
, parallel
*par
)
4577 basic_block tail
= par
->join_block
;
4578 gcc_assert (tail
->preds
->length () == 1);
4580 basic_block pre_tail
= (*tail
->preds
)[0]->src
;
4581 gcc_assert (pre_tail
->succs
->length () == 1);
4583 nvptx_single (mask
, par
->forked_block
, pre_tail
);
4586 /* If PAR has a single inner parallel and PAR itself only contains
4587 empty entry and exit blocks, swallow the inner PAR. */
4590 nvptx_optimize_inner (parallel
*par
)
4592 parallel
*inner
= par
->inner
;
4594 /* We mustn't be the outer dummy par. */
4598 /* We must have a single inner par. */
4599 if (!inner
|| inner
->next
)
4602 /* We must only contain 2 blocks ourselves -- the head and tail of
4604 if (par
->blocks
.length () != 2)
4607 /* We must be disjoint partitioning. As we only have vector and
4608 worker partitioning, this is sufficient to guarantee the pars
4609 have adjacent partitioning. */
4610 if ((par
->mask
& inner
->mask
) & (GOMP_DIM_MASK (GOMP_DIM_MAX
) - 1))
4611 /* This indicates malformed code generation. */
4614 /* The outer forked insn should be immediately followed by the inner
4616 rtx_insn
*forked
= par
->forked_insn
;
4617 rtx_insn
*fork
= BB_END (par
->forked_block
);
4619 if (NEXT_INSN (forked
) != fork
)
4621 gcc_checking_assert (recog_memoized (fork
) == CODE_FOR_nvptx_fork
);
4623 /* The outer joining insn must immediately follow the inner join
4625 rtx_insn
*joining
= par
->joining_insn
;
4626 rtx_insn
*join
= inner
->join_insn
;
4627 if (NEXT_INSN (join
) != joining
)
4630 /* Preconditions met. Swallow the inner par. */
4632 fprintf (dump_file
, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
4633 inner
->mask
, inner
->forked_block
->index
,
4634 inner
->join_block
->index
,
4635 par
->mask
, par
->forked_block
->index
, par
->join_block
->index
);
4637 par
->mask
|= inner
->mask
& (GOMP_DIM_MASK (GOMP_DIM_MAX
) - 1);
4639 par
->blocks
.reserve (inner
->blocks
.length ());
4640 while (inner
->blocks
.length ())
4641 par
->blocks
.quick_push (inner
->blocks
.pop ());
4643 par
->inner
= inner
->inner
;
4644 inner
->inner
= NULL
;
4649 /* Process the parallel PAR and all its contained
4650 parallels. We do everything but the neutering. Return mask of
4651 partitioned modes used within this parallel. */
4654 nvptx_process_pars (parallel
*par
)
4657 nvptx_optimize_inner (par
);
4659 unsigned inner_mask
= par
->mask
;
4661 /* Do the inner parallels first. */
4664 par
->inner_mask
= nvptx_process_pars (par
->inner
);
4665 inner_mask
|= par
->inner_mask
;
4668 bool is_call
= (par
->mask
& GOMP_DIM_MASK (GOMP_DIM_MAX
)) != 0;
4669 bool worker
= (par
->mask
& GOMP_DIM_MASK (GOMP_DIM_WORKER
));
4670 bool large_vector
= ((par
->mask
& GOMP_DIM_MASK (GOMP_DIM_VECTOR
))
4671 && nvptx_mach_vector_length () > PTX_WARP_SIZE
);
4673 if (worker
|| large_vector
)
4675 nvptx_shared_propagate (false, is_call
, par
->forked_block
,
4676 par
->forked_insn
, !worker
);
4678 = nvptx_shared_propagate (true, is_call
, par
->forked_block
,
4679 par
->fork_insn
, !worker
);
4681 = !is_call
&& (NEXT_INSN (par
->forked_insn
)
4682 && NEXT_INSN (par
->forked_insn
) == par
->joining_insn
);
4683 rtx barrier
= GEN_INT (0);
4686 if (!worker
&& cfun
->machine
->sync_bar
)
4688 barrier
= cfun
->machine
->sync_bar
;
4689 threads
= nvptx_mach_vector_length ();
4692 if (no_prop_p
&& empty_loop_p
)
4694 else if (no_prop_p
&& is_call
)
4698 /* Insert begin and end synchronizations. */
4699 emit_insn_before (nvptx_cta_sync (barrier
, threads
),
4701 emit_insn_before (nvptx_cta_sync (barrier
, threads
), par
->join_insn
);
4704 else if (par
->mask
& GOMP_DIM_MASK (GOMP_DIM_VECTOR
))
4705 nvptx_warp_propagate (is_call
, par
->forked_block
, par
->forked_insn
);
4707 /* Now do siblings. */
4709 inner_mask
|= nvptx_process_pars (par
->next
);
4713 /* Neuter the parallel described by PAR. We recurse in depth-first
4714 order. MODES are the partitioning of the execution and OUTER is
4715 the partitioning of the parallels we are contained in. */
4718 nvptx_neuter_pars (parallel
*par
, unsigned modes
, unsigned outer
)
4720 unsigned me
= (par
->mask
4721 & (GOMP_DIM_MASK (GOMP_DIM_WORKER
)
4722 | GOMP_DIM_MASK (GOMP_DIM_VECTOR
)));
4723 unsigned skip_mask
= 0, neuter_mask
= 0;
4726 nvptx_neuter_pars (par
->inner
, modes
, outer
| me
);
4728 for (unsigned mode
= GOMP_DIM_WORKER
; mode
<= GOMP_DIM_VECTOR
; mode
++)
4730 if ((outer
| me
) & GOMP_DIM_MASK (mode
))
4731 {} /* Mode is partitioned: no neutering. */
4732 else if (!(modes
& GOMP_DIM_MASK (mode
)))
4733 {} /* Mode is not used: nothing to do. */
4734 else if (par
->inner_mask
& GOMP_DIM_MASK (mode
)
4735 || !par
->forked_insn
)
4736 /* Partitioned in inner parallels, or we're not a partitioned
4737 at all: neuter individual blocks. */
4738 neuter_mask
|= GOMP_DIM_MASK (mode
);
4739 else if (!par
->parent
|| !par
->parent
->forked_insn
4740 || par
->parent
->inner_mask
& GOMP_DIM_MASK (mode
))
4741 /* Parent isn't a parallel or contains this paralleling: skip
4742 parallel at this level. */
4743 skip_mask
|= GOMP_DIM_MASK (mode
);
4745 {} /* Parent will skip this parallel itself. */
4754 /* Neuter whole SESE regions. */
4755 bb_pair_vec_t regions
;
4757 nvptx_find_sese (par
->blocks
, regions
);
4758 len
= regions
.length ();
4759 for (ix
= 0; ix
!= len
; ix
++)
4761 basic_block from
= regions
[ix
].first
;
4762 basic_block to
= regions
[ix
].second
;
4765 nvptx_single (neuter_mask
, from
, to
);
4772 /* Neuter each BB individually. */
4773 len
= par
->blocks
.length ();
4774 for (ix
= 0; ix
!= len
; ix
++)
4776 basic_block block
= par
->blocks
[ix
];
4778 nvptx_single (neuter_mask
, block
, block
);
4784 nvptx_skip_par (skip_mask
, par
);
4787 nvptx_neuter_pars (par
->next
, modes
, outer
);
4791 populate_offload_attrs (offload_attrs
*oa
)
4793 tree attr
= oacc_get_fn_attrib (current_function_decl
);
4794 tree dims
= TREE_VALUE (attr
);
4799 for (ix
= 0; ix
!= GOMP_DIM_MAX
; ix
++, dims
= TREE_CHAIN (dims
))
4801 tree t
= TREE_VALUE (dims
);
4802 int size
= (t
== NULL_TREE
) ? -1 : TREE_INT_CST_LOW (t
);
4803 tree allowed
= TREE_PURPOSE (dims
);
4805 if (size
!= 1 && !(allowed
&& integer_zerop (allowed
)))
4806 oa
->mask
|= GOMP_DIM_MASK (ix
);
4811 oa
->num_gangs
= size
;
4814 case GOMP_DIM_WORKER
:
4815 oa
->num_workers
= size
;
4818 case GOMP_DIM_VECTOR
:
4819 oa
->vector_length
= size
;
4825 #if WORKAROUND_PTXJIT_BUG_2
/* Variant of pc_set that only requires JUMP_P (INSN) if STRICT.  This variant
   is needed in the nvptx target because the branches generated for
   partitioning are NONJUMP_INSN_P, not JUMP_P.  */
4831 nvptx_pc_set (const rtx_insn
*insn
, bool strict
= true)
4834 if ((strict
&& !JUMP_P (insn
))
4835 || (!strict
&& !INSN_P (insn
)))
4837 pat
= PATTERN (insn
);
4839 /* The set is allowed to appear either as the insn pattern or
4840 the first set in a PARALLEL. */
4841 if (GET_CODE (pat
) == PARALLEL
)
4842 pat
= XVECEXP (pat
, 0, 0);
4843 if (GET_CODE (pat
) == SET
&& GET_CODE (SET_DEST (pat
)) == PC
)
4849 /* Variant of condjump_label that only requires JUMP_P (INSN) if STRICT. */
4852 nvptx_condjump_label (const rtx_insn
*insn
, bool strict
= true)
4854 rtx x
= nvptx_pc_set (insn
, strict
);
4859 if (GET_CODE (x
) == LABEL_REF
)
4861 if (GET_CODE (x
) != IF_THEN_ELSE
)
4863 if (XEXP (x
, 2) == pc_rtx
&& GET_CODE (XEXP (x
, 1)) == LABEL_REF
)
4865 if (XEXP (x
, 1) == pc_rtx
&& GET_CODE (XEXP (x
, 2)) == LABEL_REF
)
/* Insert a dummy ptx insn when encountering a branch to a label with no ptx
   insn in between the branch and the label.  This works around a JIT bug
   observed at driver version 384.111, at -O0 for sm_50.  */
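/* Illustrative sketch of the problematic shape (example PTX, not captured
   compiler output):

	@%p0	bra	$L1;
     $L1:

   The pass below inserts a fake no-op before such a label, so the branch no
   longer targets the immediately following label.  */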
4875 prevent_branch_around_nothing (void)
4877 rtx_insn
*seen_label
= NULL
;
4878 for (rtx_insn
*insn
= get_insns (); insn
; insn
= NEXT_INSN (insn
))
4880 if (INSN_P (insn
) && condjump_p (insn
))
4882 seen_label
= label_ref_label (nvptx_condjump_label (insn
, false));
4886 if (seen_label
== NULL
)
4889 if (NOTE_P (insn
) || DEBUG_INSN_P (insn
))
4893 switch (recog_memoized (insn
))
4895 case CODE_FOR_nvptx_fork
:
4896 case CODE_FOR_nvptx_forked
:
4897 case CODE_FOR_nvptx_joining
:
4898 case CODE_FOR_nvptx_join
:
4905 if (LABEL_P (insn
) && insn
== seen_label
)
4906 emit_insn_before (gen_fake_nop (), insn
);
4913 #ifdef WORKAROUND_PTXJIT_BUG_3
/* Insert two membar.cta insns in between two subsequent bar.sync insns.  This
   works around a hang observed at driver version 390.48 for sm_50.  */
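/* Illustrative sketch of the rewrite (example PTX, not captured compiler
   output): for two consecutive synchronizations

	bar.sync	0;
	bar.sync	0;

   the pass below inserts

	membar.cta;
	membar.cta;

   immediately before the second bar.sync.  */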
4918 workaround_barsyncs (void)
4920 bool seen_barsync
= false;
4921 for (rtx_insn
*insn
= get_insns (); insn
; insn
= NEXT_INSN (insn
))
4923 if (INSN_P (insn
) && recog_memoized (insn
) == CODE_FOR_nvptx_barsync
)
4927 emit_insn_before (gen_nvptx_membar_cta (), insn
);
4928 emit_insn_before (gen_nvptx_membar_cta (), insn
);
4931 seen_barsync
= true;
4938 if (NOTE_P (insn
) || DEBUG_INSN_P (insn
))
4940 else if (INSN_P (insn
))
4941 switch (recog_memoized (insn
))
4943 case CODE_FOR_nvptx_fork
:
4944 case CODE_FOR_nvptx_forked
:
4945 case CODE_FOR_nvptx_joining
:
4946 case CODE_FOR_nvptx_join
:
4952 seen_barsync
= false;
/* PTX-specific reorganization
   - Split blocks at fork and join instructions
   - Compute live registers
   - Mark now-unused registers, so function begin doesn't declare
   unused registers.
   - Insert state propagation when entering partitioned mode
   - Insert neutering instructions when in single mode
   - Replace subregs with suitable sequences.
*/

static void
nvptx_reorg (void)
{
4970 /* We are freeing block_for_insn in the toplev to keep compatibility
4971 with old MDEP_REORGS that are not CFG based. Recompute it now. */
4972 compute_bb_for_insn ();
4974 thread_prologue_and_epilogue_insns ();
4976 /* Split blocks and record interesting unspecs. */
4977 bb_insn_map_t bb_insn_map
;
4979 nvptx_split_blocks (&bb_insn_map
);
4981 /* Compute live regs */
4982 df_clear_flags (DF_LR_RUN_DCE
);
4983 df_set_flags (DF_NO_INSN_RESCAN
| DF_NO_HARD_REGS
);
4984 df_live_add_problem ();
4985 df_live_set_all_dirty ();
4987 regstat_init_n_sets_and_refs ();
4990 df_dump (dump_file
);
4992 /* Mark unused regs as unused. */
4993 int max_regs
= max_reg_num ();
4994 for (int i
= LAST_VIRTUAL_REGISTER
+ 1; i
< max_regs
; i
++)
4995 if (REG_N_SETS (i
) == 0 && REG_N_REFS (i
) == 0)
4996 regno_reg_rtx
[i
] = const0_rtx
;
4998 /* Determine launch dimensions of the function. If it is not an
4999 offloaded function (i.e. this is a regular compiler), the
5000 function has no neutering. */
5001 tree attr
= oacc_get_fn_attrib (current_function_decl
);
5004 /* If we determined this mask before RTL expansion, we could
5005 elide emission of some levels of forks and joins. */
5008 populate_offload_attrs (&oa
);
5010 /* If there is worker neutering, there must be vector
5011 neutering. Otherwise the hardware will fail. */
5012 gcc_assert (!(oa
.mask
& GOMP_DIM_MASK (GOMP_DIM_WORKER
))
5013 || (oa
.mask
& GOMP_DIM_MASK (GOMP_DIM_VECTOR
)));
5015 /* Discover & process partitioned regions. */
5016 parallel
*pars
= nvptx_discover_pars (&bb_insn_map
);
5017 nvptx_process_pars (pars
);
5018 nvptx_neuter_pars (pars
, oa
.mask
, 0);
5022 /* Replace subregs. */
5023 nvptx_reorg_subreg ();
5025 if (TARGET_UNIFORM_SIMT
)
5026 nvptx_reorg_uniform_simt ();
5028 #if WORKAROUND_PTXJIT_BUG_2
5029 prevent_branch_around_nothing ();
5032 #ifdef WORKAROUND_PTXJIT_BUG_3
5033 workaround_barsyncs ();
5036 regstat_free_n_sets_and_refs ();
5038 df_finish_pass (true);
5041 /* Handle a "kernel" attribute; arguments as in
5042 struct attribute_spec.handler. */
5045 nvptx_handle_kernel_attribute (tree
*node
, tree name
, tree
ARG_UNUSED (args
),
5046 int ARG_UNUSED (flags
), bool *no_add_attrs
)
5050 if (TREE_CODE (decl
) != FUNCTION_DECL
)
5052 error ("%qE attribute only applies to functions", name
);
5053 *no_add_attrs
= true;
5055 else if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl
))))
5057 error ("%qE attribute requires a void return type", name
);
5058 *no_add_attrs
= true;
5064 /* Handle a "shared" attribute; arguments as in
5065 struct attribute_spec.handler. */
5068 nvptx_handle_shared_attribute (tree
*node
, tree name
, tree
ARG_UNUSED (args
),
5069 int ARG_UNUSED (flags
), bool *no_add_attrs
)
5073 if (TREE_CODE (decl
) != VAR_DECL
)
5075 error ("%qE attribute only applies to variables", name
);
5076 *no_add_attrs
= true;
5078 else if (!(TREE_PUBLIC (decl
) || TREE_STATIC (decl
)))
5080 error ("%qE attribute not allowed with auto storage class", name
);
5081 *no_add_attrs
= true;
5087 /* Table of valid machine attributes. */
5088 static const struct attribute_spec nvptx_attribute_table
[] =
5090 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
5091 affects_type_identity, handler, exclude } */
5092 { "kernel", 0, 0, true, false, false, false, nvptx_handle_kernel_attribute
,
5094 { "shared", 0, 0, true, false, false, false, nvptx_handle_shared_attribute
,
5096 { NULL
, 0, 0, false, false, false, false, NULL
, NULL
}
/* Limit vector alignments to BIGGEST_ALIGNMENT.  */

static HOST_WIDE_INT
nvptx_vector_alignment (const_tree type)
{
  HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));

  return MIN (align, BIGGEST_ALIGNMENT);
}
/* Indicate that INSN cannot be duplicated.  */

static bool
nvptx_cannot_copy_insn_p (rtx_insn *insn)
{
  switch (recog_memoized (insn))
    {
    case CODE_FOR_nvptx_shufflesi:
    case CODE_FOR_nvptx_shufflesf:
    case CODE_FOR_nvptx_barsync:
    case CODE_FOR_nvptx_fork:
    case CODE_FOR_nvptx_forked:
    case CODE_FOR_nvptx_joining:
    case CODE_FOR_nvptx_join:
      return true;
    default:
      return false;
    }
}
/* Section anchors do not work.  Initialization for flag_section_anchor
   probes the existence of the anchoring target hooks and prevents
   anchoring if they don't exist.  However, we may be being used with
   a host-side compiler that does support anchoring, and hence see
   the anchor flag set (as it's not recalculated).  So provide an
   implementation denying anchoring.  */

static bool
nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a))
{
  return false;
}
5142 /* Record a symbol for mkoffload to enter into the mapping table. */
5145 nvptx_record_offload_symbol (tree decl
)
5147 switch (TREE_CODE (decl
))
5150 fprintf (asm_out_file
, "//:VAR_MAP \"%s\"\n",
5151 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl
)));
5156 tree attr
= oacc_get_fn_attrib (decl
);
5157 /* OpenMP offloading does not set this attribute. */
5158 tree dims
= attr
? TREE_VALUE (attr
) : NULL_TREE
;
5160 fprintf (asm_out_file
, "//:FUNC_MAP \"%s\"",
5161 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl
)));
5163 for (; dims
; dims
= TREE_CHAIN (dims
))
5165 int size
= TREE_INT_CST_LOW (TREE_VALUE (dims
));
5167 gcc_assert (!TREE_PURPOSE (dims
));
5168 fprintf (asm_out_file
, ", %#x", size
);
5171 fprintf (asm_out_file
, "\n");
/* Implement TARGET_ASM_FILE_START.  Write the kinds of things ptxas expects
   at the start of a file.  */

static void
nvptx_file_start (void)
{
  fputs ("// BEGIN PREAMBLE\n", asm_out_file);
  fputs ("\t.version\t3.1\n", asm_out_file);
  if (TARGET_SM35)
    fputs ("\t.target\tsm_35\n", asm_out_file);
  else
    fputs ("\t.target\tsm_30\n", asm_out_file);
  fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
  fputs ("// END PREAMBLE\n", asm_out_file);
}
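/* For illustration only (a sketch, not captured compiler output), a 64-bit
   compilation targeting sm_35 gets a preamble along the lines of:

     // BEGIN PREAMBLE
	.version	3.1
	.target	sm_35
	.address_size 64
     // END PREAMBLE
*/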
/* Emit a declaration for a worker and vector-level buffer in .shared
   memory.  */

static void
write_shared_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
{
  const char *name = XSTR (sym, 0);

  write_var_marker (file, true, false, name);
  fprintf (file, ".shared .align %d .u8 %s[%d];\n",
	   align, name, size);
}
5209 /* Write out the function declarations we've collected and declare storage
5210 for the broadcast buffer. */
5213 nvptx_file_end (void)
5215 hash_table
<tree_hasher
>::iterator iter
;
5217 FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab
, decl
, tree
, iter
)
5218 nvptx_record_fndecl (decl
);
5219 fputs (func_decls
.str().c_str(), asm_out_file
);
5221 if (oacc_bcast_size
)
5222 write_shared_buffer (asm_out_file
, oacc_bcast_sym
,
5223 oacc_bcast_align
, oacc_bcast_size
);
5225 if (worker_red_size
)
5226 write_shared_buffer (asm_out_file
, worker_red_sym
,
5227 worker_red_align
, worker_red_size
);
5229 if (vector_red_size
)
5230 write_shared_buffer (asm_out_file
, vector_red_sym
,
5231 vector_red_align
, vector_red_size
);
5233 if (need_softstack_decl
)
5235 write_var_marker (asm_out_file
, false, true, "__nvptx_stacks");
5236 /* 32 is the maximum number of warps in a block. Even though it's an
5237 external declaration, emit the array size explicitly; otherwise, it
5238 may fail at PTX JIT time if the definition is later in link order. */
5239 fprintf (asm_out_file
, ".extern .shared .u%d __nvptx_stacks[32];\n",
5242 if (need_unisimt_decl
)
5244 write_var_marker (asm_out_file
, false, true, "__nvptx_uni");
5245 fprintf (asm_out_file
, ".extern .shared .u32 __nvptx_uni[32];\n");
5249 /* Expander for the shuffle builtins. */
5252 nvptx_expand_shuffle (tree exp
, rtx target
, machine_mode mode
, int ignore
)
5257 rtx src
= expand_expr (CALL_EXPR_ARG (exp
, 0),
5258 NULL_RTX
, mode
, EXPAND_NORMAL
);
5260 src
= copy_to_mode_reg (mode
, src
);
5262 rtx idx
= expand_expr (CALL_EXPR_ARG (exp
, 1),
5263 NULL_RTX
, SImode
, EXPAND_NORMAL
);
5264 rtx op
= expand_expr (CALL_EXPR_ARG (exp
, 2),
5265 NULL_RTX
, SImode
, EXPAND_NORMAL
);
5267 if (!REG_P (idx
) && GET_CODE (idx
) != CONST_INT
)
5268 idx
= copy_to_mode_reg (SImode
, idx
);
5270 rtx pat
= nvptx_gen_shuffle (target
, src
, idx
,
5271 (nvptx_shuffle_kind
) INTVAL (op
));
5279 nvptx_output_red_partition (rtx dst
, rtx offset
)
5281 const char *zero_offset
= "\t\tmov.u64\t%%r%d, %%r%d; // vred buffer\n";
5282 const char *with_offset
= "\t\tadd.u64\t%%r%d, %%r%d, %d; // vred buffer\n";
5284 if (offset
== const0_rtx
)
5285 fprintf (asm_out_file
, zero_offset
, REGNO (dst
),
5286 REGNO (cfun
->machine
->red_partition
));
5288 fprintf (asm_out_file
, with_offset
, REGNO (dst
),
5289 REGNO (cfun
->machine
->red_partition
), UINTVAL (offset
));
5294 /* Shared-memory reduction address expander. */
5297 nvptx_expand_shared_addr (tree exp
, rtx target
,
5298 machine_mode
ARG_UNUSED (mode
), int ignore
,
5304 unsigned align
= TREE_INT_CST_LOW (CALL_EXPR_ARG (exp
, 2));
5305 unsigned offset
= TREE_INT_CST_LOW (CALL_EXPR_ARG (exp
, 0));
5306 unsigned size
= TREE_INT_CST_LOW (CALL_EXPR_ARG (exp
, 1));
5307 rtx addr
= worker_red_sym
;
5313 populate_offload_attrs (&oa
);
5315 unsigned int psize
= ROUND_UP (size
+ offset
, align
);
5316 unsigned int pnum
= nvptx_mach_max_workers ();
5317 vector_red_partition
= MAX (vector_red_partition
, psize
);
5318 vector_red_size
= MAX (vector_red_size
, psize
* pnum
);
5319 vector_red_align
= MAX (vector_red_align
, align
);
5321 if (cfun
->machine
->red_partition
== NULL
)
5322 cfun
->machine
->red_partition
= gen_reg_rtx (Pmode
);
5324 addr
= gen_reg_rtx (Pmode
);
5325 emit_insn (gen_nvptx_red_partition (addr
, GEN_INT (offset
)));
5329 worker_red_align
= MAX (worker_red_align
, align
);
5330 worker_red_size
= MAX (worker_red_size
, size
+ offset
);
5334 addr
= gen_rtx_PLUS (Pmode
, addr
, GEN_INT (offset
));
5335 addr
= gen_rtx_CONST (Pmode
, addr
);
5339 emit_move_insn (target
, addr
);
/* Expand the CMP_SWAP PTX builtins.  We have our own versions that do
   not require taking the address of any object, other than the memory
   cell being operated on.  */
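/* Illustrative use (a sketch mirroring how nvptx_lockless_update and
   nvptx_lockfull_update below build the call; the variable names are
   examples):

     old = __builtin_nvptx_cmp_swap (ptr, expected, desired);

   OLD receives the previous contents of *PTR; the swap took effect iff
   old == expected.  */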
5348 nvptx_expand_cmp_swap (tree exp
, rtx target
,
5349 machine_mode
ARG_UNUSED (m
), int ARG_UNUSED (ignore
))
5351 machine_mode mode
= TYPE_MODE (TREE_TYPE (exp
));
5354 target
= gen_reg_rtx (mode
);
5356 rtx mem
= expand_expr (CALL_EXPR_ARG (exp
, 0),
5357 NULL_RTX
, Pmode
, EXPAND_NORMAL
);
5358 rtx cmp
= expand_expr (CALL_EXPR_ARG (exp
, 1),
5359 NULL_RTX
, mode
, EXPAND_NORMAL
);
5360 rtx src
= expand_expr (CALL_EXPR_ARG (exp
, 2),
5361 NULL_RTX
, mode
, EXPAND_NORMAL
);
5364 mem
= gen_rtx_MEM (mode
, mem
);
5366 cmp
= copy_to_mode_reg (mode
, cmp
);
5368 src
= copy_to_mode_reg (mode
, src
);
5371 pat
= gen_atomic_compare_and_swapsi_1 (target
, mem
, cmp
, src
, const0_rtx
);
5373 pat
= gen_atomic_compare_and_swapdi_1 (target
, mem
, cmp
, src
, const0_rtx
);
5381 /* Codes for all the NVPTX builtins. */
5384 NVPTX_BUILTIN_SHUFFLE
,
5385 NVPTX_BUILTIN_SHUFFLELL
,
5386 NVPTX_BUILTIN_WORKER_ADDR
,
5387 NVPTX_BUILTIN_VECTOR_ADDR
,
5388 NVPTX_BUILTIN_CMP_SWAP
,
5389 NVPTX_BUILTIN_CMP_SWAPLL
,
5393 static GTY(()) tree nvptx_builtin_decls
[NVPTX_BUILTIN_MAX
];
5395 /* Return the NVPTX builtin for CODE. */
5398 nvptx_builtin_decl (unsigned code
, bool ARG_UNUSED (initialize_p
))
5400 if (code
>= NVPTX_BUILTIN_MAX
)
5401 return error_mark_node
;
5403 return nvptx_builtin_decls
[code
];
/* Set up all builtin functions for this target.  */

static void
nvptx_init_builtins (void)
{
#define DEF(ID, NAME, T)						\
  (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID]				\
   = add_builtin_function ("__builtin_nvptx_" NAME,			\
			   build_function_type_list T,			\
			   NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))

#define ST sizetype
#define UINT unsigned_type_node
#define LLUINT long_long_unsigned_type_node
#define PTRVOID ptr_type_node

  DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
  DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
  DEF (WORKER_ADDR, "worker_addr",
       (PTRVOID, ST, UINT, UINT, NULL_TREE));
  DEF (VECTOR_ADDR, "vector_addr",
       (PTRVOID, ST, UINT, UINT, NULL_TREE));
  DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
  DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));

#undef DEF
#undef ST
#undef UINT
#undef LLUINT
#undef PTRVOID
}
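/* Illustrative use of the shuffle builtin (a sketch of the tree-level call
   built later in this file by nvptx_generate_vector_shuffle; user code is
   not expected to write this directly):

     r = __builtin_nvptx_shuffle (v, 16, SHUFFLE_DOWN);

   i.e. read V from the lane 16 below the current one within the warp.  */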
5437 /* Expand an expression EXP that calls a built-in function,
5438 with result going to TARGET if that's convenient
5439 (and in mode MODE if that's convenient).
5440 SUBTARGET may be used as the target for computing one of EXP's operands.
5441 IGNORE is nonzero if the value is to be ignored. */
5444 nvptx_expand_builtin (tree exp
, rtx target
, rtx
ARG_UNUSED (subtarget
),
5445 machine_mode mode
, int ignore
)
5447 tree fndecl
= TREE_OPERAND (CALL_EXPR_FN (exp
), 0);
5448 switch (DECL_FUNCTION_CODE (fndecl
))
5450 case NVPTX_BUILTIN_SHUFFLE
:
5451 case NVPTX_BUILTIN_SHUFFLELL
:
5452 return nvptx_expand_shuffle (exp
, target
, mode
, ignore
);
5454 case NVPTX_BUILTIN_WORKER_ADDR
:
5455 return nvptx_expand_shared_addr (exp
, target
, mode
, ignore
, false);
5457 case NVPTX_BUILTIN_VECTOR_ADDR
:
5458 return nvptx_expand_shared_addr (exp
, target
, mode
, ignore
, true);
5460 case NVPTX_BUILTIN_CMP_SWAP
:
5461 case NVPTX_BUILTIN_CMP_SWAPLL
:
5462 return nvptx_expand_cmp_swap (exp
, target
, mode
, ignore
);
5464 default: gcc_unreachable ();
/* Implement TARGET_SIMT_VF target hook: number of threads in a warp.  */

static int
nvptx_simt_vf ()
{
  return PTX_WARP_SIZE;
}

static bool
nvptx_welformed_vector_length_p (int l)
{
  gcc_assert (l > 0);
  return l % PTX_WARP_SIZE == 0;
}
static void
nvptx_apply_dim_limits (int dims[])
{
  /* Check that the vector_length is not too large.  */
  if (dims[GOMP_DIM_VECTOR] > PTX_MAX_VECTOR_LENGTH)
    dims[GOMP_DIM_VECTOR] = PTX_MAX_VECTOR_LENGTH;

  /* Check that the number of workers is not too large.  */
  if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
    dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;

  /* Ensure that num_worker * vector_length <= cta size.  */
  if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0
      && dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] > PTX_CTA_SIZE)
    dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
}
5500 /* Return true if FNDECL contains calls to vector-partitionable routines. */
5503 has_vector_partitionable_routine_calls_p (tree fndecl
)
5509 FOR_EACH_BB_FN (bb
, DECL_STRUCT_FUNCTION (fndecl
))
5510 for (gimple_stmt_iterator i
= gsi_start_bb (bb
); !gsi_end_p (i
);
5511 gsi_next_nondebug (&i
))
5513 gimple
*stmt
= gsi_stmt (i
);
5514 if (gimple_code (stmt
) != GIMPLE_CALL
)
5517 tree callee
= gimple_call_fndecl (stmt
);
5521 tree attrs
= oacc_get_fn_attrib (callee
);
5522 if (attrs
== NULL_TREE
)
5525 int partition_level
= oacc_fn_attrib_level (attrs
);
5526 bool seq_routine_p
= partition_level
== GOMP_DIM_MAX
;
5534 /* As nvptx_goacc_validate_dims, but does not return bool to indicate whether
5535 DIMS has changed. */
5538 nvptx_goacc_validate_dims_1 (tree decl
, int dims
[], int fn_level
)
5540 bool oacc_default_dims_p
= false;
5541 bool oacc_min_dims_p
= false;
5542 bool offload_region_p
= false;
5543 bool routine_p
= false;
5544 bool routine_seq_p
= false;
5546 if (decl
== NULL_TREE
)
5549 oacc_default_dims_p
= true;
5550 else if (fn_level
== -2)
5551 oacc_min_dims_p
= true;
5555 else if (fn_level
== -1)
5556 offload_region_p
= true;
5557 else if (0 <= fn_level
&& fn_level
<= GOMP_DIM_MAX
)
5560 routine_seq_p
= fn_level
== GOMP_DIM_MAX
;
5567 /* OpenACC routines in C arrive here with the following attributes
5568 (omitting the 'omp declare target'):
5569 seq : __attribute__((oacc function (0 1, 0 1, 0 1)))
5570 vector: __attribute__((oacc function (0 1, 0 1, 1 0)))
5571 worker: __attribute__((oacc function (0 1, 1 0, 1 0)))
5572 gang : __attribute__((oacc function (1 0, 1 0, 1 0)))
5574 If we take f.i. the oacc function attribute of the worker routine
5575 (0 1, 1 0, 1 0), then:
5576 - the slice (0, 1, 1) is interpreted by oacc_fn_attrib_level as
5577 meaning: worker routine, that is:
5578 - can't contain gang loop (0),
5579 - can contain worker loop (1),
5580 - can contain vector loop (1).
5581 - the slice (1, 0, 0) is interpreted by oacc_validate_dims as the
5582 dimensions: gang: 1, worker: 0, vector: 0.
5584 OTOH, routines in Fortran arrive here with these attributes:
5585 seq : __attribute__((oacc function (0 0, 0 0, 0 0)))
5586 vector: __attribute__((oacc function (0 0, 0 0, 1 0)))
5587 worker: __attribute__((oacc function (0 0, 1 0, 1 0)))
5588 gang : __attribute__((oacc function (1 0, 1 0, 1 0)))
5589 that is, the same as for C but with the dimensions set to 0.
5591 This is due to a bug in the Fortran front-end: PR72741. Work around
5592 this bug by forcing the dimensions to be the same in Fortran as for C,
5593 to be able to handle C and Fortran routines uniformly in this
5595 dims
[GOMP_DIM_VECTOR
] = fn_level
> GOMP_DIM_VECTOR
? 1 : 0;
5596 dims
[GOMP_DIM_WORKER
] = fn_level
> GOMP_DIM_WORKER
? 1 : 0;
5597 dims
[GOMP_DIM_GANG
] = fn_level
> GOMP_DIM_GANG
? 1 : 0;
5600 if (oacc_min_dims_p
)
5602 gcc_assert (dims
[GOMP_DIM_VECTOR
] == 1);
5603 gcc_assert (dims
[GOMP_DIM_WORKER
] == 1);
5604 gcc_assert (dims
[GOMP_DIM_GANG
] == 1);
5606 dims
[GOMP_DIM_VECTOR
] = PTX_WARP_SIZE
;
5613 dims
[GOMP_DIM_VECTOR
] = PTX_WARP_SIZE
;
5618 if (oacc_default_dims_p
)
5621 0 : set at runtime, f.i. -fopenacc-dims=-
5622 >= 1: set at compile time, f.i. -fopenacc-dims=1. */
5623 gcc_assert (dims
[GOMP_DIM_VECTOR
] >= -1);
5624 gcc_assert (dims
[GOMP_DIM_WORKER
] >= -1);
5625 gcc_assert (dims
[GOMP_DIM_GANG
] >= -1);
5627 /* But -fopenacc-dims=- is not yet supported on trunk. */
5628 gcc_assert (dims
[GOMP_DIM_VECTOR
] != 0);
5629 gcc_assert (dims
[GOMP_DIM_WORKER
] != 0);
5630 gcc_assert (dims
[GOMP_DIM_GANG
] != 0);
5633 if (offload_region_p
)
5636 0 : set using variable, f.i. num_gangs (n)
5637 >= 1: set using constant, f.i. num_gangs (1). */
5638 gcc_assert (dims
[GOMP_DIM_VECTOR
] >= -1);
5639 gcc_assert (dims
[GOMP_DIM_WORKER
] >= -1);
5640 gcc_assert (dims
[GOMP_DIM_GANG
] >= -1);
5643 int old_dims
[GOMP_DIM_MAX
];
5645 for (i
= 0; i
< GOMP_DIM_MAX
; ++i
)
5646 old_dims
[i
] = dims
[i
];
5648 const char *vector_reason
= NULL
;
5649 if (offload_region_p
&& has_vector_partitionable_routine_calls_p (decl
))
5651 if (dims
[GOMP_DIM_VECTOR
] > PTX_WARP_SIZE
)
5653 vector_reason
= G_("using vector_length (%d) due to call to"
5654 " vector-partitionable routine, ignoring %d");
5655 dims
[GOMP_DIM_VECTOR
] = PTX_WARP_SIZE
;
5659 if (dims
[GOMP_DIM_VECTOR
] == 0)
5661 vector_reason
= G_("using vector_length (%d), ignoring runtime setting");
5662 dims
[GOMP_DIM_VECTOR
] = PTX_DEFAULT_VECTOR_LENGTH
;
5665 if (dims
[GOMP_DIM_VECTOR
] > 0
5666 && !nvptx_welformed_vector_length_p (dims
[GOMP_DIM_VECTOR
]))
5667 dims
[GOMP_DIM_VECTOR
] = PTX_DEFAULT_VECTOR_LENGTH
;
5669 nvptx_apply_dim_limits (dims
);
5671 if (dims
[GOMP_DIM_VECTOR
] != old_dims
[GOMP_DIM_VECTOR
])
5672 warning_at (decl
? DECL_SOURCE_LOCATION (decl
) : UNKNOWN_LOCATION
, 0,
5673 vector_reason
!= NULL
5675 : G_("using vector_length (%d), ignoring %d"),
5676 dims
[GOMP_DIM_VECTOR
], old_dims
[GOMP_DIM_VECTOR
]);
5678 if (dims
[GOMP_DIM_WORKER
] != old_dims
[GOMP_DIM_WORKER
])
5679 warning_at (decl
? DECL_SOURCE_LOCATION (decl
) : UNKNOWN_LOCATION
, 0,
5680 G_("using num_workers (%d), ignoring %d"),
5681 dims
[GOMP_DIM_WORKER
], old_dims
[GOMP_DIM_WORKER
]);
5683 if (oacc_default_dims_p
)
5685 dims
[GOMP_DIM_VECTOR
] = PTX_DEFAULT_VECTOR_LENGTH
;
5686 if (dims
[GOMP_DIM_WORKER
] < 0)
5687 dims
[GOMP_DIM_WORKER
] = PTX_DEFAULT_RUNTIME_DIM
;
5688 if (dims
[GOMP_DIM_GANG
] < 0)
5689 dims
[GOMP_DIM_GANG
] = PTX_DEFAULT_RUNTIME_DIM
;
5690 nvptx_apply_dim_limits (dims
);
5694 /* Validate compute dimensions of an OpenACC offload or routine, fill
5695 in non-unity defaults. FN_LEVEL indicates the level at which a
5696 routine might spawn a loop. It is negative for non-routines. If
5697 DECL is null, we are validating the default dimensions. */
5700 nvptx_goacc_validate_dims (tree decl
, int dims
[], int fn_level
)
5702 int old_dims
[GOMP_DIM_MAX
];
5705 for (i
= 0; i
< GOMP_DIM_MAX
; ++i
)
5706 old_dims
[i
] = dims
[i
];
5708 nvptx_goacc_validate_dims_1 (decl
, dims
, fn_level
);
5710 gcc_assert (dims
[GOMP_DIM_VECTOR
] != 0);
5711 if (dims
[GOMP_DIM_WORKER
] > 0 && dims
[GOMP_DIM_VECTOR
] > 0)
5712 gcc_assert (dims
[GOMP_DIM_WORKER
] * dims
[GOMP_DIM_VECTOR
] <= PTX_CTA_SIZE
);
5714 for (i
= 0; i
< GOMP_DIM_MAX
; ++i
)
5715 if (old_dims
[i
] != dims
[i
])
5721 /* Return maximum dimension size, or zero for unbounded. */
5724 nvptx_dim_limit (int axis
)
5728 case GOMP_DIM_VECTOR
:
5729 return PTX_MAX_VECTOR_LENGTH
;
5737 /* Determine whether fork & joins are needed. */
5740 nvptx_goacc_fork_join (gcall
*call
, const int dims
[],
5741 bool ARG_UNUSED (is_fork
))
5743 tree arg
= gimple_call_arg (call
, 2);
5744 unsigned axis
= TREE_INT_CST_LOW (arg
);
5746 /* We only care about worker and vector partitioning. */
5747 if (axis
< GOMP_DIM_WORKER
)
5750 /* If the size is 1, there's no partitioning. */
5751 if (dims
[axis
] == 1)
5757 /* Generate a PTX builtin function call that returns the address in
5758 the worker reduction buffer at OFFSET. TYPE is the type of the
5759 data at that location. */
5762 nvptx_get_shared_red_addr (tree type
, tree offset
, bool vector
)
5764 enum nvptx_builtins addr_dim
= NVPTX_BUILTIN_WORKER_ADDR
;
5766 addr_dim
= NVPTX_BUILTIN_VECTOR_ADDR
;
5767 machine_mode mode
= TYPE_MODE (type
);
5768 tree fndecl
= nvptx_builtin_decl (addr_dim
, true);
5769 tree size
= build_int_cst (unsigned_type_node
, GET_MODE_SIZE (mode
));
5770 tree align
= build_int_cst (unsigned_type_node
,
5771 GET_MODE_ALIGNMENT (mode
) / BITS_PER_UNIT
);
5772 tree call
= build_call_expr (fndecl
, 3, offset
, size
, align
);
5774 return fold_convert (build_pointer_type (type
), call
);
5777 /* Emit a SHFL.DOWN using index SHFL of VAR into DEST_VAR. This function
5778 will cast the variable if necessary. */
5781 nvptx_generate_vector_shuffle (location_t loc
,
5782 tree dest_var
, tree var
, unsigned shift
,
5785 unsigned fn
= NVPTX_BUILTIN_SHUFFLE
;
5786 tree_code code
= NOP_EXPR
;
5787 tree arg_type
= unsigned_type_node
;
5788 tree var_type
= TREE_TYPE (var
);
5789 tree dest_type
= var_type
;
5791 if (TREE_CODE (var_type
) == COMPLEX_TYPE
)
5792 var_type
= TREE_TYPE (var_type
);
5794 if (TREE_CODE (var_type
) == REAL_TYPE
)
5795 code
= VIEW_CONVERT_EXPR
;
5797 if (TYPE_SIZE (var_type
)
5798 == TYPE_SIZE (long_long_unsigned_type_node
))
5800 fn
= NVPTX_BUILTIN_SHUFFLELL
;
5801 arg_type
= long_long_unsigned_type_node
;
5804 tree call
= nvptx_builtin_decl (fn
, true);
5805 tree bits
= build_int_cst (unsigned_type_node
, shift
);
5806 tree kind
= build_int_cst (unsigned_type_node
, SHUFFLE_DOWN
);
5809 if (var_type
!= dest_type
)
5811 /* Do real and imaginary parts separately. */
5812 tree real
= fold_build1 (REALPART_EXPR
, var_type
, var
);
5813 real
= fold_build1 (code
, arg_type
, real
);
5814 real
= build_call_expr_loc (loc
, call
, 3, real
, bits
, kind
);
5815 real
= fold_build1 (code
, var_type
, real
);
5817 tree imag
= fold_build1 (IMAGPART_EXPR
, var_type
, var
);
5818 imag
= fold_build1 (code
, arg_type
, imag
);
5819 imag
= build_call_expr_loc (loc
, call
, 3, imag
, bits
, kind
);
5820 imag
= fold_build1 (code
, var_type
, imag
);
5822 expr
= fold_build2 (COMPLEX_EXPR
, dest_type
, real
, imag
);
5826 expr
= fold_build1 (code
, arg_type
, var
);
5827 expr
= build_call_expr_loc (loc
, call
, 3, expr
, bits
, kind
);
5828 expr
= fold_build1 (code
, dest_type
, expr
);
5831 gimplify_assign (dest_var
, expr
, seq
);
5834 /* Lazily generate the global lock var decl and return its address. */
5837 nvptx_global_lock_addr ()
5839 tree v
= global_lock_var
;
5843 tree name
= get_identifier ("__reduction_lock");
5844 tree type
= build_qualified_type (unsigned_type_node
,
5845 TYPE_QUAL_VOLATILE
);
5846 v
= build_decl (BUILTINS_LOCATION
, VAR_DECL
, name
, type
);
5847 global_lock_var
= v
;
5848 DECL_ARTIFICIAL (v
) = 1;
5849 DECL_EXTERNAL (v
) = 1;
5850 TREE_STATIC (v
) = 1;
5851 TREE_PUBLIC (v
) = 1;
5853 mark_addressable (v
);
5854 mark_decl_referenced (v
);
5857 return build_fold_addr_expr (v
);
5860 /* Insert code to locklessly update *PTR with *PTR OP VAR just before
5861 GSI. We use a lockless scheme for nearly all case, which looks
5863 actual = initval(OP);
5866 write = guess OP myval;
5867 actual = cmp&swap (ptr, guess, write)
5868 } while (actual bit-different-to guess);
5871 This relies on a cmp&swap instruction, which is available for 32-
5872 and 64-bit types. Larger types must use a locking scheme. */
5875 nvptx_lockless_update (location_t loc
, gimple_stmt_iterator
*gsi
,
5876 tree ptr
, tree var
, tree_code op
)
5878 unsigned fn
= NVPTX_BUILTIN_CMP_SWAP
;
5879 tree_code code
= NOP_EXPR
;
5880 tree arg_type
= unsigned_type_node
;
5881 tree var_type
= TREE_TYPE (var
);
5883 if (TREE_CODE (var_type
) == COMPLEX_TYPE
5884 || TREE_CODE (var_type
) == REAL_TYPE
)
5885 code
= VIEW_CONVERT_EXPR
;
5887 if (TYPE_SIZE (var_type
) == TYPE_SIZE (long_long_unsigned_type_node
))
5889 arg_type
= long_long_unsigned_type_node
;
5890 fn
= NVPTX_BUILTIN_CMP_SWAPLL
;
5893 tree swap_fn
= nvptx_builtin_decl (fn
, true);
5895 gimple_seq init_seq
= NULL
;
5896 tree init_var
= make_ssa_name (arg_type
);
5897 tree init_expr
= omp_reduction_init_op (loc
, op
, var_type
);
5898 init_expr
= fold_build1 (code
, arg_type
, init_expr
);
5899 gimplify_assign (init_var
, init_expr
, &init_seq
);
5900 gimple
*init_end
= gimple_seq_last (init_seq
);
5902 gsi_insert_seq_before (gsi
, init_seq
, GSI_SAME_STMT
);
5904 /* Split the block just after the init stmts. */
5905 basic_block pre_bb
= gsi_bb (*gsi
);
5906 edge pre_edge
= split_block (pre_bb
, init_end
);
5907 basic_block loop_bb
= pre_edge
->dest
;
5908 pre_bb
= pre_edge
->src
;
5909 /* Reset the iterator. */
5910 *gsi
= gsi_for_stmt (gsi_stmt (*gsi
));
5912 tree expect_var
= make_ssa_name (arg_type
);
5913 tree actual_var
= make_ssa_name (arg_type
);
5914 tree write_var
= make_ssa_name (arg_type
);
5916 /* Build and insert the reduction calculation. */
5917 gimple_seq red_seq
= NULL
;
5918 tree write_expr
= fold_build1 (code
, var_type
, expect_var
);
5919 write_expr
= fold_build2 (op
, var_type
, write_expr
, var
);
5920 write_expr
= fold_build1 (code
, arg_type
, write_expr
);
5921 gimplify_assign (write_var
, write_expr
, &red_seq
);
5923 gsi_insert_seq_before (gsi
, red_seq
, GSI_SAME_STMT
);
5925 /* Build & insert the cmp&swap sequence. */
5926 gimple_seq latch_seq
= NULL
;
5927 tree swap_expr
= build_call_expr_loc (loc
, swap_fn
, 3,
5928 ptr
, expect_var
, write_var
);
5929 gimplify_assign (actual_var
, swap_expr
, &latch_seq
);
5931 gcond
*cond
= gimple_build_cond (EQ_EXPR
, actual_var
, expect_var
,
5932 NULL_TREE
, NULL_TREE
);
5933 gimple_seq_add_stmt (&latch_seq
, cond
);
5935 gimple
*latch_end
= gimple_seq_last (latch_seq
);
5936 gsi_insert_seq_before (gsi
, latch_seq
, GSI_SAME_STMT
);
5938 /* Split the block just after the latch stmts. */
5939 edge post_edge
= split_block (loop_bb
, latch_end
);
5940 basic_block post_bb
= post_edge
->dest
;
5941 loop_bb
= post_edge
->src
;
5942 *gsi
= gsi_for_stmt (gsi_stmt (*gsi
));
5944 post_edge
->flags
^= EDGE_TRUE_VALUE
| EDGE_FALLTHRU
;
5945 post_edge
->probability
= profile_probability::even ();
5946 edge loop_edge
= make_edge (loop_bb
, loop_bb
, EDGE_FALSE_VALUE
);
5947 loop_edge
->probability
= profile_probability::even ();
5948 set_immediate_dominator (CDI_DOMINATORS
, loop_bb
, pre_bb
);
5949 set_immediate_dominator (CDI_DOMINATORS
, post_bb
, loop_bb
);
5951 gphi
*phi
= create_phi_node (expect_var
, loop_bb
);
5952 add_phi_arg (phi
, init_var
, pre_edge
, loc
);
5953 add_phi_arg (phi
, actual_var
, loop_edge
, loc
);
5955 loop
*loop
= alloc_loop ();
5956 loop
->header
= loop_bb
;
5957 loop
->latch
= loop_bb
;
5958 add_loop (loop
, loop_bb
->loop_father
);
5960 return fold_build1 (code
, var_type
, write_var
);
/* Insert code to lockfully update *PTR with *PTR OP VAR just before
   GSI.  This is necessary for types larger than 64 bits, where there
   is no cmp&swap instruction to implement a lockless scheme.  We use
   a lock variable in global memory.

   while (cmp&swap (&lock_var, 0, 1))
     continue;
   T accum = *ptr;
   accum = accum OP var;
   *ptr = accum;
   cmp&swap (&lock_var, 1, 0);
   return accum;

   A lock in global memory is necessary to force execution engine
   descheduling and avoid resource starvation that can occur if the
   lock is in .shared memory.  */
5981 nvptx_lockfull_update (location_t loc
, gimple_stmt_iterator
*gsi
,
5982 tree ptr
, tree var
, tree_code op
)
5984 tree var_type
= TREE_TYPE (var
);
5985 tree swap_fn
= nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP
, true);
5986 tree uns_unlocked
= build_int_cst (unsigned_type_node
, 0);
5987 tree uns_locked
= build_int_cst (unsigned_type_node
, 1);
5989 /* Split the block just before the gsi. Insert a gimple nop to make
5991 gimple
*nop
= gimple_build_nop ();
5992 gsi_insert_before (gsi
, nop
, GSI_SAME_STMT
);
5993 basic_block entry_bb
= gsi_bb (*gsi
);
5994 edge entry_edge
= split_block (entry_bb
, nop
);
5995 basic_block lock_bb
= entry_edge
->dest
;
5996 /* Reset the iterator. */
5997 *gsi
= gsi_for_stmt (gsi_stmt (*gsi
));
5999 /* Build and insert the locking sequence. */
6000 gimple_seq lock_seq
= NULL
;
6001 tree lock_var
= make_ssa_name (unsigned_type_node
);
6002 tree lock_expr
= nvptx_global_lock_addr ();
6003 lock_expr
= build_call_expr_loc (loc
, swap_fn
, 3, lock_expr
,
6004 uns_unlocked
, uns_locked
);
6005 gimplify_assign (lock_var
, lock_expr
, &lock_seq
);
6006 gcond
*cond
= gimple_build_cond (EQ_EXPR
, lock_var
, uns_unlocked
,
6007 NULL_TREE
, NULL_TREE
);
6008 gimple_seq_add_stmt (&lock_seq
, cond
);
6009 gimple
*lock_end
= gimple_seq_last (lock_seq
);
6010 gsi_insert_seq_before (gsi
, lock_seq
, GSI_SAME_STMT
);
6012 /* Split the block just after the lock sequence. */
6013 edge locked_edge
= split_block (lock_bb
, lock_end
);
6014 basic_block update_bb
= locked_edge
->dest
;
6015 lock_bb
= locked_edge
->src
;
6016 *gsi
= gsi_for_stmt (gsi_stmt (*gsi
));
6018 /* Create the lock loop ... */
6019 locked_edge
->flags
^= EDGE_TRUE_VALUE
| EDGE_FALLTHRU
;
6020 locked_edge
->probability
= profile_probability::even ();
6021 edge loop_edge
= make_edge (lock_bb
, lock_bb
, EDGE_FALSE_VALUE
);
6022 loop_edge
->probability
= profile_probability::even ();
6023 set_immediate_dominator (CDI_DOMINATORS
, lock_bb
, entry_bb
);
6024 set_immediate_dominator (CDI_DOMINATORS
, update_bb
, lock_bb
);
6026 /* ... and the loop structure. */
6027 loop
*lock_loop
= alloc_loop ();
6028 lock_loop
->header
= lock_bb
;
6029 lock_loop
->latch
= lock_bb
;
6030 lock_loop
->nb_iterations_estimate
= 1;
6031 lock_loop
->any_estimate
= true;
6032 add_loop (lock_loop
, entry_bb
->loop_father
);
6034 /* Build and insert the reduction calculation. */
6035 gimple_seq red_seq
= NULL
;
6036 tree acc_in
= make_ssa_name (var_type
);
6037 tree ref_in
= build_simple_mem_ref (ptr
);
6038 TREE_THIS_VOLATILE (ref_in
) = 1;
6039 gimplify_assign (acc_in
, ref_in
, &red_seq
);
6041 tree acc_out
= make_ssa_name (var_type
);
6042 tree update_expr
= fold_build2 (op
, var_type
, ref_in
, var
);
6043 gimplify_assign (acc_out
, update_expr
, &red_seq
);
6045 tree ref_out
= build_simple_mem_ref (ptr
);
6046 TREE_THIS_VOLATILE (ref_out
) = 1;
6047 gimplify_assign (ref_out
, acc_out
, &red_seq
);
6049 gsi_insert_seq_before (gsi
, red_seq
, GSI_SAME_STMT
);
6051 /* Build & insert the unlock sequence. */
6052 gimple_seq unlock_seq
= NULL
;
6053 tree unlock_expr
= nvptx_global_lock_addr ();
6054 unlock_expr
= build_call_expr_loc (loc
, swap_fn
, 3, unlock_expr
,
6055 uns_locked
, uns_unlocked
);
6056 gimplify_and_add (unlock_expr
, &unlock_seq
);
6057 gsi_insert_seq_before (gsi
, unlock_seq
, GSI_SAME_STMT
);
/* Emit a sequence to update a reduction accumulator at *PTR with the
   value held in VAR using operator OP.  Return the updated value.

   TODO: optimize for atomic ops and independent complex ops.  */
static tree
nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
			tree ptr, tree var, tree_code op)
{
  tree type = TREE_TYPE (var);
  tree size = TYPE_SIZE (type);

  if (size == TYPE_SIZE (unsigned_type_node)
      || size == TYPE_SIZE (long_long_unsigned_type_node))
    return nvptx_lockless_update (loc, gsi, ptr, var, op);
  else
    return nvptx_lockfull_update (loc, gsi, ptr, var, op);
}
6081 /* NVPTX implementation of GOACC_REDUCTION_SETUP. */
6084 nvptx_goacc_reduction_setup (gcall
*call
, offload_attrs
*oa
)
6086 gimple_stmt_iterator gsi
= gsi_for_stmt (call
);
6087 tree lhs
= gimple_call_lhs (call
);
6088 tree var
= gimple_call_arg (call
, 2);
6089 int level
= TREE_INT_CST_LOW (gimple_call_arg (call
, 3));
6090 gimple_seq seq
= NULL
;
6092 push_gimplify_context (true);
6094 if (level
!= GOMP_DIM_GANG
)
6096 /* Copy the receiver object. */
6097 tree ref_to_res
= gimple_call_arg (call
, 1);
6099 if (!integer_zerop (ref_to_res
))
6100 var
= build_simple_mem_ref (ref_to_res
);
6103 if (level
== GOMP_DIM_WORKER
6104 || (level
== GOMP_DIM_VECTOR
&& oa
->vector_length
> PTX_WARP_SIZE
))
6106 /* Store incoming value to worker reduction buffer. */
6107 tree offset
= gimple_call_arg (call
, 5);
6108 tree call
= nvptx_get_shared_red_addr (TREE_TYPE (var
), offset
,
6109 level
== GOMP_DIM_VECTOR
);
6110 tree ptr
= make_ssa_name (TREE_TYPE (call
));
6112 gimplify_assign (ptr
, call
, &seq
);
6113 tree ref
= build_simple_mem_ref (ptr
);
6114 TREE_THIS_VOLATILE (ref
) = 1;
6115 gimplify_assign (ref
, var
, &seq
);
6119 gimplify_assign (lhs
, var
, &seq
);
6121 pop_gimplify_context (NULL
);
6122 gsi_replace_with_seq (&gsi
, seq
, true);
6125 /* NVPTX implementation of GOACC_REDUCTION_INIT. */
6128 nvptx_goacc_reduction_init (gcall
*call
, offload_attrs
*oa
)
6130 gimple_stmt_iterator gsi
= gsi_for_stmt (call
);
6131 tree lhs
= gimple_call_lhs (call
);
6132 tree var
= gimple_call_arg (call
, 2);
6133 int level
= TREE_INT_CST_LOW (gimple_call_arg (call
, 3));
6134 enum tree_code rcode
6135 = (enum tree_code
)TREE_INT_CST_LOW (gimple_call_arg (call
, 4));
6136 tree init
= omp_reduction_init_op (gimple_location (call
), rcode
,
6138 gimple_seq seq
= NULL
;
6140 push_gimplify_context (true);
6142 if (level
== GOMP_DIM_VECTOR
&& oa
->vector_length
== PTX_WARP_SIZE
)
6144 /* Initialize vector-non-zeroes to INIT_VAL (OP). */
6145 tree tid
= make_ssa_name (integer_type_node
);
6146 tree dim_vector
= gimple_call_arg (call
, 3);
6147 gimple
*tid_call
= gimple_build_call_internal (IFN_GOACC_DIM_POS
, 1,
6149 gimple
*cond_stmt
= gimple_build_cond (NE_EXPR
, tid
, integer_zero_node
,
6150 NULL_TREE
, NULL_TREE
);
6152 gimple_call_set_lhs (tid_call
, tid
);
6153 gimple_seq_add_stmt (&seq
, tid_call
);
6154 gimple_seq_add_stmt (&seq
, cond_stmt
);
6156 /* Split the block just after the call. */
6157 edge init_edge
= split_block (gsi_bb (gsi
), call
);
6158 basic_block init_bb
= init_edge
->dest
;
6159 basic_block call_bb
= init_edge
->src
;
6161 /* Fixup flags from call_bb to init_bb. */
6162 init_edge
->flags
^= EDGE_FALLTHRU
| EDGE_TRUE_VALUE
;
6163 init_edge
->probability
= profile_probability::even ();
6165 /* Set the initialization stmts. */
6166 gimple_seq init_seq
= NULL
;
6167 tree init_var
= make_ssa_name (TREE_TYPE (var
));
6168 gimplify_assign (init_var
, init
, &init_seq
);
6169 gsi
= gsi_start_bb (init_bb
);
6170 gsi_insert_seq_before (&gsi
, init_seq
, GSI_SAME_STMT
);
6172 /* Split block just after the init stmt. */
6174 edge inited_edge
= split_block (gsi_bb (gsi
), gsi_stmt (gsi
));
6175 basic_block dst_bb
= inited_edge
->dest
;
6177 /* Create false edge from call_bb to dst_bb. */
6178 edge nop_edge
= make_edge (call_bb
, dst_bb
, EDGE_FALSE_VALUE
);
6179 nop_edge
->probability
= profile_probability::even ();
6181 /* Create phi node in dst block. */
6182 gphi
*phi
= create_phi_node (lhs
, dst_bb
);
6183 add_phi_arg (phi
, init_var
, inited_edge
, gimple_location (call
));
6184 add_phi_arg (phi
, var
, nop_edge
, gimple_location (call
));
6186 /* Reset dominator of dst bb. */
6187 set_immediate_dominator (CDI_DOMINATORS
, dst_bb
, call_bb
);
6189 /* Reset the gsi. */
6190 gsi
= gsi_for_stmt (call
);
6194 if (level
== GOMP_DIM_GANG
)
6196 /* If there's no receiver object, propagate the incoming VAR. */
6197 tree ref_to_res
= gimple_call_arg (call
, 1);
6198 if (integer_zerop (ref_to_res
))
6202 gimplify_assign (lhs
, init
, &seq
);
6205 pop_gimplify_context (NULL
);
6206 gsi_replace_with_seq (&gsi
, seq
, true);
6209 /* NVPTX implementation of GOACC_REDUCTION_FINI. */
6212 nvptx_goacc_reduction_fini (gcall
*call
, offload_attrs
*oa
)
6214 gimple_stmt_iterator gsi
= gsi_for_stmt (call
);
6215 tree lhs
= gimple_call_lhs (call
);
6216 tree ref_to_res
= gimple_call_arg (call
, 1);
6217 tree var
= gimple_call_arg (call
, 2);
6218 int level
= TREE_INT_CST_LOW (gimple_call_arg (call
, 3));
6220 = (enum tree_code
)TREE_INT_CST_LOW (gimple_call_arg (call
, 4));
6221 gimple_seq seq
= NULL
;
6222 tree r
= NULL_TREE
;;
6224 push_gimplify_context (true);
6226 if (level
== GOMP_DIM_VECTOR
&& oa
->vector_length
== PTX_WARP_SIZE
)
6228 /* Emit binary shuffle tree. TODO. Emit this as an actual loop,
6229 but that requires a method of emitting a unified jump at the
6231 for (int shfl
= PTX_WARP_SIZE
/ 2; shfl
> 0; shfl
= shfl
>> 1)
6233 tree other_var
= make_ssa_name (TREE_TYPE (var
));
6234 nvptx_generate_vector_shuffle (gimple_location (call
),
6235 other_var
, var
, shfl
, &seq
);
6237 r
= make_ssa_name (TREE_TYPE (var
));
6238 gimplify_assign (r
, fold_build2 (op
, TREE_TYPE (var
),
6239 var
, other_var
), &seq
);
6245 tree accum
= NULL_TREE
;
6247 if (level
== GOMP_DIM_WORKER
|| level
== GOMP_DIM_VECTOR
)
6249 /* Get reduction buffer address. */
6250 tree offset
= gimple_call_arg (call
, 5);
6251 tree call
= nvptx_get_shared_red_addr (TREE_TYPE (var
), offset
,
6252 level
== GOMP_DIM_VECTOR
);
6253 tree ptr
= make_ssa_name (TREE_TYPE (call
));
6255 gimplify_assign (ptr
, call
, &seq
);
6258 else if (integer_zerop (ref_to_res
))
6265 /* UPDATE the accumulator. */
6266 gsi_insert_seq_before (&gsi
, seq
, GSI_SAME_STMT
);
6268 r
= nvptx_reduction_update (gimple_location (call
), &gsi
,
6274 gimplify_assign (lhs
, r
, &seq
);
6275 pop_gimplify_context (NULL
);
6277 gsi_replace_with_seq (&gsi
, seq
, true);
6280 /* NVPTX implementation of GOACC_REDUCTION_TEARDOWN. */
6283 nvptx_goacc_reduction_teardown (gcall
*call
, offload_attrs
*oa
)
6285 gimple_stmt_iterator gsi
= gsi_for_stmt (call
);
6286 tree lhs
= gimple_call_lhs (call
);
6287 tree var
= gimple_call_arg (call
, 2);
6288 int level
= TREE_INT_CST_LOW (gimple_call_arg (call
, 3));
6289 gimple_seq seq
= NULL
;
6291 push_gimplify_context (true);
6292 if (level
== GOMP_DIM_WORKER
6293 || (level
== GOMP_DIM_VECTOR
&& oa
->vector_length
> PTX_WARP_SIZE
))
6295 /* Read the worker reduction buffer. */
6296 tree offset
= gimple_call_arg (call
, 5);
6297 tree call
= nvptx_get_shared_red_addr (TREE_TYPE (var
), offset
,
6298 level
== GOMP_DIM_VECTOR
);
6299 tree ptr
= make_ssa_name (TREE_TYPE (call
));
6301 gimplify_assign (ptr
, call
, &seq
);
6302 var
= build_simple_mem_ref (ptr
);
6303 TREE_THIS_VOLATILE (var
) = 1;
6306 if (level
!= GOMP_DIM_GANG
)
6308 /* Write to the receiver object. */
6309 tree ref_to_res
= gimple_call_arg (call
, 1);
6311 if (!integer_zerop (ref_to_res
))
6312 gimplify_assign (build_simple_mem_ref (ref_to_res
), var
, &seq
);
6316 gimplify_assign (lhs
, var
, &seq
);
6318 pop_gimplify_context (NULL
);
6320 gsi_replace_with_seq (&gsi
, seq
, true);
6323 /* NVPTX reduction expander. */
6326 nvptx_goacc_reduction (gcall
*call
)
6328 unsigned code
= (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call
, 0));
6331 populate_offload_attrs (&oa
);
6335 case IFN_GOACC_REDUCTION_SETUP
:
6336 nvptx_goacc_reduction_setup (call
, &oa
);
6339 case IFN_GOACC_REDUCTION_INIT
:
6340 nvptx_goacc_reduction_init (call
, &oa
);
6343 case IFN_GOACC_REDUCTION_FINI
:
6344 nvptx_goacc_reduction_fini (call
, &oa
);
6347 case IFN_GOACC_REDUCTION_TEARDOWN
:
6348 nvptx_goacc_reduction_teardown (call
, &oa
);
6357 nvptx_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED
,
6358 rtx x ATTRIBUTE_UNUSED
)
6364 nvptx_vector_mode_supported (machine_mode mode
)
6366 return (mode
== V2SImode
6367 || mode
== V2DImode
);
6370 /* Return the preferred mode for vectorizing scalar MODE. */
6373 nvptx_preferred_simd_mode (scalar_mode mode
)
6383 return default_preferred_simd_mode (mode
);
6388 nvptx_data_alignment (const_tree type
, unsigned int basic_align
)
6390 if (TREE_CODE (type
) == INTEGER_TYPE
)
6392 unsigned HOST_WIDE_INT size
= tree_to_uhwi (TYPE_SIZE_UNIT (type
));
6393 if (size
== GET_MODE_SIZE (TImode
))
6394 return GET_MODE_BITSIZE (maybe_split_mode (TImode
));
6400 /* Implement TARGET_MODES_TIEABLE_P. */
6403 nvptx_modes_tieable_p (machine_mode
, machine_mode
)
6408 /* Implement TARGET_HARD_REGNO_NREGS. */
6411 nvptx_hard_regno_nregs (unsigned int, machine_mode
)
6416 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
6419 nvptx_can_change_mode_class (machine_mode
, machine_mode
, reg_class_t
)
6424 static GTY(()) tree nvptx_previous_fndecl
;
6427 nvptx_set_current_function (tree fndecl
)
6429 if (!fndecl
|| fndecl
== nvptx_previous_fndecl
)
6432 nvptx_previous_fndecl
= fndecl
;
6433 vector_red_partition
= 0;
6434 oacc_bcast_partition
= 0;
6437 #undef TARGET_OPTION_OVERRIDE
6438 #define TARGET_OPTION_OVERRIDE nvptx_option_override
6440 #undef TARGET_ATTRIBUTE_TABLE
6441 #define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table
6444 #define TARGET_LRA_P hook_bool_void_false

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p

#undef TARGET_PROMOTE_FUNCTION_MODE
#define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode

#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG nvptx_function_arg
#undef TARGET_FUNCTION_INCOMING_ARG
#define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE nvptx_function_value
#undef TARGET_LIBCALL_VALUE
#define TARGET_LIBCALL_VALUE nvptx_libcall_value
#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
#undef TARGET_GET_DRAP_RTX
#define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
#undef TARGET_SPLIT_COMPLEX_ARG
#define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true
#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
#undef TARGET_CALL_ARGS
#define TARGET_CALL_ARGS nvptx_call_args
#undef TARGET_END_CALL_ARGS
#define TARGET_END_CALL_ARGS nvptx_end_call_args

#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START nvptx_file_start
#undef TARGET_ASM_FILE_END
#define TARGET_ASM_FILE_END nvptx_file_end
#undef TARGET_ASM_GLOBALIZE_LABEL
#define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
#undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
#define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND nvptx_print_operand
#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
#undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
#define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
#undef TARGET_ASM_INTEGER
#define TARGET_ASM_INTEGER nvptx_assemble_integer
#undef TARGET_ASM_DECL_END
#define TARGET_ASM_DECL_END nvptx_assemble_decl_end
#undef TARGET_ASM_DECLARE_CONSTANT_NAME
#define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
#undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
#define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true

#undef TARGET_MACHINE_DEPENDENT_REORG
#define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
#undef TARGET_NO_REGISTER_ALLOCATION
#define TARGET_NO_REGISTER_ALLOCATION true

#undef TARGET_ENCODE_SECTION_INFO
#define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info
#undef TARGET_RECORD_OFFLOAD_SYMBOL
#define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment

#undef TARGET_CANNOT_COPY_INSN_P
#define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p

#undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
#define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p

#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS nvptx_init_builtins
#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL nvptx_builtin_decl

#undef TARGET_SIMT_VF
#define TARGET_SIMT_VF nvptx_simt_vf

#undef TARGET_GOACC_VALIDATE_DIMS
#define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims

#undef TARGET_GOACC_DIM_LIMIT
#define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit

#undef TARGET_GOACC_FORK_JOIN
#define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join

#undef TARGET_GOACC_REDUCTION
#define TARGET_GOACC_REDUCTION nvptx_goacc_reduction

#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM nvptx_cannot_force_const_mem

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P nvptx_vector_mode_supported

#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
  nvptx_preferred_simd_mode

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P nvptx_modes_tieable_p

#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS nvptx_hard_regno_nregs

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS nvptx_can_change_mode_class

#undef TARGET_HAVE_SPECULATION_SAFE_VALUE
#define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed

#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION nvptx_set_current_function

struct gcc_target targetm = TARGET_INITIALIZER;
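
/* Pull in the gengtype-generated garbage-collector tables for the GTY(())
   variables declared in this file.  */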
#include "gt-nvptx.h"