+2016-11-16 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/mkoffload.c (main): Check that either OpenACC or OpenMP
+ is selected. Pass -mgomp to offload compiler in OpenMP case.
+ * config/nvptx/nvptx-protos.h (nvptx_shuffle_kind): Move enum
+ declaration from nvptx.c.
+ (nvptx_gen_shuffle): Declare.
+ (nvptx_output_set_softstack): Declare.
+ * config/nvptx/nvptx.c (nvptx_shuffle_kind): Move to nvptx-protos.h.
+ (need_softstack_decl): New variable.
+ (need_unisimt_decl): New variable.
+ (diagnose_openacc_conflict): New. Use it...
+ (nvptx_option_override): ...here. Handle TARGET_GOMP.
+ (nvptx_encode_section_info): Handle "shared" attribute.
+ (write_as_kernel): Restrict to OpenACC target regions.
+ (init_softstack_frame): New.
+ (nvptx_init_unisimt_predicate): New.
+ (write_omp_entry): New. Use it...
+ (nvptx_declare_function_name): ...here to emit OpenMP target region
+ entrypoints. Handle TARGET_SOFT_STACK. Call
+ nvptx_init_unisimt_predicate.
+ (nvptx_output_set_softstack): New.
+ (nvptx_get_drap_rtx): Return %argp as the DRAP if needed.
+ (nvptx_gen_shuffle): Export.
+ (nvptx_output_call_insn): Handle COND_EXEC patterns. Emit instruction
+ predicate.
+ (nvptx_print_operand): Fix handling of instruction predicates.
+ (nvptx_get_unisimt_master): New helper function.
+ (nvptx_get_unisimt_predicate): Ditto.
+ (nvptx_call_insn_is_syscall_p): Ditto.
+ (nvptx_unisimt_handle_set): Ditto.
+ (nvptx_reorg_uniform_simt): New. Transform code for -muniform-simt.
+ (nvptx_reorg): Call nvptx_reorg_uniform_simt.
+ (nvptx_handle_shared_attribute): New. Use it...
+ (nvptx_attribute_table): ... here (new entry).
+ (nvptx_record_offload_symbol): Handle NULL attributes.
+ (nvptx_file_end): Handle need_softstack_decl and need_unisimt_decl.
+ (nvptx_simt_vf): New.
+ (TARGET_SIMT_VF): Define.
+	* config/nvptx/nvptx.h (TARGET_CPU_CPP_BUILTINS): Define
+	__nvptx_softstack__ when -msoft-stack is active and __nvptx_unisimt__
+	when -muniform-simt is active.
+ (STACK_SIZE_MODE): Define.
+ (FIXED_REGISTERS): Adjust.
+ (SOFTSTACK_SLOT_REGNUM): New.
+ (SOFTSTACK_PREV_REGNUM): New.
+ (REGISTER_NAMES): Adjust.
+ (struct machine_function): New fields.
+ * config/nvptx/nvptx.md (UNSPEC_SET_SOFTSTACK): New.
+ (UNSPEC_VOTE_BALLOT): Ditto.
+ (UNSPEC_LANEID): Ditto.
+ (UNSPECV_NOUNROLL): Ditto.
+ (atomic): New attribute.
+ (predicable): New attribute. Generate predicated forms via
+ define_cond_exec.
+ (br_true): Mark as not predicable.
+ (br_false): Ditto.
+ (br_true_uni): Ditto.
+ (br_false_uni): Ditto.
+ (return): Ditto.
+ (trap_if_true): Ditto.
+ (trap_if_false): Ditto.
+ (nvptx_fork): Ditto.
+ (nvptx_forked): Ditto.
+ (nvptx_joining): Ditto.
+ (nvptx_join): Ditto.
+ (nvptx_barsync): Ditto.
+ (epilogue): Emit stack restore if TARGET_SOFT_STACK.
+ (allocate_stack): Implement for TARGET_SOFT_STACK. Remove unused code.
+ (allocate_stack_<mode>): Remove unused pattern.
+ (set_softstack_insn): New pattern.
+ (restore_stack_block): Handle for TARGET_SOFT_STACK.
+ (nvptx_vote_ballot): New pattern.
+ (omp_simt_lane): Ditto.
+ (omp_simt_last_lane): Ditto.
+ (omp_simt_ordered): Ditto.
+ (omp_simt_vote_any): Ditto.
+ (omp_simt_xchg_bfly): Ditto.
+ (omp_simt_xchg_idx): Ditto.
+ (nvptx_nounroll): Ditto.
+ (atomic_compare_and_swap<mode>_1): Mark with atomic attribute.
+ (atomic_exchange<mode>): Ditto.
+ (atomic_fetch_add<mode>): Ditto.
+ (atomic_fetch_addsf): Ditto.
+ (atomic_fetch_<logic><mode>): Ditto.
+	* config/nvptx/nvptx.opt (msoft-stack): New option.
+ (muniform-simt): Ditto.
+ (mgomp): Ditto.
+ * config/nvptx/t-nvptx (MULTILIB_OPTIONS): New.
+ * doc/extend.texi (Nvidia PTX Variable Attributes): New section.
+ * doc/invoke.texi (msoft-stack): Document.
+	(muniform-simt): Document.
+ (mgomp): Document.
+ * doc/tm.texi: Regenerate.
+	* doc/tm.texi.in (TARGET_SIMT_VF): New hook.
+ * target.def: Define it.
+ * target-insns.def (omp_simt_lane): New.
+ (omp_simt_last_lane): New.
+ (omp_simt_ordered): New.
+ (omp_simt_vote_any): New.
+ (omp_simt_xchg_bfly): New.
+ (omp_simt_xchg_idx): New.
+
2016-11-16 Maciej W. Rozycki <macro@imgtec.com>
* config/mips/mips-protos.h (mips_set_text_contents_type): New
/* Scan the argument vector. */
bool fopenmp = false;
+ bool fopenacc = false;
for (int i = 1; i < argc; i++)
{
#define STR "-foffload-abi="
#undef STR
else if (strcmp (argv[i], "-fopenmp") == 0)
fopenmp = true;
+ else if (strcmp (argv[i], "-fopenacc") == 0)
+ fopenacc = true;
else if (strcmp (argv[i], "-save-temps") == 0)
save_temps = true;
else if (strcmp (argv[i], "-v") == 0)
verbose = true;
}
+ if (!(fopenacc ^ fopenmp))
+ fatal_error (input_location, "either -fopenacc or -fopenmp must be set");
struct obstack argv_obstack;
obstack_init (&argv_obstack);
default:
gcc_unreachable ();
}
+ if (fopenmp)
+ obstack_ptr_grow (&argv_obstack, "-mgomp");
for (int ix = 1; ix != argc; ix++)
{
#ifndef GCC_NVPTX_PROTOS_H
#define GCC_NVPTX_PROTOS_H
+/* The kind of shuffle instruction.  */
+enum nvptx_shuffle_kind
+{
+ SHUFFLE_UP,
+ SHUFFLE_DOWN,
+ SHUFFLE_BFLY,
+ SHUFFLE_IDX,
+ SHUFFLE_MAX
+};
+
extern void nvptx_declare_function_name (FILE *, const char *, const_tree decl);
extern void nvptx_declare_object_name (FILE *file, const char *name,
const_tree decl);
extern void nvptx_expand_oacc_fork (unsigned);
extern void nvptx_expand_oacc_join (unsigned);
extern void nvptx_expand_call (rtx, rtx);
+extern rtx nvptx_gen_shuffle (rtx, rtx, rtx, nvptx_shuffle_kind);
extern rtx nvptx_expand_compare (rtx);
extern const char *nvptx_ptx_type_from_mode (machine_mode, bool);
extern const char *nvptx_output_mov_insn (rtx, rtx);
extern const char *nvptx_output_call_insn (rtx_insn *, rtx, rtx);
extern const char *nvptx_output_return (void);
+extern const char *nvptx_output_set_softstack (unsigned);
#endif
#endif
/* This file should be included last. */
#include "target-def.h"
-/* The kind of shuffe instruction. */
-enum nvptx_shuffle_kind
-{
- SHUFFLE_UP,
- SHUFFLE_DOWN,
- SHUFFLE_BFLY,
- SHUFFLE_IDX,
- SHUFFLE_MAX
-};
-
/* The various PTX memory areas an object might reside in. */
enum nvptx_data_area
{
/* Global lock variable, needed for 128bit worker & gang reductions. */
static GTY(()) tree global_lock_var;
+/* True if any function references __nvptx_stacks. */
+static bool need_softstack_decl;
+
+/* True if any function references __nvptx_uni. */
+static bool need_unisimt_decl;
+
/* Allocate a new, cleared machine_function structure. */
static struct machine_function *
return p;
}
+/* Issue a diagnostic when option OPTNAME is enabled (as indicated by OPTVAL)
+ and -fopenacc is also enabled. */
+
+static void
+diagnose_openacc_conflict (bool optval, const char *optname)
+{
+ if (flag_openacc && optval)
+ error ("option %s is not supported together with -fopenacc", optname);
+}
+
/* Implement TARGET_OPTION_OVERRIDE. */
static void
worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
+
+ diagnose_openacc_conflict (TARGET_GOMP, "-mgomp");
+ diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack");
+ diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt");
+
+ if (TARGET_GOMP)
+ target_flags |= MASK_SOFT_STACK | MASK_UNIFORM_SIMT;
}
/* Return a ptx type for MODE. If PROMOTE, then use .u32 for QImode to
if (TREE_CONSTANT (decl))
area = DATA_AREA_CONST;
else if (TREE_CODE (decl) == VAR_DECL)
- /* TODO: This would be a good place to check for a .shared or
- other section name. */
- area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;
+ {
+ if (lookup_attribute ("shared", DECL_ATTRIBUTES (decl)))
+ {
+ area = DATA_AREA_SHARED;
+ if (DECL_INITIAL (decl))
+ error ("static initialization of variable %q+D in %<.shared%>"
+ " memory is not supported", decl);
+ }
+ else
+ area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;
+ }
SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area);
}
write_as_kernel (tree attrs)
{
return (lookup_attribute ("kernel", attrs) != NULL_TREE
- || lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE);
+ || (lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE
+ && lookup_attribute ("oacc function", attrs) != NULL_TREE));
+ /* For OpenMP target regions, the corresponding kernel entry is emitted from
+ write_omp_entry as a separate function. */
}
/* Emit a linker marker for a function decl or defn. */
POINTER_SIZE, reg_names[regno], reg_names[regno]);
}
+/* Emit soft stack frame setup sequence. */
+
+static void
+init_softstack_frame (FILE *file, unsigned alignment, HOST_WIDE_INT size)
+{
+ /* Maintain 64-bit stack alignment. */
+ unsigned keep_align = BIGGEST_ALIGNMENT / BITS_PER_UNIT;
+ size = ROUND_UP (size, keep_align);
+ int bits = POINTER_SIZE;
+ const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
+ const char *reg_frame = reg_names[FRAME_POINTER_REGNUM];
+ const char *reg_sspslot = reg_names[SOFTSTACK_SLOT_REGNUM];
+ const char *reg_sspprev = reg_names[SOFTSTACK_PREV_REGNUM];
+ fprintf (file, "\t.reg.u%d %s;\n", bits, reg_stack);
+ fprintf (file, "\t.reg.u%d %s;\n", bits, reg_frame);
+ fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspslot);
+ fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspprev);
+ fprintf (file, "\t{\n");
+ fprintf (file, "\t\t.reg.u32 %%fstmp0;\n");
+ fprintf (file, "\t\t.reg.u%d %%fstmp1;\n", bits);
+ fprintf (file, "\t\t.reg.u%d %%fstmp2;\n", bits);
+ fprintf (file, "\t\tmov.u32 %%fstmp0, %%tid.y;\n");
+ fprintf (file, "\t\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n",
+ bits == 64 ? ".wide" : ".lo", bits / 8);
+ fprintf (file, "\t\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits);
+
+ /* Initialize %sspslot = &__nvptx_stacks[tid.y]. */
+ fprintf (file, "\t\tadd.u%d %s, %%fstmp2, %%fstmp1;\n", bits, reg_sspslot);
+
+ /* Initialize %sspprev = __nvptx_stacks[tid.y]. */
+ fprintf (file, "\t\tld.shared.u%d %s, [%s];\n",
+ bits, reg_sspprev, reg_sspslot);
+
+ /* Initialize %frame = %sspprev - size. */
+ fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
+ bits, reg_frame, reg_sspprev, size);
+
+  /* Apply alignment if it is larger than the 64-bit minimum.  */
+ if (alignment > keep_align)
+ fprintf (file, "\t\tand.b%d %s, %s, %d;\n",
+ bits, reg_frame, reg_frame, -alignment);
+
+ size = crtl->outgoing_args_size;
+ gcc_assert (size % keep_align == 0);
+
+ /* Initialize %stack. */
+ fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
+ bits, reg_stack, reg_frame, size);
+
+ /* Usually 'crtl->is_leaf' is computed during register allocator
+ initialization, which is not done on NVPTX. Compute it now. */
+ gcc_assert (!crtl->is_leaf);
+ crtl->is_leaf = leaf_function_p ();
+ if (!crtl->is_leaf)
+ fprintf (file, "\t\tst.shared.u%d [%s], %s;\n",
+ bits, reg_sspslot, reg_stack);
+ fprintf (file, "\t}\n");
+ cfun->machine->has_softstack = true;
+ need_softstack_decl = true;
+}
+
/* Emit code to initialize the REGNO predicate register to indicate
whether we are not lane zero on the NAME axis. */
fprintf (file, "\t}\n");
}
+/* Emit code to initialize predicate and master lane index registers for
+ -muniform-simt code generation variant. */
+
+static void
+nvptx_init_unisimt_predicate (FILE *file)
+{
+ int bits = POINTER_SIZE;
+ int master = REGNO (cfun->machine->unisimt_master);
+ int pred = REGNO (cfun->machine->unisimt_predicate);
+ fprintf (file, "\t{\n");
+ fprintf (file, "\t\t.reg.u32 %%ustmp0;\n");
+ fprintf (file, "\t\t.reg.u%d %%ustmp1;\n", bits);
+ fprintf (file, "\t\t.reg.u%d %%ustmp2;\n", bits);
+ fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n");
+ fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n",
+ bits == 64 ? ".wide" : ".lo");
+ fprintf (file, "\t\tmov.u%d %%ustmp2, __nvptx_uni;\n", bits);
+ fprintf (file, "\t\tadd.u%d %%ustmp2, %%ustmp2, %%ustmp1;\n", bits);
+ fprintf (file, "\t\tld.shared.u32 %%r%d, [%%ustmp2];\n", master);
+ fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.x;\n");
+ /* Compute 'master lane index' as 'tid.x & __nvptx_uni[tid.y]'. */
+ fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master);
+ /* Compute predicate as 'tid.x == master'. */
+ fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master);
+ fprintf (file, "\t}\n");
+ need_unisimt_decl = true;
+}
+
+/* Emit kernel NAME for function ORIG outlined for an OpenMP 'target' region:
+
+ extern void gomp_nvptx_main (void (*fn)(void*), void *fnarg);
+ void __attribute__((kernel)) NAME (void *arg, char *stack, size_t stacksize)
+ {
+ __nvptx_stacks[tid.y] = stack + stacksize * (ctaid.x * ntid.y + tid.y + 1);
+ __nvptx_uni[tid.y] = 0;
+ gomp_nvptx_main (ORIG, arg);
+ }
+ ORIG itself should not be emitted as a PTX .entry function. */
+
+static void
+write_omp_entry (FILE *file, const char *name, const char *orig)
+{
+ static bool gomp_nvptx_main_declared;
+ if (!gomp_nvptx_main_declared)
+ {
+ gomp_nvptx_main_declared = true;
+ write_fn_marker (func_decls, false, true, "gomp_nvptx_main");
+ func_decls << ".extern .func gomp_nvptx_main (.param.u" << POINTER_SIZE
+ << " %in_ar1, .param.u" << POINTER_SIZE << " %in_ar2);\n";
+ }
+#define ENTRY_TEMPLATE(PS, PS_BYTES, MAD_PS_32) "\
+ (.param.u" PS " %arg, .param.u" PS " %stack, .param.u" PS " %sz)\n\
+{\n\
+ .reg.u32 %r<3>;\n\
+ .reg.u" PS " %R<4>;\n\
+ mov.u32 %r0, %tid.y;\n\
+ mov.u32 %r1, %ntid.y;\n\
+ mov.u32 %r2, %ctaid.x;\n\
+ cvt.u" PS ".u32 %R1, %r0;\n\
+ " MAD_PS_32 " %R1, %r1, %r2, %R1;\n\
+ mov.u" PS " %R0, __nvptx_stacks;\n\
+ " MAD_PS_32 " %R0, %r0, " PS_BYTES ", %R0;\n\
+ ld.param.u" PS " %R2, [%stack];\n\
+ ld.param.u" PS " %R3, [%sz];\n\
+ add.u" PS " %R2, %R2, %R3;\n\
+ mad.lo.u" PS " %R2, %R1, %R3, %R2;\n\
+ st.shared.u" PS " [%R0], %R2;\n\
+ mov.u" PS " %R0, __nvptx_uni;\n\
+ " MAD_PS_32 " %R0, %r0, 4, %R0;\n\
+ mov.u32 %r0, 0;\n\
+ st.shared.u32 [%R0], %r0;\n\
+ mov.u" PS " %R0, \0;\n\
+ ld.param.u" PS " %R1, [%arg];\n\
+ {\n\
+ .param.u" PS " %P<2>;\n\
+ st.param.u" PS " [%P0], %R0;\n\
+ st.param.u" PS " [%P1], %R1;\n\
+ call.uni gomp_nvptx_main, (%P0, %P1);\n\
+ }\n\
+ ret.uni;\n\
+}\n"
+ static const char entry64[] = ENTRY_TEMPLATE ("64", "8", "mad.wide.u32");
+ static const char entry32[] = ENTRY_TEMPLATE ("32", "4", "mad.lo.u32 ");
+#undef ENTRY_TEMPLATE
+ const char *entry_1 = TARGET_ABI64 ? entry64 : entry32;
+ /* Position ENTRY_2 after the embedded nul using strlen of the prefix. */
+ const char *entry_2 = entry_1 + strlen (entry64) + 1;
+ fprintf (file, ".visible .entry %s%s%s%s", name, entry_1, orig, entry_2);
+ need_softstack_decl = need_unisimt_decl = true;
+}
+
/* Implement ASM_DECLARE_FUNCTION_NAME. Writes the start of a ptx
function, including local var decls and copies from the arguments to
local regs. */
tree result_type = TREE_TYPE (fntype);
int argno = 0;
+ if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl))
+ && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl)))
+ {
+ char *buf = (char *) alloca (strlen (name) + sizeof ("$impl"));
+ sprintf (buf, "%s$impl", name);
+ write_omp_entry (file, name, buf);
+ name = buf;
+ }
/* We construct the initial part of the function into a string
stream, in order to share the prototype writing code. */
std::stringstream s;
fprintf (file, "%s", s.str().c_str());
- /* Declare a local var for outgoing varargs. */
- if (cfun->machine->has_varadic)
- init_frame (file, STACK_POINTER_REGNUM,
- UNITS_PER_WORD, crtl->outgoing_args_size);
-
- /* Declare a local variable for the frame. Force its size to be
- DImode-compatible. */
HOST_WIDE_INT sz = get_frame_size ();
- if (sz || cfun->machine->has_chain)
- init_frame (file, FRAME_POINTER_REGNUM,
- crtl->stack_alignment_needed / BITS_PER_UNIT,
- (sz + GET_MODE_SIZE (DImode) - 1)
- & ~(HOST_WIDE_INT)(GET_MODE_SIZE (DImode) - 1));
+ bool need_frameptr = sz || cfun->machine->has_chain;
+ int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;
+ if (!TARGET_SOFT_STACK)
+ {
+ /* Declare a local var for outgoing varargs. */
+ if (cfun->machine->has_varadic)
+ init_frame (file, STACK_POINTER_REGNUM,
+ UNITS_PER_WORD, crtl->outgoing_args_size);
+
+ /* Declare a local variable for the frame. Force its size to be
+ DImode-compatible. */
+ if (need_frameptr)
+ init_frame (file, FRAME_POINTER_REGNUM, alignment,
+ ROUND_UP (sz, GET_MODE_SIZE (DImode)));
+ }
+ else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca)
+ init_softstack_frame (file, alignment, sz);
/* Declare the pseudos we have as ptx registers. */
int maxregs = max_reg_num ();
if (cfun->machine->axis_predicate[1])
nvptx_init_axis_predicate (file,
REGNO (cfun->machine->axis_predicate[1]), "x");
+ if (cfun->machine->unisimt_predicate)
+ nvptx_init_unisimt_predicate (file);
}
+/* Output instruction that sets soft stack pointer in shared memory to the
+ value in register given by SRC_REGNO. */
+
+const char *
+nvptx_output_set_softstack (unsigned src_regno)
+{
+ if (cfun->machine->has_softstack && !crtl->is_leaf)
+ {
+ fprintf (asm_out_file, "\tst.shared.u%d\t[%s], ",
+ POINTER_SIZE, reg_names[SOFTSTACK_SLOT_REGNUM]);
+ output_reg (asm_out_file, src_regno, VOIDmode);
+ fprintf (asm_out_file, ";\n");
+ }
+ return "";
+}
/* Output a return instruction. Also copy the return value to its outgoing
location. */
static rtx
nvptx_get_drap_rtx (void)
{
+ if (TARGET_SOFT_STACK && stack_realign_drap)
+ return arg_pointer_rtx;
return NULL_RTX;
}
/* Generate an instruction or sequence to broadcast register REG
across the vectors of a single warp. */
-static rtx
+rtx
nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
{
rtx res;
return "%.\tcvt%t0%t1\t%0, %1;";
}
+static void nvptx_print_operand (FILE *, rtx, int);
+
/* Output INSN, which is a call to CALLEE with result RESULT. For ptx, this
involves writing .param declarations and in/out copies into them. For
indirect calls, also write the .callprototype. */
static int labelno;
bool needs_tgt = register_operand (callee, Pmode);
rtx pat = PATTERN (insn);
+ if (GET_CODE (pat) == COND_EXEC)
+ pat = COND_EXEC_CODE (pat);
int arg_end = XVECLEN (pat, 0);
tree decl = NULL_TREE;
fprintf (asm_out_file, ";\n");
}
+ /* The '.' stands for the call's predicate, if any. */
+ nvptx_print_operand (asm_out_file, NULL_RTX, '.');
fprintf (asm_out_file, "\t\tcall ");
if (result != NULL_RTX)
fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]);
return c == '.' || c== '#';
}
-static void nvptx_print_operand (FILE *, rtx, int);
-
/* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE. */
static void
x = current_insn_predicate;
if (x)
{
- unsigned int regno = REGNO (XEXP (x, 0));
- fputs ("[", file);
+ fputs ("@", file);
if (GET_CODE (x) == EQ)
fputs ("!", file);
- fputs (reg_names [regno], file);
- fputs ("]", file);
+ output_reg (file, REGNO (XEXP (x, 0)), VOIDmode);
}
return;
}
}
}
+/* Return a SImode "master lane index" register for uniform-simt, allocating on
+ first use. */
+
+static rtx
+nvptx_get_unisimt_master ()
+{
+ rtx &master = cfun->machine->unisimt_master;
+ return master ? master : master = gen_reg_rtx (SImode);
+}
+
+/* Return a BImode "predicate" register for uniform-simt, similar to above. */
+
+static rtx
+nvptx_get_unisimt_predicate ()
+{
+ rtx &pred = cfun->machine->unisimt_predicate;
+ return pred ? pred : pred = gen_reg_rtx (BImode);
+}
+
+/* Return true if given call insn references one of the functions provided by
+ the CUDA runtime: malloc, free, vprintf. */
+
+static bool
+nvptx_call_insn_is_syscall_p (rtx_insn *insn)
+{
+ rtx pat = PATTERN (insn);
+ gcc_checking_assert (GET_CODE (pat) == PARALLEL);
+ pat = XVECEXP (pat, 0, 0);
+ if (GET_CODE (pat) == SET)
+ pat = SET_SRC (pat);
+ gcc_checking_assert (GET_CODE (pat) == CALL
+ && GET_CODE (XEXP (pat, 0)) == MEM);
+ rtx addr = XEXP (XEXP (pat, 0), 0);
+ if (GET_CODE (addr) != SYMBOL_REF)
+ return false;
+ const char *name = XSTR (addr, 0);
+  /* Ordinary malloc/free are redirected to __nvptx_{malloc,free}, so only the
+ references with forced assembler name refer to PTX syscalls. For vprintf,
+ accept both normal and forced-assembler-name references. */
+ return (!strcmp (name, "vprintf") || !strcmp (name, "*vprintf")
+ || !strcmp (name, "*malloc")
+ || !strcmp (name, "*free"));
+}
+
+/* If SET subexpression of INSN sets a register, emit a shuffle instruction to
+ propagate its value from lane MASTER to current lane. */
+
+static void
+nvptx_unisimt_handle_set (rtx set, rtx_insn *insn, rtx master)
+{
+ rtx reg;
+ if (GET_CODE (set) == SET && REG_P (reg = SET_DEST (set)))
+ emit_insn_after (nvptx_gen_shuffle (reg, reg, master, SHUFFLE_IDX), insn);
+}
+
+/* Adjust code for uniform-simt code generation variant by making atomics and
+ "syscalls" conditionally executed, and inserting shuffle-based propagation
+ for registers being set. */
+
+static void
+nvptx_reorg_uniform_simt ()
+{
+ rtx_insn *insn, *next;
+
+ for (insn = get_insns (); insn; insn = next)
+ {
+ next = NEXT_INSN (insn);
+ if (!(CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn))
+ && !(NONJUMP_INSN_P (insn)
+ && GET_CODE (PATTERN (insn)) == PARALLEL
+ && get_attr_atomic (insn)))
+ continue;
+ rtx pat = PATTERN (insn);
+ rtx master = nvptx_get_unisimt_master ();
+ for (int i = 0; i < XVECLEN (pat, 0); i++)
+ nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master);
+ rtx pred = nvptx_get_unisimt_predicate ();
+ pred = gen_rtx_NE (BImode, pred, const0_rtx);
+ pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat);
+ validate_change (insn, &PATTERN (insn), pat, false);
+ }
+}
+
/* Loop structure of the function. The entire function is described as
a NULL loop. */
/* Replace subregs. */
nvptx_reorg_subreg ();
+ if (TARGET_UNIFORM_SIMT)
+ nvptx_reorg_uniform_simt ();
+
regstat_free_n_sets_and_refs ();
df_finish_pass (true);
return NULL_TREE;
}
+/* Handle a "shared" attribute; arguments as in
+ struct attribute_spec.handler. */
+
+static tree
+nvptx_handle_shared_attribute (tree *node, tree name, tree ARG_UNUSED (args),
+ int ARG_UNUSED (flags), bool *no_add_attrs)
+{
+ tree decl = *node;
+
+ if (TREE_CODE (decl) != VAR_DECL)
+ {
+ error ("%qE attribute only applies to variables", name);
+ *no_add_attrs = true;
+ }
+ else if (!(TREE_PUBLIC (decl) || TREE_STATIC (decl)))
+ {
+ error ("%qE attribute not allowed with auto storage class", name);
+ *no_add_attrs = true;
+ }
+
+ return NULL_TREE;
+}
+
/* Table of valid machine attributes. */
static const struct attribute_spec nvptx_attribute_table[] =
{
/* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
affects_type_identity } */
{ "kernel", 0, 0, true, false, false, nvptx_handle_kernel_attribute, false },
+ { "shared", 0, 0, true, false, false, nvptx_handle_shared_attribute, false },
{ NULL, 0, 0, false, false, false, NULL, false }
};
\f
case FUNCTION_DECL:
{
tree attr = get_oacc_fn_attrib (decl);
- tree dims = TREE_VALUE (attr);
- unsigned ix;
+ /* OpenMP offloading does not set this attribute. */
+ tree dims = attr ? TREE_VALUE (attr) : NULL_TREE;
fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
- for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
+ for (; dims; dims = TREE_CHAIN (dims))
{
int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
if (worker_red_size)
write_worker_buffer (asm_out_file, worker_red_sym,
worker_red_align, worker_red_size);
+
+ if (need_softstack_decl)
+ {
+ write_var_marker (asm_out_file, false, true, "__nvptx_stacks");
+ /* 32 is the maximum number of warps in a block. Even though it's an
+ external declaration, emit the array size explicitly; otherwise, it
+ may fail at PTX JIT time if the definition is later in link order. */
+ fprintf (asm_out_file, ".extern .shared .u%d __nvptx_stacks[32];\n",
+ POINTER_SIZE);
+ }
+ if (need_unisimt_decl)
+ {
+ write_var_marker (asm_out_file, false, true, "__nvptx_uni");
+ fprintf (asm_out_file, ".extern .shared .u32 __nvptx_uni[32];\n");
+ }
}
/* Expander for the shuffle builtins. */
#define PTX_WORKER_LENGTH 32
#define PTX_GANG_DEFAULT 0 /* Defer to runtime. */
+/* Implement TARGET_SIMT_VF target hook: number of threads in a warp. */
+
+static int
+nvptx_simt_vf ()
+{
+ return PTX_VECTOR_LENGTH;
+}
+
/* Validate compute dimensions of an OpenACC offload or routine, fill
in non-unity defaults. FN_LEVEL indicates the level at which a
routine might spawn a loop. It is negative for non-routines. If
#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL nvptx_builtin_decl
+#undef TARGET_SIMT_VF
+#define TARGET_SIMT_VF nvptx_simt_vf
+
#undef TARGET_GOACC_VALIDATE_DIMS
#define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims
builtin_assert ("machine=nvptx"); \
builtin_assert ("cpu=nvptx"); \
builtin_define ("__nvptx__"); \
+ if (TARGET_SOFT_STACK) \
+ builtin_define ("__nvptx_softstack__"); \
+ if (TARGET_UNIFORM_SIMT) \
+ builtin_define ("__nvptx_unisimt__"); \
} while (0)
/* Avoid the default in ../../gcc.c, which adds "-pthread", which is not
#define POINTER_SIZE (TARGET_ABI64 ? 64 : 32)
#define Pmode (TARGET_ABI64 ? DImode : SImode)
+#define STACK_SIZE_MODE Pmode
/* Registers. Since ptx is a virtual target, we just define a few
hard registers for special purposes and leave pseudos unallocated.
We have to have some available hard registers, to keep gcc setup
happy. */
#define FIRST_PSEUDO_REGISTER 16
-#define FIXED_REGISTERS { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
+#define FIXED_REGISTERS { 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
#define CALL_USED_REGISTERS { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }
#define HARD_REGNO_NREGS(REG, MODE) \
#define FRAME_POINTER_REGNUM 2
#define ARG_POINTER_REGNUM 3
#define STATIC_CHAIN_REGNUM 4
+/* This register points to the shared memory location with the current warp's
+ soft stack pointer (__nvptx_stacks[tid.y]). */
+#define SOFTSTACK_SLOT_REGNUM 5
+/* This register is used to save the previous value of the soft stack pointer
+ in the prologue and restore it when returning. */
+#define SOFTSTACK_PREV_REGNUM 6
#define REGISTER_NAMES \
{ \
- "%value", "%stack", "%frame", "%args", "%chain", "%hr5", "%hr6", "%hr7", \
+ "%value", "%stack", "%frame", "%args", \
+ "%chain", "%sspslot", "%sspprev", "%hr7", \
"%hr8", "%hr9", "%hr10", "%hr11", "%hr12", "%hr13", "%hr14", "%hr15" \
}
bool is_varadic; /* This call is varadic */
bool has_varadic; /* Current function has a varadic call. */
bool has_chain; /* Current function has outgoing static chain. */
+ bool has_softstack; /* Current function has a soft stack frame. */
int num_args; /* Number of args of current call. */
int return_mode; /* Return mode of current fn.
(machine_mode not defined yet.) */
rtx axis_predicate[2]; /* Neutering predicates. */
+ rtx unisimt_master; /* 'Master lane index' for -muniform-simt. */
+ rtx unisimt_predicate; /* Predicate for -muniform-simt. */
};
#endif
\f
UNSPEC_ALLOCA
+ UNSPEC_SET_SOFTSTACK
+
UNSPEC_DIM_SIZE
UNSPEC_BIT_CONV
+ UNSPEC_VOTE_BALLOT
+
+ UNSPEC_LANEID
+
UNSPEC_SHUFFLE
UNSPEC_BR_UNIFIED
])
UNSPECV_FORKED
UNSPECV_JOINING
UNSPECV_JOIN
+
+ UNSPECV_NOUNROLL
])
(define_attr "subregs_ok" "false,true"
(const_string "false"))
+(define_attr "atomic" "false,true"
+ (const_string "false"))
+
;; The nvptx operand predicates, in general, don't permit subregs and
;; only literal constants, which differ from the generic ones, which
;; permit subregs and symbolc constants (as appropriate)
return true;
})
+(define_attr "predicable" "false,true"
+ (const_string "true"))
+
+(define_cond_exec
+ [(match_operator 0 "predicate_operator"
+ [(match_operand:BI 1 "nvptx_register_operand" "")
+ (match_operand:BI 2 "const0_operand" "")])]
+ ""
+ ""
+ )
+
(define_constraint "P0"
"An integer with the value 0."
(and (match_code "const_int")
(label_ref (match_operand 1 "" ""))
(pc)))]
""
- "%j0\\tbra\\t%l1;")
+ "%j0\\tbra\\t%l1;"
+ [(set_attr "predicable" "false")])
(define_insn "br_false"
[(set (pc)
(label_ref (match_operand 1 "" ""))
(pc)))]
""
- "%J0\\tbra\\t%l1;")
+ "%J0\\tbra\\t%l1;"
+ [(set_attr "predicable" "false")])
;; unified conditional branch
(define_insn "br_true_uni"
UNSPEC_BR_UNIFIED) (const_int 0))
(label_ref (match_operand 1 "" "")) (pc)))]
""
- "%j0\\tbra.uni\\t%l1;")
+ "%j0\\tbra.uni\\t%l1;"
+ [(set_attr "predicable" "false")])
(define_insn "br_false_uni"
[(set (pc) (if_then_else
UNSPEC_BR_UNIFIED) (const_int 0))
(label_ref (match_operand 1 "" "")) (pc)))]
""
- "%J0\\tbra.uni\\t%l1;")
+ "%J0\\tbra.uni\\t%l1;"
+ [(set_attr "predicable" "false")])
(define_expand "cbranch<mode>4"
[(set (pc)
""
{
return nvptx_output_return ();
-})
+}
+ [(set_attr "predicable" "false")])
(define_expand "epilogue"
[(clobber (const_int 0))]
""
{
+ if (TARGET_SOFT_STACK)
+ emit_insn (gen_set_softstack_insn (gen_rtx_REG (Pmode,
+ SOFTSTACK_PREV_REGNUM)));
emit_jump_insn (gen_return ());
DONE;
})
(match_operand 1 "nvptx_register_operand")]
""
{
+ if (TARGET_SOFT_STACK)
+ {
+ emit_move_insn (stack_pointer_rtx,
+ gen_rtx_MINUS (Pmode, stack_pointer_rtx, operands[1]));
+ emit_insn (gen_set_softstack_insn (stack_pointer_rtx));
+ emit_move_insn (operands[0], virtual_stack_dynamic_rtx);
+ DONE;
+ }
/* The ptx documentation specifies an alloca intrinsic (for 32 bit
only) but notes it is not implemented. The assembler emits a
confused error message. Issue a blunt one now instead. */
sorry ("target cannot support alloca.");
emit_insn (gen_nop ());
DONE;
- if (TARGET_ABI64)
- emit_insn (gen_allocate_stack_di (operands[0], operands[1]));
- else
- emit_insn (gen_allocate_stack_si (operands[0], operands[1]));
- DONE;
})
-(define_insn "allocate_stack_<mode>"
- [(set (match_operand:P 0 "nvptx_register_operand" "=R")
- (unspec:P [(match_operand:P 1 "nvptx_register_operand" "R")]
- UNSPEC_ALLOCA))]
- ""
- "%.\\tcall (%0), %%alloca, (%1);")
+(define_insn "set_softstack_insn"
+ [(unspec [(match_operand 0 "nvptx_register_operand" "R")]
+ UNSPEC_SET_SOFTSTACK)]
+ "TARGET_SOFT_STACK"
+{
+ return nvptx_output_set_softstack (REGNO (operands[0]));
+})
(define_expand "restore_stack_block"
[(match_operand 0 "register_operand" "")
(match_operand 1 "register_operand" "")]
""
{
+ if (TARGET_SOFT_STACK)
+ {
+ emit_move_insn (operands[0], operands[1]);
+ emit_insn (gen_set_softstack_insn (operands[0]));
+ }
DONE;
})
(const_int 0))
(const_int 0))]
""
- "%j0 trap;")
+ "%j0 trap;"
+ [(set_attr "predicable" "false")])
(define_insn "trap_if_false"
[(trap_if (eq (match_operand:BI 0 "nvptx_register_operand" "R")
(const_int 0))
(const_int 0))]
""
- "%J0 trap;")
+ "%J0 trap;"
+ [(set_attr "predicable" "false")])
(define_expand "ctrap<mode>4"
[(trap_if (match_operator 0 "nvptx_comparison_operator"
UNSPECV_FORK)]
""
"// fork %0;"
-)
+ [(set_attr "predicable" "false")])
(define_insn "nvptx_forked"
[(unspec_volatile:SI [(match_operand:SI 0 "const_int_operand" "")]
UNSPECV_FORKED)]
""
"// forked %0;"
-)
+ [(set_attr "predicable" "false")])
(define_insn "nvptx_joining"
[(unspec_volatile:SI [(match_operand:SI 0 "const_int_operand" "")]
UNSPECV_JOINING)]
""
"// joining %0;"
-)
+ [(set_attr "predicable" "false")])
(define_insn "nvptx_join"
[(unspec_volatile:SI [(match_operand:SI 0 "const_int_operand" "")]
UNSPECV_JOIN)]
""
"// join %0;"
-)
+ [(set_attr "predicable" "false")])
(define_expand "oacc_fork"
[(set (match_operand:SI 0 "nvptx_nonmemory_operand" "")
""
"%.\\tshfl%S3.b32\\t%0, %1, %2, 31;")
+(define_insn "nvptx_vote_ballot"
+ [(set (match_operand:SI 0 "nvptx_register_operand" "=R")
+ (unspec:SI [(match_operand:BI 1 "nvptx_register_operand" "R")]
+ UNSPEC_VOTE_BALLOT))]
+ ""
+ "%.\\tvote.ballot.b32\\t%0, %1;")
+
+;; Patterns for OpenMP SIMD-via-SIMT lowering
+
+;; Implement IFN_GOMP_SIMT_LANE: set operand 0 to lane index
+(define_insn "omp_simt_lane"
+ [(set (match_operand:SI 0 "nvptx_register_operand" "")
+ (unspec:SI [(const_int 0)] UNSPEC_LANEID))]
+ ""
+ "%.\\tmov.u32\\t%0, %%laneid;")
+
+;; Implement IFN_GOMP_SIMT_ORDERED: copy operand 1 to operand 0 and
+;; place a compiler barrier to disallow unrolling/peeling the containing loop
+(define_expand "omp_simt_ordered"
+ [(match_operand:SI 0 "nvptx_register_operand" "=R")
+ (match_operand:SI 1 "nvptx_register_operand" "R")]
+ ""
+{
+ emit_move_insn (operands[0], operands[1]);
+ emit_insn (gen_nvptx_nounroll ());
+ DONE;
+})
+
+;; Implement IFN_GOMP_SIMT_XCHG_BFLY: perform a "butterfly" exchange
+;; across lanes
+(define_expand "omp_simt_xchg_bfly"
+ [(match_operand 0 "nvptx_register_operand" "=R")
+ (match_operand 1 "nvptx_register_operand" "R")
+ (match_operand:SI 2 "nvptx_nonmemory_operand" "Ri")]
+ ""
+{
+ emit_insn (nvptx_gen_shuffle (operands[0], operands[1], operands[2],
+ SHUFFLE_BFLY));
+ DONE;
+})
+
+;; Implement IFN_GOMP_SIMT_XCHG_IDX: broadcast value in operand 1
+;; from lane given by index in operand 2 to operand 0 in all lanes
+(define_expand "omp_simt_xchg_idx"
+ [(match_operand 0 "nvptx_register_operand" "=R")
+ (match_operand 1 "nvptx_register_operand" "R")
+ (match_operand:SI 2 "nvptx_nonmemory_operand" "Ri")]
+ ""
+{
+ emit_insn (nvptx_gen_shuffle (operands[0], operands[1], operands[2],
+ SHUFFLE_IDX));
+ DONE;
+})
+
+;; Implement IFN_GOMP_SIMT_VOTE_ANY:
+;; set operand 0 to zero iff all lanes supply zero in operand 1
+(define_expand "omp_simt_vote_any"
+ [(match_operand:SI 0 "nvptx_register_operand" "=R")
+ (match_operand:SI 1 "nvptx_register_operand" "R")]
+ ""
+{
+ rtx pred = gen_reg_rtx (BImode);
+ emit_move_insn (pred, gen_rtx_NE (BImode, operands[1], const0_rtx));
+ emit_insn (gen_nvptx_vote_ballot (operands[0], pred));
+ DONE;
+})
+
+;; Implement IFN_GOMP_SIMT_LAST_LANE:
+;; set operand 0 to the lowest lane index that passed non-zero in operand 1
+(define_expand "omp_simt_last_lane"
+ [(match_operand:SI 0 "nvptx_register_operand" "=R")
+ (match_operand:SI 1 "nvptx_register_operand" "R")]
+ ""
+{
+ rtx pred = gen_reg_rtx (BImode);
+ rtx tmp = gen_reg_rtx (SImode);
+ emit_move_insn (pred, gen_rtx_NE (BImode, operands[1], const0_rtx));
+ emit_insn (gen_nvptx_vote_ballot (tmp, pred));
+ emit_insn (gen_ctzsi2 (operands[0], tmp));
+ DONE;
+})
+
;; extract parts of a 64 bit object into 2 32-bit ints
(define_insn "unpack<mode>si2"
[(set (match_operand:SI 0 "nvptx_register_operand" "=R")
(set (match_dup 1)
(unspec_volatile:SDIM [(const_int 0)] UNSPECV_CAS))]
""
- "%.\\tatom%A1.cas.b%T0\\t%0, %1, %2, %3;")
+ "%.\\tatom%A1.cas.b%T0\\t%0, %1, %2, %3;"
+ [(set_attr "atomic" "true")])
(define_insn "atomic_exchange<mode>"
[(set (match_operand:SDIM 0 "nvptx_register_operand" "=R") ;; output
(set (match_dup 1)
(match_operand:SDIM 2 "nvptx_nonmemory_operand" "Ri"))] ;; input
""
- "%.\\tatom%A1.exch.b%T0\\t%0, %1, %2;")
+ "%.\\tatom%A1.exch.b%T0\\t%0, %1, %2;"
+ [(set_attr "atomic" "true")])
(define_insn "atomic_fetch_add<mode>"
[(set (match_operand:SDIM 1 "memory_operand" "+m")
(set (match_operand:SDIM 0 "nvptx_register_operand" "=R")
(match_dup 1))]
""
- "%.\\tatom%A1.add%t0\\t%0, %1, %2;")
+ "%.\\tatom%A1.add%t0\\t%0, %1, %2;"
+ [(set_attr "atomic" "true")])
(define_insn "atomic_fetch_addsf"
[(set (match_operand:SF 1 "memory_operand" "+m")
(set (match_operand:SF 0 "nvptx_register_operand" "=R")
(match_dup 1))]
""
- "%.\\tatom%A1.add%t0\\t%0, %1, %2;")
+ "%.\\tatom%A1.add%t0\\t%0, %1, %2;"
+ [(set_attr "atomic" "true")])
(define_code_iterator any_logic [and ior xor])
(define_code_attr logic [(and "and") (ior "or") (xor "xor")])
(set (match_operand:SDIM 0 "nvptx_register_operand" "=R")
(match_dup 1))]
"0"
- "%.\\tatom%A1.b%T0.<logic>\\t%0, %1, %2;")
+ "%.\\tatom%A1.b%T0.<logic>\\t%0, %1, %2;"
+ [(set_attr "atomic" "true")])
(define_insn "nvptx_barsync"
[(unspec_volatile [(match_operand:SI 0 "const_int_operand" "")]
UNSPECV_BARSYNC)]
""
- "\\tbar.sync\\t%0;")
+ "\\tbar.sync\\t%0;"
+ [(set_attr "predicable" "false")])
+
+(define_insn "nvptx_nounroll"
+ [(unspec_volatile [(const_int 0)] UNSPECV_NOUNROLL)]
+ ""
+ "\\t.pragma \\\"nounroll\\\";"
+ [(set_attr "predicable" "false")])
moptimize
Target Report Var(nvptx_optimize) Init(-1)
Optimize partition neutering.
+
+msoft-stack
+Target Report Mask(SOFT_STACK)
+Use custom stacks instead of local memory for automatic storage.
+
+muniform-simt
+Target Report Mask(UNIFORM_SIMT)
+Generate code that can keep local state uniform across all lanes.
+
+mgomp
+Target Report Mask(GOMP)
+Generate code for OpenMP offloading: enables -msoft-stack and -muniform-simt.
mkoffload$(exeext): mkoffload.o collect-utils.o libcommon-target.a $(LIBIBERTY) $(LIBDEPS)
+$(LINKER) $(ALL_LINKERFLAGS) $(LDFLAGS) -o $@ \
mkoffload.o collect-utils.o libcommon-target.a $(LIBIBERTY) $(LIBS)
+
+MULTILIB_OPTIONS = mgomp
* MeP Variable Attributes::
* Microsoft Windows Variable Attributes::
* MSP430 Variable Attributes::
+* Nvidia PTX Variable Attributes::
* PowerPC Variable Attributes::
* RL78 Variable Attributes::
* SPU Variable Attributes::
These attributes can be applied to both functions and variables.
@end table
+@node Nvidia PTX Variable Attributes
+@subsection Nvidia PTX Variable Attributes
+
+These variable attributes are supported by the Nvidia PTX back end:
+
+@table @code
+@item shared
+@cindex @code{shared} attribute, Nvidia PTX
+Use this attribute to place a variable in the @code{.shared} memory space.
+This memory space is private to each cooperative thread array; only threads
+within one thread block refer to the same instance of the variable.
+The runtime does not initialize variables in this memory space.  See the
+example below.
+@end table
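+
+A minimal usage sketch (the variable name is illustrative):
+
+@smallexample
+/* One copy per thread block; the runtime leaves it uninitialized.  */
+int block_scratch __attribute__((shared));
+@end smallexample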
+
@node PowerPC Variable Attributes
@subsection PowerPC Variable Attributes
Apply partitioned execution optimizations. This is the default when any
level of optimization is selected.
+@item -msoft-stack
+@opindex msoft-stack
+Generate code that does not use @code{.local} memory
+directly for stack storage. Instead, a per-warp stack pointer is
+maintained explicitly. This enables variable-length stack allocation (with
+variable-length arrays or @code{alloca}) and, when global memory is used for
+the underlying storage, makes it possible to access automatic variables from
+other threads or with atomic instructions; see the example after this table.
+This code generation variant is used for OpenMP offloading, but the option is
+exposed on its own for the purpose of testing the compiler; to generate code
+suitable for linking into programs using OpenMP offloading, use option
+@option{-mgomp}.
+
+@item -muniform-simt
+@opindex muniform-simt
+Switch to a code generation variant that allows all threads in each warp to
+execute, while maintaining memory state and side effects as if only one
+thread in each warp were active outside of OpenMP SIMD regions. All atomic
+operations and calls to the runtime (malloc, free, vprintf) are executed
+conditionally, only when the current lane index equals the master lane index,
+and the register being assigned is copied from the master lane via a shuffle
+instruction. Outside of SIMD regions lane 0 is the master; inside, each
+thread sees itself as the master. The shared memory array
+@code{int __nvptx_uni[]} stores an all-zeros or all-ones bitmask for each
+warp, indicating the current mode (0 outside of SIMD regions). Each thread
+computes the master lane index by bitwise-anding the bitmask at position
+@code{tid.y} with its own lane index.
+
+@item -mgomp
+@opindex mgomp
+Generate code for use in OpenMP offloading: enables the @option{-msoft-stack}
+and @option{-muniform-simt} options, and selects the corresponding multilib
+variant.
+
@end table
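+
+As a sketch of what @option{-msoft-stack} makes possible (identifiers here
+are illustrative), the following function uses a variable-length array and
+applies an atomic operation to automatic storage:
+
+@smallexample
+int
+g (int n)
+@{
+  int v[n];                         /* variable-length stack allocation */
+  v[0] = 0;
+  __sync_lock_test_and_set (v, 1);  /* atomic access to automatic storage */
+  return v[0];
+@}
+@end smallexample
+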
@node PDP-11 Options
to use it.
@end deftypefn
+@deftypefn {Target Hook} int TARGET_SIMT_VF (void)
+Return the number of threads in a SIMT thread group on the target.
+@end deftypefn
+
@deftypefn {Target Hook} bool TARGET_GOACC_VALIDATE_DIMS (tree @var{decl}, int *@var{dims}, int @var{fn_level})
This hook should check the launch dimensions provided for an OpenACC
compute region, or routine. Defaulted values are represented as -1
@hook TARGET_SIMD_CLONE_USABLE
+@hook TARGET_SIMT_VF
+
@hook TARGET_GOACC_VALIDATE_DIMS
@hook TARGET_GOACC_DIM_LIMIT
DEF_TARGET_INSN (oacc_dim_size, (rtx x0, rtx x1))
DEF_TARGET_INSN (oacc_fork, (rtx x0, rtx x1, rtx x2))
DEF_TARGET_INSN (oacc_join, (rtx x0, rtx x1, rtx x2))
+DEF_TARGET_INSN (omp_simt_lane, (rtx x0))
+DEF_TARGET_INSN (omp_simt_last_lane, (rtx x0, rtx x1))
+DEF_TARGET_INSN (omp_simt_ordered, (rtx x0, rtx x1))
+DEF_TARGET_INSN (omp_simt_vote_any, (rtx x0, rtx x1))
+DEF_TARGET_INSN (omp_simt_xchg_bfly, (rtx x0, rtx x1, rtx x2))
+DEF_TARGET_INSN (omp_simt_xchg_idx, (rtx x0, rtx x1, rtx x2))
DEF_TARGET_INSN (prefetch, (rtx x0, rtx x1, rtx x2))
DEF_TARGET_INSN (probe_stack, (rtx x0))
DEF_TARGET_INSN (probe_stack_address, (rtx x0))
HOOK_VECTOR_END (simd_clone)
+/* Functions relating to OpenMP SIMT vectorization transform. */
+#undef HOOK_PREFIX
+#define HOOK_PREFIX "TARGET_SIMT_"
+HOOK_VECTOR (TARGET_SIMT, simt)
+
+DEFHOOK
+(vf,
+"Return number of threads in SIMT thread group on the target.",
+int, (void), NULL)
+
+HOOK_VECTOR_END (simt)
+
/* Functions relating to openacc. */
#undef HOOK_PREFIX
#define HOOK_PREFIX "TARGET_GOACC_"
+2016-11-16 Alexander Monakov <amonakov@ispras.ru>
+
+ * lib/target-supports.exp (check_effective_target_alloca): Use a
+ compile test.
+ * gcc.target/nvptx/softstack.c: New test.
+ * gcc.target/nvptx/decl-shared.c: New test.
+ * gcc.target/nvptx/decl-shared-init.c: New test.
+
2016-11-16 Maciej W. Rozycki <macro@imgtec.com>
* gcc.target/mips/data-sym-jump.c: New test case.
--- /dev/null
+int var __attribute__((shared)) = 0; /* { dg-error "static initialization .* not supported" } */
--- /dev/null
+static int v_internal __attribute__((shared,used));
+int v_common __attribute__((shared));
+int v_extdef __attribute__((shared,nocommon));
+extern int v_extdecl __attribute__((shared));
+
+int use()
+{
+ return v_extdecl;
+}
+
+/* { dg-final { scan-assembler "\[\r\n\]\[\t \]*.shared \[^,\r\n\]*v_internal" } } */
+/* { dg-final { scan-assembler "\[\r\n\]\[\t \]*.weak .shared \[^,\r\n\]*v_common" } } */
+/* { dg-final { scan-assembler "\[\r\n\]\[\t \]*.visible .shared \[^,\r\n\]*v_extdef" } } */
+/* { dg-final { scan-assembler "\[\r\n\]\[\t \]*.extern .shared \[^,\r\n\]*v_extdecl" } } */
--- /dev/null
+/* { dg-options "-O2 -msoft-stack" } */
+/* { dg-do run } */
+
+static __attribute__((noinline,noclone)) int f(int *p)
+{
+ return __sync_lock_test_and_set(p, 1);
+}
+
+static __attribute__((noinline,noclone)) int g(int n)
+{
+ /* Check that variable-length stack allocation works. */
+ int v[n];
+ v[0] = 0;
+ /* Check that atomic operations can be applied to auto data. */
+ return f(v) == 0 && v[0] == 1;
+}
+
+int main()
+{
+ if (!g(1))
+ __builtin_abort();
+ return 0;
+}
proc check_effective_target_alloca {} {
if { [istarget nvptx-*-*] } {
- return 0
+ return [check_no_compiler_messages alloca assembly {
+ void f (void*);
+ void g (int n) { f (__builtin_alloca (n)); }
+ }]
}
return 1
}
+2016-11-16 Alexander Monakov <amonakov@ispras.ru>
+
+	* config/nvptx/crt0.c (__main): Set up __nvptx_stacks and __nvptx_uni.
+	* config/nvptx/mgomp.c: New file.
+	* config/nvptx/t-nvptx: Add mgomp.c.
+
2016-11-16 Waldemar Brodkorb <wbx@openadk.org>
PR libgcc/68468
extern void __attribute__((noreturn)) exit (int status);
extern int main (int, void **);
+/* Always set up soft stacks to allow testing with -msoft-stack but without
+ -mgomp. 32 is the maximum number of warps in a CTA: the definition here
+ must match the external declaration emitted by the compiler. */
+void *__nvptx_stacks[32] __attribute__((shared,nocommon));
+
+/* Likewise for -muniform-simt. */
+unsigned __nvptx_uni[32] __attribute__((shared,nocommon));
+
void __attribute__((kernel))
__main (int *rval_ptr, int argc, void **argv)
{
if (rval_ptr)
*rval_ptr = 255;
+ static char stack[131072] __attribute__((aligned(8)));
+ __nvptx_stacks[0] = stack + sizeof stack;
+ __nvptx_uni[0] = 0;
+
exit (main (argc, argv));
}
--- /dev/null
+/* Define shared memory arrays for -msoft-stack and -muniform-simt.
+
+ Copyright (C) 2015-2016 Free Software Foundation, Inc.
+
+ This file is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by the
+ Free Software Foundation; either version 3, or (at your option) any
+ later version.
+
+ This file is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* OpenACC offloading does not use these symbols; thus, they are exposed
+ only for the -mgomp multilib. The same definitions are also provided
+ in crt0.c for the case of non-offloading compilation. 32 is the maximum
+ number of warps in a CTA. */
+
+#if defined(__nvptx_softstack__) && defined(__nvptx_unisimt__)
+void *__nvptx_stacks[32] __attribute__((shared,nocommon));
+unsigned __nvptx_uni[32] __attribute__((shared,nocommon));
+#endif
-LIB2ADD=$(srcdir)/config/nvptx/reduction.c
+LIB2ADD=$(srcdir)/config/nvptx/reduction.c \
+ $(srcdir)/config/nvptx/mgomp.c
LIB2ADDEH=
LIB2FUNCS_EXCLUDE=__main