From: Alexander Monakov Date: Tue, 28 Mar 2017 17:24:57 +0000 (+0300) Subject: OpenMP/PTX privatization in SIMD regions X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=0c6b03b5158f53a3c7042cf8625aa5e6bc74f52b;p=gcc.git OpenMP/PTX privatization in SIMD regions * config/nvptx/nvptx-protos.h (nvptx_output_simt_enter): Declare. (nvptx_output_simt_exit): Declare. * config/nvptx/nvptx.c (nvptx_init_unisimt_predicate): Use cfun->machine->unisimt_location. Handle NULL unisimt_predicate. (init_softstack_frame): Move initialization of crtl->is_leaf to... (nvptx_declare_function_name): ...here. Emit declaration of local memory space buffer for omp_simt_enter insn. (nvptx_output_unisimt_switch): New. (nvptx_output_softstack_switch): New. (nvptx_output_simt_enter): New. (nvptx_output_simt_exit): New. * config/nvptx/nvptx.h (struct machine_function): New fields has_simtreg, unisimt_location, simt_stack_size, simt_stack_align. * config/nvptx/nvptx.md (UNSPECV_SIMT_ENTER): New unspec. (UNSPECV_SIMT_EXIT): Ditto. (omp_simt_enter_insn): New insn. (omp_simt_enter): New expansion. (omp_simt_exit): New insn. * config/nvptx/nvptx.opt (msoft-stack-reserve-local): New option. * internal-fn.c (expand_GOMP_SIMT_ENTER): New. (expand_GOMP_SIMT_ENTER_ALLOC): New. (expand_GOMP_SIMT_EXIT): New. * internal-fn.def (GOMP_SIMT_ENTER): New internal function. (GOMP_SIMT_ENTER_ALLOC): Ditto. (GOMP_SIMT_EXIT): Ditto. * target-insns.def (omp_simt_enter): New insn. (omp_simt_exit): Ditto. * omp-low.c (struct omplow_simd_context): New fields simt_eargs, simt_dlist. (lower_rec_simd_input_clauses): Implement SIMT privatization. (lower_rec_input_clauses): Likewise. (lower_lastprivate_clauses): Handle SIMT privatization. * omp-offload.c: Include langhooks.h, tree-nested.h, stor-layout.h. (ompdevlow_adjust_simt_enter): New. (find_simtpriv_var_op): New. (execute_omp_device_lower): Handle IFN_GOMP_SIMT_ENTER, IFN_GOMP_SIMT_ENTER_ALLOC, IFN_GOMP_SIMT_EXIT. 
* tree-inline.h (struct copy_body_data): New field dst_simt_vars. * tree-inline.c (expand_call_inline): Handle SIMT privatization. (copy_decl_for_dup_finish): Ditto. * tree-ssa.c (execute_update_addresses_taken): Handle GOMP_SIMT_ENTER. From-SVN: r246550 --- diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 3f60cac3160..7a575a8a46a 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,51 @@ +2017-03-28 Alexander Monakov + + * config/nvptx/nvptx-protos.h (nvptx_output_simt_enter): Declare. + (nvptx_output_simt_exit): Declare. + * config/nvptx/nvptx.c (nvptx_init_unisimt_predicate): Use + cfun->machine->unisimt_location. Handle NULL unisimt_predicate. + (init_softstack_frame): Move initialization of crtl->is_leaf to... + (nvptx_declare_function_name): ...here. Emit declaration of local + memory space buffer for omp_simt_enter insn. + (nvptx_output_unisimt_switch): New. + (nvptx_output_softstack_switch): New. + (nvptx_output_simt_enter): New. + (nvptx_output_simt_exit): New. + * config/nvptx/nvptx.h (struct machine_function): New fields + has_simtreg, unisimt_location, simt_stack_size, simt_stack_align. + * config/nvptx/nvptx.md (UNSPECV_SIMT_ENTER): New unspec. + (UNSPECV_SIMT_EXIT): Ditto. + (omp_simt_enter_insn): New insn. + (omp_simt_enter): New expansion. + (omp_simt_exit): New insn. + * config/nvptx/nvptx.opt (msoft-stack-reserve-local): New option. + + * internal-fn.c (expand_GOMP_SIMT_ENTER): New. + (expand_GOMP_SIMT_ENTER_ALLOC): New. + (expand_GOMP_SIMT_EXIT): New. + * internal-fn.def (GOMP_SIMT_ENTER): New internal function. + (GOMP_SIMT_ENTER_ALLOC): Ditto. + (GOMP_SIMT_EXIT): Ditto. + * target-insns.def (omp_simt_enter): New insn. + (omp_simt_exit): Ditto. + * omp-low.c (struct omplow_simd_context): New fields simt_eargs, + simt_dlist. + (lower_rec_simd_input_clauses): Implement SIMT privatization. + (lower_rec_input_clauses): Likewise. + (lower_lastprivate_clauses): Handle SIMT privatization. 
+ + * omp-offload.c: Include langhooks.h, tree-nested.h, stor-layout.h. + (ompdevlow_adjust_simt_enter): New. + (find_simtpriv_var_op): New. + (execute_omp_device_lower): Handle IFN_GOMP_SIMT_ENTER, + IFN_GOMP_SIMT_ENTER_ALLOC, IFN_GOMP_SIMT_EXIT. + + * tree-inline.h (struct copy_body_data): New field dst_simt_vars. + * tree-inline.c (expand_call_inline): Handle SIMT privatization. + (copy_decl_for_dup_finish): Ditto. + + * tree-ssa.c (execute_update_addresses_taken): Handle GOMP_SIMT_ENTER. + 2017-03-28 Uros Bizjak PR target/53383 diff --git a/gcc/config/nvptx/nvptx-protos.h b/gcc/config/nvptx/nvptx-protos.h index aaea3ba4845..16b316f12b8 100644 --- a/gcc/config/nvptx/nvptx-protos.h +++ b/gcc/config/nvptx/nvptx-protos.h @@ -53,5 +53,7 @@ extern const char *nvptx_output_mov_insn (rtx, rtx); extern const char *nvptx_output_call_insn (rtx_insn *, rtx, rtx); extern const char *nvptx_output_return (void); extern const char *nvptx_output_set_softstack (unsigned); +extern const char *nvptx_output_simt_enter (rtx, rtx, rtx); +extern const char *nvptx_output_simt_exit (rtx); #endif #endif diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c index 647855c2cfe..83f46104ca3 100644 --- a/gcc/config/nvptx/nvptx.c +++ b/gcc/config/nvptx/nvptx.c @@ -1048,11 +1048,6 @@ init_softstack_frame (FILE *file, unsigned alignment, HOST_WIDE_INT size) fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n", bits, reg_stack, reg_frame, size); - /* Usually 'crtl->is_leaf' is computed during register allocator - initialization (which is not done on NVPTX) or for pressure-sensitive - optimizations. Initialize it here, except if already set. 
*/ - if (!crtl->is_leaf) - crtl->is_leaf = leaf_function_p (); if (!crtl->is_leaf) fprintf (file, "\t\tst.shared.u%d [%s], %s;\n", bits, reg_sspslot, reg_stack); @@ -1080,24 +1075,29 @@ nvptx_init_axis_predicate (FILE *file, int regno, const char *name) static void nvptx_init_unisimt_predicate (FILE *file) { + cfun->machine->unisimt_location = gen_reg_rtx (Pmode); + int loc = REGNO (cfun->machine->unisimt_location); int bits = POINTER_SIZE; - int master = REGNO (cfun->machine->unisimt_master); - int pred = REGNO (cfun->machine->unisimt_predicate); + fprintf (file, "\t.reg.u%d %%r%d;\n", bits, loc); fprintf (file, "\t{\n"); fprintf (file, "\t\t.reg.u32 %%ustmp0;\n"); fprintf (file, "\t\t.reg.u%d %%ustmp1;\n", bits); - fprintf (file, "\t\t.reg.u%d %%ustmp2;\n", bits); fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n"); fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n", bits == 64 ? ".wide" : ".lo"); - fprintf (file, "\t\tmov.u%d %%ustmp2, __nvptx_uni;\n", bits); - fprintf (file, "\t\tadd.u%d %%ustmp2, %%ustmp2, %%ustmp1;\n", bits); - fprintf (file, "\t\tld.shared.u32 %%r%d, [%%ustmp2];\n", master); - fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.x;\n"); - /* Compute 'master lane index' as 'tid.x & __nvptx_uni[tid.y]'. */ - fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master); - /* Compute predicate as 'tid.x == master'. */ - fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master); + fprintf (file, "\t\tmov.u%d %%r%d, __nvptx_uni;\n", bits, loc); + fprintf (file, "\t\tadd.u%d %%r%d, %%r%d, %%ustmp1;\n", bits, loc, loc); + if (cfun->machine->unisimt_predicate) + { + int master = REGNO (cfun->machine->unisimt_master); + int pred = REGNO (cfun->machine->unisimt_predicate); + fprintf (file, "\t\tld.shared.u32 %%r%d, [%%r%d];\n", master, loc); + fprintf (file, "\t\tmov.u32 %%ustmp0, %%laneid;\n"); + /* Compute 'master lane index' as 'laneid & __nvptx_uni[tid.y]'. 
*/ + fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master); + /* Compute predicate as 'tid.x == master'. */ + fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master); + } fprintf (file, "\t}\n"); need_unisimt_decl = true; } @@ -1224,6 +1224,12 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl) fprintf (file, "%s", s.str().c_str()); + /* Usually 'crtl->is_leaf' is computed during register allocator + initialization (which is not done on NVPTX) or for pressure-sensitive + optimizations. Initialize it here, except if already set. */ + if (!crtl->is_leaf) + crtl->is_leaf = leaf_function_p (); + HOST_WIDE_INT sz = get_frame_size (); bool need_frameptr = sz || cfun->machine->has_chain; int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT; @@ -1240,9 +1246,28 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl) init_frame (file, FRAME_POINTER_REGNUM, alignment, ROUND_UP (sz, GET_MODE_SIZE (DImode))); } - else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca) + else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca + || (cfun->machine->has_simtreg && !crtl->is_leaf)) init_softstack_frame (file, alignment, sz); + if (cfun->machine->has_simtreg) + { + unsigned HOST_WIDE_INT &simtsz = cfun->machine->simt_stack_size; + unsigned HOST_WIDE_INT &align = cfun->machine->simt_stack_align; + align = MAX (align, GET_MODE_SIZE (DImode)); + if (!crtl->is_leaf || cfun->calls_alloca) + simtsz = HOST_WIDE_INT_M1U; + if (simtsz == HOST_WIDE_INT_M1U) + simtsz = nvptx_softstack_size; + if (cfun->machine->has_softstack) + simtsz += POINTER_SIZE / 8; + simtsz = ROUND_UP (simtsz, GET_MODE_SIZE (DImode)); + if (align > GET_MODE_SIZE (DImode)) + simtsz += align - GET_MODE_SIZE (DImode); + if (simtsz) + fprintf (file, "\t.local.align 8 .b8 %%simtstack_ar[" + HOST_WIDE_INT_PRINT_DEC "];\n", simtsz); + } /* Declare the pseudos we have as ptx registers. 
*/ int maxregs = max_reg_num (); for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++) @@ -1267,10 +1292,112 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl) if (cfun->machine->axis_predicate[1]) nvptx_init_axis_predicate (file, REGNO (cfun->machine->axis_predicate[1]), "x"); - if (cfun->machine->unisimt_predicate) + if (cfun->machine->unisimt_predicate + || (cfun->machine->has_simtreg && !crtl->is_leaf)) nvptx_init_unisimt_predicate (file); } +/* Output code for switching uniform-simt state. ENTERING indicates whether + we are entering or leaving non-uniform execution region. */ + +static void +nvptx_output_unisimt_switch (FILE *file, bool entering) +{ + if (crtl->is_leaf && !cfun->machine->unisimt_predicate) + return; + fprintf (file, "\t{\n"); + fprintf (file, "\t\t.reg.u32 %%ustmp2;\n"); + fprintf (file, "\t\tmov.u32 %%ustmp2, %d;\n", entering ? -1 : 0); + if (!crtl->is_leaf) + { + int loc = REGNO (cfun->machine->unisimt_location); + fprintf (file, "\t\tst.shared.u32 [%%r%d], %%ustmp2;\n", loc); + } + if (cfun->machine->unisimt_predicate) + { + int master = REGNO (cfun->machine->unisimt_master); + int pred = REGNO (cfun->machine->unisimt_predicate); + fprintf (file, "\t\tmov.u32 %%ustmp2, %%laneid;\n"); + fprintf (file, "\t\tmov.u32 %%r%d, %s;\n", + master, entering ? "%ustmp2" : "0"); + fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp2;\n", pred, master); + } + fprintf (file, "\t}\n"); +} + +/* Output code for allocating per-lane storage and switching soft-stack pointer. + ENTERING indicates whether we are entering or leaving non-uniform execution. + PTR is the register pointing to allocated storage, it is assigned to on + entering and used to restore state on leaving. SIZE and ALIGN are used only + on entering. 
*/ + +static void +nvptx_output_softstack_switch (FILE *file, bool entering, + rtx ptr, rtx size, rtx align) +{ + gcc_assert (REG_P (ptr) && !HARD_REGISTER_P (ptr)); + if (crtl->is_leaf && !cfun->machine->simt_stack_size) + return; + int bits = POINTER_SIZE, regno = REGNO (ptr); + fprintf (file, "\t{\n"); + if (entering) + { + fprintf (file, "\t\tcvta.local.u%d %%r%d, %%simtstack_ar + " + HOST_WIDE_INT_PRINT_DEC ";\n", bits, regno, + cfun->machine->simt_stack_size); + fprintf (file, "\t\tsub.u%d %%r%d, %%r%d, ", bits, regno, regno); + if (CONST_INT_P (size)) + fprintf (file, HOST_WIDE_INT_PRINT_DEC, + ROUND_UP (UINTVAL (size), GET_MODE_SIZE (DImode))); + else + output_reg (file, REGNO (size), VOIDmode); + fputs (";\n", file); + if (!CONST_INT_P (size) || UINTVAL (align) > GET_MODE_SIZE (DImode)) + fprintf (file, "\t\tand.u%d %%r%d, %%r%d, -%d;\n", + bits, regno, regno, UINTVAL (align)); + } + if (cfun->machine->has_softstack) + { + const char *reg_stack = reg_names[STACK_POINTER_REGNUM]; + if (entering) + { + fprintf (file, "\t\tst.u%d [%%r%d + -%d], %s;\n", + bits, regno, bits / 8, reg_stack); + fprintf (file, "\t\tsub.u%d %s, %%r%d, %d;\n", + bits, reg_stack, regno, bits / 8); + } + else + { + fprintf (file, "\t\tld.u%d %s, [%%r%d + -%d];\n", + bits, reg_stack, regno, bits / 8); + } + nvptx_output_set_softstack (REGNO (stack_pointer_rtx)); + } + fprintf (file, "\t}\n"); +} + +/* Output code to enter non-uniform execution region. DEST is a register + to hold a per-lane allocation given by SIZE and ALIGN. */ + +const char * +nvptx_output_simt_enter (rtx dest, rtx size, rtx align) +{ + nvptx_output_unisimt_switch (asm_out_file, true); + nvptx_output_softstack_switch (asm_out_file, true, dest, size, align); + return ""; +} + +/* Output code to leave non-uniform execution region. SRC is the register + holding per-lane storage previously allocated by omp_simt_enter insn. 
*/ + +const char * +nvptx_output_simt_exit (rtx src) +{ + nvptx_output_unisimt_switch (asm_out_file, false); + nvptx_output_softstack_switch (asm_out_file, false, src, NULL_RTX, NULL_RTX); + return ""; +} + /* Output instruction that sets soft stack pointer in shared memory to the value in register given by SRC_REGNO. */ diff --git a/gcc/config/nvptx/nvptx.h b/gcc/config/nvptx/nvptx.h index 8338d4eacf4..0a000a73df5 100644 --- a/gcc/config/nvptx/nvptx.h +++ b/gcc/config/nvptx/nvptx.h @@ -213,12 +213,18 @@ struct GTY(()) machine_function bool has_varadic; /* Current function has a varadic call. */ bool has_chain; /* Current function has outgoing static chain. */ bool has_softstack; /* Current function has a soft stack frame. */ + bool has_simtreg; /* Current function has an OpenMP SIMD region. */ int num_args; /* Number of args of current call. */ int return_mode; /* Return mode of current fn. (machine_mode not defined yet.) */ rtx axis_predicate[2]; /* Neutering predicates. */ rtx unisimt_master; /* 'Master lane index' for -muniform-simt. */ rtx unisimt_predicate; /* Predicate for -muniform-simt. */ + rtx unisimt_location; /* Mask location for -muniform-simt. */ + /* The following two fields hold the maximum size resp. alignment required + for per-lane storage in OpenMP SIMD regions. 
*/ + unsigned HOST_WIDE_INT simt_stack_size; + unsigned HOST_WIDE_INT simt_stack_align; }; #endif diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index 50dd42e376e..f2ed63bf06b 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -63,6 +63,9 @@ UNSPECV_JOIN UNSPECV_NOUNROLL + + UNSPECV_SIMT_ENTER + UNSPECV_SIMT_EXIT ]) (define_attr "subregs_ok" "false,true" @@ -1184,6 +1187,42 @@ ;; Patterns for OpenMP SIMD-via-SIMT lowering +(define_insn "omp_simt_enter_insn" + [(set (match_operand 0 "nvptx_register_operand" "=R") + (unspec_volatile [(match_operand 1 "nvptx_nonmemory_operand" "Ri") + (match_operand 2 "nvptx_nonmemory_operand" "Ri")] + UNSPECV_SIMT_ENTER))] + "" +{ + return nvptx_output_simt_enter (operands[0], operands[1], operands[2]); +}) + +(define_expand "omp_simt_enter" + [(match_operand 0 "nvptx_register_operand" "=R") + (match_operand 1 "nvptx_nonmemory_operand" "Ri") + (match_operand 2 "const_int_operand" "n")] + "" +{ + if (!CONST_INT_P (operands[1])) + cfun->machine->simt_stack_size = HOST_WIDE_INT_M1U; + else + cfun->machine->simt_stack_size = MAX (UINTVAL (operands[1]), + cfun->machine->simt_stack_size); + cfun->machine->simt_stack_align = MAX (UINTVAL (operands[2]), + cfun->machine->simt_stack_align); + cfun->machine->has_simtreg = true; + emit_insn (gen_omp_simt_enter_insn (operands[0], operands[1], operands[2])); + DONE; +}) + +(define_insn "omp_simt_exit" + [(unspec_volatile [(match_operand 0 "nvptx_register_operand" "R")] + UNSPECV_SIMT_EXIT)] + "" +{ + return nvptx_output_simt_exit (operands[0]); +}) + ;; Implement IFN_GOMP_SIMT_LANE: set operand 0 to lane index (define_insn "omp_simt_lane" [(set (match_operand:SI 0 "nvptx_register_operand" "") diff --git a/gcc/config/nvptx/nvptx.opt b/gcc/config/nvptx/nvptx.opt index 80aab5b13a4..901def703f8 100644 --- a/gcc/config/nvptx/nvptx.opt +++ b/gcc/config/nvptx/nvptx.opt @@ -37,6 +37,10 @@ msoft-stack Target Report Mask(SOFT_STACK) Use custom stacks instead of 
local memory for automatic storage. +msoft-stack-reserve-local +Target Report Joined RejectNegative UInteger Var(nvptx_softstack_size) Init(128) +Specify size of .local memory used for stack when the exact amount is not known. + muniform-simt Target Report Mask(UNIFORM_SIMT) Generate code that can keep local state uniform across all lanes. diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c index df7b930e801..75fe027f7b2 100644 --- a/gcc/internal-fn.c +++ b/gcc/internal-fn.c @@ -166,6 +166,48 @@ expand_GOMP_USE_SIMT (internal_fn, gcall *) gcc_unreachable (); } +/* This should get expanded in omp_device_lower pass. */ + +static void +expand_GOMP_SIMT_ENTER (internal_fn, gcall *) +{ + gcc_unreachable (); +} + +/* Allocate per-lane storage and begin non-uniform execution region. */ + +static void +expand_GOMP_SIMT_ENTER_ALLOC (internal_fn, gcall *stmt) +{ + rtx target; + tree lhs = gimple_call_lhs (stmt); + if (lhs) + target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE); + else + target = gen_reg_rtx (Pmode); + rtx size = expand_normal (gimple_call_arg (stmt, 0)); + rtx align = expand_normal (gimple_call_arg (stmt, 1)); + struct expand_operand ops[3]; + create_output_operand (&ops[0], target, Pmode); + create_input_operand (&ops[1], size, Pmode); + create_input_operand (&ops[2], align, Pmode); + gcc_assert (targetm.have_omp_simt_enter ()); + expand_insn (targetm.code_for_omp_simt_enter, 3, ops); +} + +/* Deallocate per-lane storage and leave non-uniform execution region. */ + +static void +expand_GOMP_SIMT_EXIT (internal_fn, gcall *stmt) +{ + gcc_checking_assert (!gimple_call_lhs (stmt)); + rtx arg = expand_normal (gimple_call_arg (stmt, 0)); + struct expand_operand ops[1]; + create_input_operand (&ops[0], arg, Pmode); + gcc_assert (targetm.have_omp_simt_exit ()); + expand_insn (targetm.code_for_omp_simt_exit, 1, ops); +} + /* Lane index on SIMT targets: thread index in the warp on NVPTX. 
On targets without SIMT execution this should be expanded in omp_device_lower pass. */ diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def index 2ba69c93ca7..e162d81121c 100644 --- a/gcc/internal-fn.def +++ b/gcc/internal-fn.def @@ -142,6 +142,9 @@ DEF_INTERNAL_INT_FN (PARITY, ECF_CONST, parity, unary) DEF_INTERNAL_INT_FN (POPCOUNT, ECF_CONST, popcount, unary) DEF_INTERNAL_FN (GOMP_USE_SIMT, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL) +DEF_INTERNAL_FN (GOMP_SIMT_ENTER, ECF_LEAF | ECF_NOTHROW, NULL) +DEF_INTERNAL_FN (GOMP_SIMT_ENTER_ALLOC, ECF_LEAF | ECF_NOTHROW, NULL) +DEF_INTERNAL_FN (GOMP_SIMT_EXIT, ECF_LEAF | ECF_NOTHROW, NULL) DEF_INTERNAL_FN (GOMP_SIMT_LANE, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL) DEF_INTERNAL_FN (GOMP_SIMT_VF, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL) DEF_INTERNAL_FN (GOMP_SIMT_LAST_LANE, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL) diff --git a/gcc/omp-low.c b/gcc/omp-low.c index c2c69cbcc6e..253dc856374 100644 --- a/gcc/omp-low.c +++ b/gcc/omp-low.c @@ -3457,6 +3457,8 @@ omp_clause_aligned_alignment (tree clause) struct omplow_simd_context { tree idx; tree lane; + vec simt_eargs; + gimple_seq simt_dlist; int max_vf; bool is_simt; }; @@ -3492,18 +3494,39 @@ lower_rec_simd_input_clauses (tree new_var, omp_context *ctx, if (sctx->max_vf == 1) return false; - tree atype = build_array_type_nelts (TREE_TYPE (new_var), sctx->max_vf); - tree avar = create_tmp_var_raw (atype); - if (TREE_ADDRESSABLE (new_var)) - TREE_ADDRESSABLE (avar) = 1; - DECL_ATTRIBUTES (avar) - = tree_cons (get_identifier ("omp simd array"), NULL, - DECL_ATTRIBUTES (avar)); - gimple_add_tmp_var (avar); - ivar = build4 (ARRAY_REF, TREE_TYPE (new_var), avar, sctx->idx, - NULL_TREE, NULL_TREE); - lvar = build4 (ARRAY_REF, TREE_TYPE (new_var), avar, sctx->lane, - NULL_TREE, NULL_TREE); + if (sctx->is_simt) + { + if (is_gimple_reg (new_var)) + { + ivar = lvar = new_var; + return true; + } + tree type = TREE_TYPE (new_var), ptype = build_pointer_type (type); + ivar = lvar = 
create_tmp_var (type); + TREE_ADDRESSABLE (ivar) = 1; + DECL_ATTRIBUTES (ivar) = tree_cons (get_identifier ("omp simt private"), + NULL, DECL_ATTRIBUTES (ivar)); + sctx->simt_eargs.safe_push (build1 (ADDR_EXPR, ptype, ivar)); + tree clobber = build_constructor (type, NULL); + TREE_THIS_VOLATILE (clobber) = 1; + gimple *g = gimple_build_assign (ivar, clobber); + gimple_seq_add_stmt (&sctx->simt_dlist, g); + } + else + { + tree atype = build_array_type_nelts (TREE_TYPE (new_var), sctx->max_vf); + tree avar = create_tmp_var_raw (atype); + if (TREE_ADDRESSABLE (new_var)) + TREE_ADDRESSABLE (avar) = 1; + DECL_ATTRIBUTES (avar) + = tree_cons (get_identifier ("omp simd array"), NULL, + DECL_ATTRIBUTES (avar)); + gimple_add_tmp_var (avar); + ivar = build4 (ARRAY_REF, TREE_TYPE (new_var), avar, sctx->idx, + NULL_TREE, NULL_TREE); + lvar = build4 (ARRAY_REF, TREE_TYPE (new_var), avar, sctx->lane, + NULL_TREE, NULL_TREE); + } if (DECL_P (new_var)) { SET_DECL_VALUE_EXPR (new_var, lvar); @@ -3547,8 +3570,8 @@ lower_rec_input_clauses (tree clauses, gimple_seq *ilist, gimple_seq *dlist, bool is_simd = (gimple_code (ctx->stmt) == GIMPLE_OMP_FOR && gimple_omp_for_kind (ctx->stmt) & GF_OMP_FOR_SIMD); omplow_simd_context sctx = omplow_simd_context (); - tree simt_lane = NULL_TREE; - tree ivar = NULL_TREE, lvar = NULL_TREE; + tree simt_lane = NULL_TREE, simtrec = NULL_TREE; + tree ivar = NULL_TREE, lvar = NULL_TREE, uid = NULL_TREE; gimple_seq llist[3] = { }; copyin_seq = NULL; @@ -3581,6 +3604,10 @@ lower_rec_input_clauses (tree clauses, gimple_seq *ilist, gimple_seq *dlist, continue; } + /* Add a placeholder for simduid. */ + if (sctx.is_simt && sctx.max_vf != 1) + sctx.simt_eargs.safe_push (NULL_TREE); + /* Do all the fixed sized types in the first pass, and the variable sized types in the second pass. 
This makes sure that the scalar arguments to the variable sized types are processed before we use them in the @@ -4468,21 +4495,43 @@ lower_rec_input_clauses (tree clauses, gimple_seq *ilist, gimple_seq *dlist, } } - if (sctx.lane) + if (sctx.max_vf == 1) + sctx.is_simt = false; + + if (sctx.lane || sctx.is_simt) { - tree uid = create_tmp_var (ptr_type_node, "simduid"); + uid = create_tmp_var (ptr_type_node, "simduid"); /* Don't want uninit warnings on simduid, it is always uninitialized, but we use it not for the value, but for the DECL_UID only. */ TREE_NO_WARNING (uid) = 1; + c = build_omp_clause (UNKNOWN_LOCATION, OMP_CLAUSE__SIMDUID_); + OMP_CLAUSE__SIMDUID__DECL (c) = uid; + OMP_CLAUSE_CHAIN (c) = gimple_omp_for_clauses (ctx->stmt); + gimple_omp_for_set_clauses (ctx->stmt, c); + } + /* Emit calls denoting privatized variables and initializing a pointer to + structure that holds private variables as fields after ompdevlow pass. */ + if (sctx.is_simt) + { + sctx.simt_eargs[0] = uid; + gimple *g + = gimple_build_call_internal_vec (IFN_GOMP_SIMT_ENTER, sctx.simt_eargs); + gimple_call_set_lhs (g, uid); + gimple_seq_add_stmt (ilist, g); + sctx.simt_eargs.release (); + + simtrec = create_tmp_var (ptr_type_node, ".omp_simt"); + g = gimple_build_call_internal (IFN_GOMP_SIMT_ENTER_ALLOC, 1, uid); + gimple_call_set_lhs (g, simtrec); + gimple_seq_add_stmt (ilist, g); + } + if (sctx.lane) + { gimple *g = gimple_build_call_internal (IFN_GOMP_SIMD_LANE, 1, uid); gimple_call_set_lhs (g, sctx.lane); gimple_stmt_iterator gsi = gsi_start_1 (gimple_omp_body_ptr (ctx->stmt)); gsi_insert_before_without_update (&gsi, g, GSI_SAME_STMT); - c = build_omp_clause (UNKNOWN_LOCATION, OMP_CLAUSE__SIMDUID_); - OMP_CLAUSE__SIMDUID__DECL (c) = uid; - OMP_CLAUSE_CHAIN (c) = gimple_omp_for_clauses (ctx->stmt); - gimple_omp_for_set_clauses (ctx->stmt, c); g = gimple_build_assign (sctx.lane, INTEGER_CST, build_int_cst (unsigned_type_node, 0)); gimple_seq_add_stmt (ilist, g); @@ -4545,6 +4594,13 
@@ lower_rec_input_clauses (tree clauses, gimple_seq *ilist, gimple_seq *dlist, gimple_seq_add_stmt (seq, gimple_build_label (end)); } } + if (sctx.is_simt) + { + gimple_seq_add_seq (dlist, sctx.simt_dlist); + gimple *g + = gimple_build_call_internal (IFN_GOMP_SIMT_EXIT, 1, simtrec); + gimple_seq_add_stmt (dlist, g); + } /* The copyin sequence is not to be executed by the main thread, since that would result in self-copies. Perhaps not visible to scalars, @@ -4715,7 +4771,8 @@ lower_lastprivate_clauses (tree clauses, tree predicate, gimple_seq *stmt_list, if (simduid && DECL_HAS_VALUE_EXPR_P (new_var)) { tree val = DECL_VALUE_EXPR (new_var); - if (TREE_CODE (val) == ARRAY_REF + if (!maybe_simt + && TREE_CODE (val) == ARRAY_REF && VAR_P (TREE_OPERAND (val, 0)) && lookup_attribute ("omp simd array", DECL_ATTRIBUTES (TREE_OPERAND (val, @@ -4734,24 +4791,26 @@ lower_lastprivate_clauses (tree clauses, tree predicate, gimple_seq *stmt_list, new_var = build4 (ARRAY_REF, TREE_TYPE (val), TREE_OPERAND (val, 0), lastlane, NULL_TREE, NULL_TREE); - if (maybe_simt) + } + else if (maybe_simt + && VAR_P (val) + && lookup_attribute ("omp simt private", + DECL_ATTRIBUTES (val))) + { + if (simtlast == NULL) { - gcall *g; - if (simtlast == NULL) - { - simtlast = create_tmp_var (unsigned_type_node); - g = gimple_build_call_internal - (IFN_GOMP_SIMT_LAST_LANE, 1, simtcond); - gimple_call_set_lhs (g, simtlast); - gimple_seq_add_stmt (stmt_list, g); - } - x = build_call_expr_internal_loc - (UNKNOWN_LOCATION, IFN_GOMP_SIMT_XCHG_IDX, - TREE_TYPE (new_var), 2, new_var, simtlast); - new_var = unshare_expr (new_var); - gimplify_assign (new_var, x, stmt_list); - new_var = unshare_expr (new_var); + simtlast = create_tmp_var (unsigned_type_node); + gcall *g = gimple_build_call_internal + (IFN_GOMP_SIMT_LAST_LANE, 1, simtcond); + gimple_call_set_lhs (g, simtlast); + gimple_seq_add_stmt (stmt_list, g); } + x = build_call_expr_internal_loc + (UNKNOWN_LOCATION, IFN_GOMP_SIMT_XCHG_IDX, + TREE_TYPE 
(val), 2, val, simtlast); + new_var = unshare_expr (new_var); + gimplify_assign (new_var, x, stmt_list); + new_var = unshare_expr (new_var); } } diff --git a/gcc/omp-offload.c b/gcc/omp-offload.c index d73955c554f..beeeb71a2eb 100644 --- a/gcc/omp-offload.c +++ b/gcc/omp-offload.c @@ -33,12 +33,15 @@ along with GCC; see the file COPYING3. If not see #include "diagnostic-core.h" #include "fold-const.h" #include "internal-fn.h" +#include "langhooks.h" #include "gimplify.h" #include "gimple-iterator.h" #include "gimplify-me.h" #include "gimple-walk.h" #include "tree-cfg.h" #include "tree-into-ssa.h" +#include "tree-nested.h" +#include "stor-layout.h" #include "common/common-target.h" #include "omp-general.h" #include "omp-offload.h" @@ -1669,6 +1672,92 @@ make_pass_oacc_device_lower (gcc::context *ctxt) return new pass_oacc_device_lower (ctxt); } + +/* Rewrite GOMP_SIMT_ENTER_ALLOC call given by GSI and remove the preceding + GOMP_SIMT_ENTER call identifying the privatized variables, which are + turned to structure fields and receive a DECL_VALUE_EXPR accordingly. + Set *REGIMPLIFY to true, except if no privatized variables were seen. 
*/ + +static void +ompdevlow_adjust_simt_enter (gimple_stmt_iterator *gsi, bool *regimplify) +{ + gimple *alloc_stmt = gsi_stmt (*gsi); + tree simtrec = gimple_call_lhs (alloc_stmt); + tree simduid = gimple_call_arg (alloc_stmt, 0); + gimple *enter_stmt = SSA_NAME_DEF_STMT (simduid); + gcc_assert (gimple_call_internal_p (enter_stmt, IFN_GOMP_SIMT_ENTER)); + tree rectype = lang_hooks.types.make_type (RECORD_TYPE); + TYPE_ARTIFICIAL (rectype) = TYPE_NAMELESS (rectype) = 1; + TREE_ADDRESSABLE (rectype) = 1; + TREE_TYPE (simtrec) = build_pointer_type (rectype); + for (unsigned i = 1; i < gimple_call_num_args (enter_stmt); i++) + { + tree *argp = gimple_call_arg_ptr (enter_stmt, i); + if (*argp == null_pointer_node) + continue; + gcc_assert (TREE_CODE (*argp) == ADDR_EXPR + && VAR_P (TREE_OPERAND (*argp, 0))); + tree var = TREE_OPERAND (*argp, 0); + + tree field = build_decl (DECL_SOURCE_LOCATION (var), FIELD_DECL, + DECL_NAME (var), TREE_TYPE (var)); + SET_DECL_ALIGN (field, DECL_ALIGN (var)); + DECL_USER_ALIGN (field) = DECL_USER_ALIGN (var); + TREE_THIS_VOLATILE (field) = TREE_THIS_VOLATILE (var); + + insert_field_into_struct (rectype, field); + + tree t = build_simple_mem_ref (simtrec); + t = build3 (COMPONENT_REF, TREE_TYPE (var), t, field, NULL); + TREE_THIS_VOLATILE (t) = TREE_THIS_VOLATILE (var); + SET_DECL_VALUE_EXPR (var, t); + DECL_HAS_VALUE_EXPR_P (var) = 1; + *regimplify = true; + } + layout_type (rectype); + tree size = TYPE_SIZE_UNIT (rectype); + tree align = build_int_cst (TREE_TYPE (size), TYPE_ALIGN_UNIT (rectype)); + + alloc_stmt + = gimple_build_call_internal (IFN_GOMP_SIMT_ENTER_ALLOC, 2, size, align); + gimple_call_set_lhs (alloc_stmt, simtrec); + gsi_replace (gsi, alloc_stmt, false); + gimple_stmt_iterator enter_gsi = gsi_for_stmt (enter_stmt); + enter_stmt = gimple_build_assign (simduid, gimple_call_arg (enter_stmt, 0)); + gsi_replace (&enter_gsi, enter_stmt, false); + + use_operand_p use; + gimple *exit_stmt; + if (single_imm_use (simtrec, &use, 
&exit_stmt)) + { + gcc_assert (gimple_call_internal_p (exit_stmt, IFN_GOMP_SIMT_EXIT)); + gimple_stmt_iterator exit_gsi = gsi_for_stmt (exit_stmt); + tree clobber = build_constructor (rectype, NULL); + TREE_THIS_VOLATILE (clobber) = 1; + exit_stmt = gimple_build_assign (build_simple_mem_ref (simtrec), clobber); + gsi_insert_before (&exit_gsi, exit_stmt, GSI_SAME_STMT); + } + else + gcc_checking_assert (has_zero_uses (simtrec)); +} + +/* Callback for walk_gimple_stmt used to scan for SIMT-privatized variables. */ + +static tree +find_simtpriv_var_op (tree *tp, int *walk_subtrees, void *) +{ + tree t = *tp; + + if (VAR_P (t) + && DECL_HAS_VALUE_EXPR_P (t) + && lookup_attribute ("omp simt private", DECL_ATTRIBUTES (t))) + { + *walk_subtrees = 0; + return t; + } + return NULL_TREE; +} + /* Cleanup uses of SIMT placeholder internal functions: on non-SIMT targets, VF is 1 and LANE is 0; on SIMT targets, VF is folded to a constant, and LANE is kept to be expanded to RTL later on. Also cleanup all other SIMT @@ -1679,6 +1768,7 @@ static unsigned int execute_omp_device_lower () { int vf = targetm.simt.vf ? targetm.simt.vf () : 1; + bool regimplify = false; basic_block bb; gimple_stmt_iterator gsi; FOR_EACH_BB_FN (bb, cfun) @@ -1694,6 +1784,20 @@ execute_omp_device_lower () case IFN_GOMP_USE_SIMT: rhs = vf == 1 ? integer_zero_node : integer_one_node; break; + case IFN_GOMP_SIMT_ENTER: + rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE; + goto simtreg_enter_exit; + case IFN_GOMP_SIMT_ENTER_ALLOC: + if (vf != 1) + ompdevlow_adjust_simt_enter (&gsi, &regimplify); + rhs = vf == 1 ? null_pointer_node : NULL_TREE; + goto simtreg_enter_exit; + case IFN_GOMP_SIMT_EXIT: + simtreg_enter_exit: + if (vf != 1) + continue; + unlink_stmt_vdef (stmt); + break; case IFN_GOMP_SIMT_LANE: case IFN_GOMP_SIMT_LAST_LANE: rhs = vf == 1 ? build_zero_cst (type) : NULL_TREE; @@ -1726,6 +1830,16 @@ execute_omp_device_lower () stmt = lhs ? 
gimple_build_assign (lhs, rhs) : gimple_build_nop (); gsi_replace (&gsi, stmt, false); } + if (regimplify) + FOR_EACH_BB_REVERSE_FN (bb, cfun) + for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi)) + if (walk_gimple_stmt (&gsi, NULL, find_simtpriv_var_op, NULL)) + { + if (gimple_clobber_p (gsi_stmt (gsi))) + gsi_remove (&gsi, true); + else + gimple_regimplify_operands (gsi_stmt (gsi), &gsi); + } if (vf != 1) cfun->has_force_vectorize_loops = false; return 0; diff --git a/gcc/target-insns.def b/gcc/target-insns.def index 2968c879329..fb92f72ac29 100644 --- a/gcc/target-insns.def +++ b/gcc/target-insns.def @@ -68,6 +68,8 @@ DEF_TARGET_INSN (oacc_dim_pos, (rtx x0, rtx x1)) DEF_TARGET_INSN (oacc_dim_size, (rtx x0, rtx x1)) DEF_TARGET_INSN (oacc_fork, (rtx x0, rtx x1, rtx x2)) DEF_TARGET_INSN (oacc_join, (rtx x0, rtx x1, rtx x2)) +DEF_TARGET_INSN (omp_simt_enter, (rtx x0, rtx x1, rtx x2)) +DEF_TARGET_INSN (omp_simt_exit, (rtx x0)) DEF_TARGET_INSN (omp_simt_lane, (rtx x0)) DEF_TARGET_INSN (omp_simt_last_lane, (rtx x0, rtx x1)) DEF_TARGET_INSN (omp_simt_ordered, (rtx x0, rtx x1)) diff --git a/gcc/tree-inline.c b/gcc/tree-inline.c index 09e80e6a5bc..bfaaede0c32 100644 --- a/gcc/tree-inline.c +++ b/gcc/tree-inline.c @@ -4395,6 +4395,11 @@ expand_call_inline (basic_block bb, gimple *stmt, copy_body_data *id) gcall *call_stmt; unsigned int i; unsigned int prop_mask, src_properties; + struct function *dst_cfun; + tree simduid; + use_operand_p use; + gimple *simtenter_stmt = NULL; + vec *simtvars_save; /* The gimplifier uses input_location in too many places, such as internal_get_tmp_var (). */ @@ -4598,15 +4603,26 @@ expand_call_inline (basic_block bb, gimple *stmt, copy_body_data *id) id->src_cfun = DECL_STRUCT_FUNCTION (fn); id->call_stmt = call_stmt; + /* When inlining into an OpenMP SIMD-on-SIMT loop, arrange for new automatic + variables to be added to IFN_GOMP_SIMT_ENTER argument list. 
*/ + dst_cfun = DECL_STRUCT_FUNCTION (id->dst_fn); + simtvars_save = id->dst_simt_vars; + if (!(dst_cfun->curr_properties & PROP_gimple_lomp_dev) + && (simduid = bb->loop_father->simduid) != NULL_TREE + && (simduid = ssa_default_def (dst_cfun, simduid)) != NULL_TREE + && single_imm_use (simduid, &use, &simtenter_stmt) + && is_gimple_call (simtenter_stmt) + && gimple_call_internal_p (simtenter_stmt, IFN_GOMP_SIMT_ENTER)) + vec_alloc (id->dst_simt_vars, 0); + else + id->dst_simt_vars = NULL; + /* If the src function contains an IFN_VA_ARG, then so will the dst function after inlining. Likewise for IFN_GOMP_USE_SIMT. */ prop_mask = PROP_gimple_lva | PROP_gimple_lomp_dev; src_properties = id->src_cfun->curr_properties & prop_mask; if (src_properties != prop_mask) - { - struct function *dst_cfun = DECL_STRUCT_FUNCTION (id->dst_fn); - dst_cfun->curr_properties &= src_properties | ~prop_mask; - } + dst_cfun->curr_properties &= src_properties | ~prop_mask; gcc_assert (!id->src_cfun->after_inlining); @@ -4740,6 +4756,27 @@ expand_call_inline (basic_block bb, gimple *stmt, copy_body_data *id) if (cfun->gimple_df) pt_solution_reset (&cfun->gimple_df->escaped); + /* Add new automatic variables to IFN_GOMP_SIMT_ENTER arguments. 
*/ + if (id->dst_simt_vars && id->dst_simt_vars->length () > 0) + { + size_t nargs = gimple_call_num_args (simtenter_stmt); + vec<tree> *vars = id->dst_simt_vars; + auto_vec<tree> newargs (nargs + vars->length ()); + for (size_t i = 0; i < nargs; i++) + newargs.quick_push (gimple_call_arg (simtenter_stmt, i)); + for (tree *pvar = vars->begin (); pvar != vars->end (); pvar++) + { + tree ptrtype = build_pointer_type (TREE_TYPE (*pvar)); + newargs.quick_push (build1 (ADDR_EXPR, ptrtype, *pvar)); + } + gcall *g = gimple_build_call_internal_vec (IFN_GOMP_SIMT_ENTER, newargs); + gimple_call_set_lhs (g, gimple_call_lhs (simtenter_stmt)); + gimple_stmt_iterator gsi = gsi_for_stmt (simtenter_stmt); + gsi_replace (&gsi, g, false); + } + vec_free (id->dst_simt_vars); + id->dst_simt_vars = simtvars_save; + /* Clean up. */ if (id->debug_map) { @@ -5463,9 +5500,19 @@ copy_decl_for_dup_finish (copy_body_data *id, tree decl, tree copy) function. */ ; else - /* Ordinary automatic local variables are now in the scope of the - new function. */ - DECL_CONTEXT (copy) = id->dst_fn; + { + /* Ordinary automatic local variables are now in the scope of the + new function. */ + DECL_CONTEXT (copy) = id->dst_fn; + if (VAR_P (copy) && id->dst_simt_vars && !is_gimple_reg (copy)) + { + if (!lookup_attribute ("omp simt private", DECL_ATTRIBUTES (copy))) + DECL_ATTRIBUTES (copy) + = tree_cons (get_identifier ("omp simt private"), NULL, + DECL_ATTRIBUTES (copy)); + id->dst_simt_vars->safe_push (copy); + } + } return copy; } diff --git a/gcc/tree-inline.h b/gcc/tree-inline.h index 88b32863745..ffb8333a7dd 100644 --- a/gcc/tree-inline.h +++ b/gcc/tree-inline.h @@ -145,6 +145,10 @@ struct copy_body_data equivalents in the function into which it is being inlined. */ hash_map<dependence_hash, unsigned short> *dependence_map; + /* A list of addressable local variables remapped into the caller + when inlining a call within an OpenMP SIMD-on-SIMT loop. 
*/ + vec<tree> *dst_simt_vars; + /* Cilk keywords currently need to replace some variables that ordinary nested functions do not. */ bool remap_var_for_cilk; diff --git a/gcc/tree-ssa.c b/gcc/tree-ssa.c index 831fd61e15f..42e708ed673 100644 --- a/gcc/tree-ssa.c +++ b/gcc/tree-ssa.c @@ -1654,7 +1654,8 @@ execute_update_addresses_taken (void) gimple_ior_addresses_taken (addresses_taken, stmt); gimple_call_set_arg (stmt, 1, arg); } - else if (is_asan_mark_p (stmt)) + else if (is_asan_mark_p (stmt) + || gimple_call_internal_p (stmt, IFN_GOMP_SIMT_ENTER)) ; else gimple_ior_addresses_taken (addresses_taken, stmt); @@ -1940,6 +1941,18 @@ execute_update_addresses_taken (void) continue; } } + else if (gimple_call_internal_p (stmt, IFN_GOMP_SIMT_ENTER)) + for (i = 1; i < gimple_call_num_args (stmt); i++) + { + tree *argp = gimple_call_arg_ptr (stmt, i); + if (*argp == null_pointer_node) + continue; + gcc_assert (TREE_CODE (*argp) == ADDR_EXPR + && VAR_P (TREE_OPERAND (*argp, 0))); + tree var = TREE_OPERAND (*argp, 0); + if (bitmap_bit_p (suitable_for_renaming, DECL_UID (var))) + *argp = null_pointer_node; + } for (i = 0; i < gimple_call_num_args (stmt); ++i) { tree *argp = gimple_call_arg_ptr (stmt, i);