From dba619f37006f8f2167c2043c67c22ccb3f068de Mon Sep 17 00:00:00 2001 From: Nathan Sidwell Date: Tue, 10 Nov 2015 22:29:20 +0000 Subject: [PATCH] nvptx.opt (moptimize): New flag. * config/nvptx/nvptx.opt (moptimize): New flag. * config/nvptx/nvptx.c (nvptx_option_override): Set nvptx_optimize default. (nvptx_optimize_inner): New. (nvptx_process_pars): Call it when optimizing. * doc/invoke.texi (Nvidia PTX Options): Document -moptimize. From-SVN: r230137 --- gcc/ChangeLog | 9 +++++ gcc/config/nvptx/nvptx.c | 69 ++++++++++++++++++++++++++++++++++++++ gcc/config/nvptx/nvptx.opt | 4 +++ gcc/doc/invoke.texi | 7 +++- gcc/testsuite/ChangeLog | 4 +++ 5 files changed, 92 insertions(+), 1 deletion(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 764dd3cae43..a1d54b05e73 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,12 @@ +2015-11-10 Nathan Sidwell + + * config/nvptx/nvptx.opt (moptimize): New flag. + * config/nvptx/nvptx.c (nvptx_option_override): Set nvptx_optimize + default. + (nvptx_optimize_inner): New. + (nvptx_process_pars): Call it when optimizing. + * doc/invoke.texi (Nvidia PTX Options): Document -moptimize. + 2015-11-10 Bill Schmidt * config/rs6000/rs6000.c (rs6000_secondary_reload_direct_move): diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c index f1ac307b346..d8673018819 100644 --- a/gcc/config/nvptx/nvptx.c +++ b/gcc/config/nvptx/nvptx.c @@ -137,6 +137,9 @@ nvptx_option_override (void) write_symbols = NO_DEBUG; debug_info_level = DINFO_LEVEL_NONE; + if (nvptx_optimize < 0) + nvptx_optimize = optimize > 0; + declared_fndecls_htab = hash_table::create_ggc (17); needed_fndecls_htab = hash_table::create_ggc (17); declared_libfuncs_htab @@ -2942,6 +2945,69 @@ nvptx_skip_par (unsigned mask, parallel *par) nvptx_single (mask, par->forked_block, pre_tail); } +/* If PAR has a single inner parallel and PAR itself only contains + empty entry and exit blocks, swallow the inner PAR. 
*/
+
+static void
+nvptx_optimize_inner (parallel *par)
+{
+  parallel *inner = par->inner;
+
+  /* We mustn't be the outer dummy par, which has a zero mask.  */
+  if (!par->mask)
+    return;
+
+  /* We must have exactly one inner par, with no next par after it.  */
+  if (!inner || inner->next)
+    return;
+
+  /* We must only contain 2 blocks ourselves -- the head and tail of
+     the inner par.  */
+  if (par->blocks.length () != 2)
+    return;
+
+  /* We must be disjoint partitioning.  As we only have vector and
+     worker partitioning, this is sufficient to guarantee the pars
+     have adjacent partitioning.  */
+  if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
+    /* A shared dimension bit indicates malformed code generation.  */
+    return;
+
+  /* The outer forked insn should be immediately followed by the inner
+     fork insn, taken from the end of our forked block.  */
+  rtx_insn *forked = par->forked_insn;
+  rtx_insn *fork = BB_END (par->forked_block);
+
+  if (NEXT_INSN (forked) != fork)
+    return;
+  gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);
+
+  /* The outer joining insn must immediately follow the inner join
+     insn.  */
+  rtx_insn *joining = par->joining_insn;
+  rtx_insn *join = inner->join_insn;
+  if (NEXT_INSN (join) != joining)
+    return;
+
+  /* Preconditions met.  Swallow the inner par, noting it in any dump.  */
+  if (dump_file)
+    fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
+             inner->mask, inner->forked_block->index,
+             inner->join_block->index,
+             par->mask, par->forked_block->index, par->join_block->index);
+  /* Merge in the inner mask, limited as in the disjointness test.  */
+  par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);
+  /* Move every inner block into our own block list.  */
+  par->blocks.reserve (inner->blocks.length ());
+  while (inner->blocks.length ())
+    par->blocks.quick_push (inner->blocks.pop ());
+  /* Adopt the children of the inner par as our own.  */
+  par->inner = inner->inner;
+  inner->inner = NULL;
+  /* INNER now links to no child and holds no blocks.  */
+  delete inner;
+}
+
 /* Process the parallel PAR and all its contained parallels. We do everything but the neutering. Return mask of partitioned modes used within this parallel.
*/ @@ -2949,6 +3015,9 @@ nvptx_skip_par (unsigned mask, parallel *par) static unsigned nvptx_process_pars (parallel *par) { + if (nvptx_optimize) + nvptx_optimize_inner (par); + unsigned inner_mask = par->mask; /* Do the inner parallels first. */ diff --git a/gcc/config/nvptx/nvptx.opt b/gcc/config/nvptx/nvptx.opt index 80170465bea..342915d8095 100644 --- a/gcc/config/nvptx/nvptx.opt +++ b/gcc/config/nvptx/nvptx.opt @@ -28,3 +28,7 @@ Generate code for a 64-bit ABI. mmainkernel Target Report RejectNegative Link in code for a __main kernel. + +moptimize +Target Report Var(nvptx_optimize) Init(-1) +Optimize partition neutering diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index e2620a57c8b..213a9d0c604 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -873,7 +873,7 @@ Objective-C and Objective-C++ Dialects}. -march=@var{arch} -mbmx -mno-bmx -mcdx -mno-cdx} @emph{Nvidia PTX Options} -@gccoptlist{-m32 -m64 -mmainkernel} +@gccoptlist{-m32 -m64 -mmainkernel -moptimize} @emph{PDP-11 Options} @gccoptlist{-mfpu -msoft-float -mac0 -mno-ac0 -m40 -m45 -m10 @gol @@ -18960,6 +18960,11 @@ Generate code for 32-bit or 64-bit ABI. Link in code for a __main kernel. This is for stand-alone instead of offloading execution. +@item -moptimize +@opindex moptimize +Apply partitioned execution optimizations. This is the default when any +level of optimization is selected. + @end table @node PDP-11 Options diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index c99c62c7c52..9778c940c25 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,7 @@ +2015-11-10 Nathan Sidwell + + * gcc.dg/goacc/nvptx-opt-1.c: New test. + 2015-11-10 Ilya Enkovich * gcc.target/i386/mask-pack.c: New test. -- 2.30.2