From 43c371e8b0dc21d3b1b6d87cc7f29d6d53ae5d82 Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Tue, 11 Jul 2017 12:25:01 +0000 Subject: [PATCH] Add extra initialization of broadcasted condition variables 2017-07-11 Tom de Vries * config/nvptx/nvptx.c (WORKAROUND_PTXJIT_BUG): New macro. (bb_first_real_insn): New function. (nvptx_single): Add extra initialization of broadcasted condition variables. From-SVN: r250129 --- gcc/ChangeLog | 7 ++++++ gcc/config/nvptx/nvptx.c | 53 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 97555974f34..c960cd76098 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,10 @@ +2017-07-11 Tom de Vries + + * config/nvptx/nvptx.c (WORKAROUND_PTXJIT_BUG): New macro. + (bb_first_real_insn): New function. + (nvptx_single): Add extra initialization of broadcasted condition + variables. + 2017-07-11 Nathan Sidwell * dwarf2out.c (gen_member_die): Remove useless check for anon ctors. diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c index daeec2733ea..c8847a5dbba 100644 --- a/gcc/config/nvptx/nvptx.c +++ b/gcc/config/nvptx/nvptx.c @@ -74,6 +74,8 @@ /* This file should be included last. */ #include "target-def.h" +#define WORKAROUND_PTXJIT_BUG 1 + /* The various PTX memory areas an object might reside in. */ enum nvptx_data_area { @@ -3844,6 +3846,24 @@ nvptx_wsync (bool after) return gen_nvptx_barsync (GEN_INT (after)); } +#if WORKAROUND_PTXJIT_BUG +/* Return the first real insn in BB, i.e. the first insn for which INSN_P + holds, or NULL_RTX if BB does not contain any real insns. */ + +static rtx_insn * +bb_first_real_insn (basic_block bb) +{ + rtx_insn *insn; + + /* Find the first real insn of BB. */ + FOR_BB_INSNS (bb, insn) + if (INSN_P (insn)) + return insn; + + return 0; +} +#endif + /* Single neutering according to MASK. FROM is the incoming block and TO is the outgoing block. These may be the same block. 
Insert at start of FROM: @@ -3958,6 +3978,39 @@ nvptx_single (unsigned mask, basic_block from, basic_block to) if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask) { /* Vector mode only, do a shuffle. */ +#if WORKAROUND_PTXJIT_BUG + /* The branch condition %rcond is propagated like this: + + { + .reg .u32 %x; + mov.u32 %x,%tid.x; + setp.ne.u32 %rnotvzero,%x,0; + } + + @%rnotvzero bra Lskip; + setp.<op>.<type> %rcond,op1,op2; + Lskip: + selp.u32 %rcondu32,1,0,%rcond; + shfl.idx.b32 %rcondu32,%rcondu32,0,31; + setp.ne.u32 %rcond,%rcondu32,0; + + There seems to be a bug in the ptx JIT compiler (observed at driver + version 381.22, at -O1 and higher for sm_61), that drops the shfl + unless %rcond is initialized to something before 'bra Lskip'. The + bug is not observed with ptxas from cuda 8.0.61. + + It is true that the code is non-trivial: at Lskip, %rcond is + uninitialized in threads 1-31, and after the selp the same holds + for %rcondu32. But shfl propagates the defined value in thread 0 + to threads 1-31, so after the shfl %rcondu32 is defined in threads + 0-31, and after the setp.ne %rcond is defined in threads 0-31. + + There is nothing in the PTX spec to suggest that this is wrong, or + to explain why the extra initialization is needed. So, we classify + it as a JIT bug, and the extra initialization as a workaround. */ + emit_insn_before (gen_movbi (pvar, const0_rtx), + bb_first_real_insn (from)); +#endif emit_insn_before (nvptx_gen_vcast (pvar), tail); } else -- 2.30.2