/* This file should be included last. */
#include "target-def.h"
+#define WORKAROUND_PTXJIT_BUG 1 /* Nonzero enables the PTX JIT bug workaround code.  */
+
/* The various PTX memory areas an object might reside in. */
enum nvptx_data_area
{
return gen_nvptx_barsync (GEN_INT (after));
}
+#if WORKAROUND_PTXJIT_BUG
+/* Walk the insns of basic block BB with FOR_BB_INSNS and hand back the
+   first one for which INSN_P holds.  The result is a null pointer when
+   BB contains no such insn, so callers must be prepared for that.  */
+
+static rtx_insn *
+bb_first_real_insn (basic_block bb)
+{
+  rtx_insn *candidate;
+
+  FOR_BB_INSNS (bb, candidate)
+    {
+      /* Skip anything that is not a real insn.  */
+      if (!INSN_P (candidate))
+        continue;
+
+      return candidate;
+    }
+
+  /* Walked the whole block without finding a real insn.  */
+  return 0;
+}
+#endif
+
/* Single neutering according to MASK. FROM is the incoming block and
TO is the outgoing block. These may be the same block. Insert at
start of FROM:
if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask)
{
/* Vector mode only, do a shuffle. */
+#if WORKAROUND_PTXJIT_BUG
+ /* The branch condition %rcond is propagated like this:
+
+ {
+ .reg .u32 %x;
+ mov.u32 %x,%tid.x;
+ setp.ne.u32 %rnotvzero,%x,0;
+ }
+
+ @%rnotvzero bra Lskip;
+ setp.<op>.<type> %rcond,op1,op2;
+ Lskip:
+ selp.u32 %rcondu32,1,0,%rcond;
+ shfl.idx.b32 %rcondu32,%rcondu32,0,31;
+ setp.ne.u32 %rcond,%rcondu32,0;
+
+ There seems to be a bug in the ptx JIT compiler (observed at driver
+ version 381.22, at -O1 and higher for sm_61), that drops the shfl
+ unless %rcond is initialized to something before 'bra Lskip'. The
+ bug is not observed with ptxas from cuda 8.0.61.
+
+ It is true that the code is non-trivial: at Lskip, %rcond is
+ uninitialized in threads 1-31, and after the selp the same holds
+ for %rcondu32. But shfl propagates the defined value in thread 0
+ to threads 1-31, so after the shfl %rcondu32 is defined in threads
+ 0-31, and after the setp.ne %rcond is defined in threads 0-31.
+
+ There is nothing in the PTX spec to suggest that this is wrong, or
+ to explain why the extra initialization is needed. So, we classify
+ it as a JIT bug, and the extra initialization as workaround. */
+ emit_insn_before (gen_movbi (pvar, const0_rtx),
+ bb_first_real_insn (from));
+#endif
emit_insn_before (nvptx_gen_vcast (pvar), tail);
}
else