use DP4 or DP3 + ADD.
Reviewed-by: Roland Scheidegger <sroland@vmware.com>
dp4_emit /* emit */
};
-/* TGSI_OPCODE_DPH */
-static void
-dph_fetch_args(
- struct lp_build_tgsi_context * bld_base,
- struct lp_build_emit_data * emit_data)
-{
- dp_fetch_args(bld_base, emit_data, 4);
- /* src0.w */
- emit_data->args[3] = bld_base->base.one;
-}
-
-const struct lp_build_tgsi_action dph_action = {
- dph_fetch_args, /* fetch_args */
- dp4_emit /* emit */
-};
-
/* TGSI_OPCODE_DST */
static void
dst_fetch_args(
bld_base->op_actions[TGSI_OPCODE_DP2] = dp2_action;
bld_base->op_actions[TGSI_OPCODE_DP3] = dp3_action;
bld_base->op_actions[TGSI_OPCODE_DP4] = dp4_action;
- bld_base->op_actions[TGSI_OPCODE_DPH] = dph_action;
bld_base->op_actions[TGSI_OPCODE_DST] = dst_action;
bld_base->op_actions[TGSI_OPCODE_EXP] = exp_action;
bld_base->op_actions[TGSI_OPCODE_LIT] = lit_action;
case TGSI_OPCODE_XPD:
return FALSE;
- case TGSI_OPCODE_DPH:
- return FALSE;
-
case TGSI_OPCODE_COS:
src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL);
tmp0 = swizzle_scalar_aos(bld, src0, TGSI_SWIZZLE_X);
ttn_move_dest(b, dest, nir_fdot4(b, src[0], src[1]));
}
-static void
-ttn_dph(nir_builder *b, nir_op op, nir_alu_dest dest, nir_ssa_def **src)
-{
- ttn_move_dest(b, dest, nir_fadd(b, nir_fdot3(b, src[0], src[1]),
- ttn_channel(b, src[1], W)));
-}
-
static void
ttn_umad(nir_builder *b, nir_op op, nir_alu_dest dest, nir_ssa_def **src)
{
[TGSI_OPCODE_LG2] = nir_op_flog2,
[TGSI_OPCODE_POW] = nir_op_fpow,
[TGSI_OPCODE_XPD] = 0,
- [TGSI_OPCODE_DPH] = 0,
[TGSI_OPCODE_COS] = nir_op_fcos,
[TGSI_OPCODE_DDX] = nir_op_fddx,
[TGSI_OPCODE_DDY] = nir_op_fddy,
ttn_dp4(b, op_trans[tgsi_op], dest, src);
break;
- case TGSI_OPCODE_DPH:
- ttn_dph(b, op_trans[tgsi_op], dest, src);
- break;
-
case TGSI_OPCODE_UMAD:
ttn_umad(b, op_trans[tgsi_op], dest, src);
break;
}
}
-static void
-exec_dph(struct tgsi_exec_machine *mach,
- const struct tgsi_full_instruction *inst)
-{
- unsigned int chan;
- union tgsi_exec_channel arg[3];
-
- fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
- fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
- micro_mul(&arg[2], &arg[0], &arg[1]);
-
- fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
- fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
- micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
-
- fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
- fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
- micro_mad(&arg[0], &arg[0], &arg[1], &arg[2]);
-
- fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
- micro_add(&arg[0], &arg[0], &arg[1]);
-
- for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
- if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
- store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
- }
- }
-}
-
static void
exec_dp2(struct tgsi_exec_machine *mach,
const struct tgsi_full_instruction *inst)
exec_xpd(mach, inst);
break;
- case TGSI_OPCODE_DPH:
- exec_dph(mach, inst);
- break;
-
case TGSI_OPCODE_COS:
exec_scalar_unary(mach, inst, micro_cos, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
break;
{ 1, 1, 0, 0, 0, 0, 0, COMP, "U2I64", TGSI_OPCODE_U2I64 },
{ 1, 0, 0, 0, 0, 0, 0, OTHR, "CLOCK", TGSI_OPCODE_CLOCK },
{ 1, 1, 0, 0, 0, 0, 0, COMP, "I2I64", TGSI_OPCODE_I2I64 },
- { 1, 2, 0, 0, 0, 0, 0, REPL, "DPH", TGSI_OPCODE_DPH },
+ { 1, 2, 0, 0, 0, 0, 0, REPL, "", 35 }, /* removed */
{ 1, 1, 0, 0, 0, 0, 0, REPL, "COS", TGSI_OPCODE_COS },
{ 1, 1, 0, 0, 0, 0, 0, COMP, "DDX", TGSI_OPCODE_DDX },
{ 1, 1, 0, 0, 0, 0, 0, COMP, "DDY", TGSI_OPCODE_DDY },
* DP3 - 3-component Dot Product
* dst = src0.x \times src1.x + src0.y \times src1.y + src0.z \times src1.z
*
- * DPH - Homogeneous Dot Product
- * dst = src0.x \times src1.x + src0.y \times src1.y + src0.z \times src1.z + src1.w
- *
* DP2 - 2-component Dot Product
* dst = src0.x \times src1.x + src0.y \times src1.y
*
* operations, which is what you'd prefer for a ISA that is natively
* scalar. Probably a native vector ISA would at least already have
* DP4/DP3 instructions, but perhaps there is room for an alternative
- * translation for DPH/DP2 using vector instructions.
+ * translation for DP2 using vector instructions.
*
* ; needs: 1 tmp
* MUL tmpA.x, src0.x, src1.x
* MAD tmpA.x, src0.y, src1.y, tmpA.x
- * if (DPH || DP3 || DP4) {
+ * if (DP3 || DP4) {
* MAD tmpA.x, src0.z, src1.z, tmpA.x
- * if (DPH) {
- * ADD tmpA.x, src1.w, tmpA.x
- * } else if (DP4) {
+ * if (DP4) {
* MAD tmpA.x, src0.w, src1.w, tmpA.x
* }
* }
*/
#define DP4_GROW (NINST(2) + NINST(3) + NINST(3) + NINST(3) - OINST(2))
#define DP3_GROW (NINST(2) + NINST(3) + NINST(3) - OINST(2))
-#define DPH_GROW (NINST(2) + NINST(3) + NINST(3) + NINST(2) - OINST(2))
#define DP2_GROW (NINST(2) + NINST(3) - OINST(2))
#define DOTP_TMP 1
static void
reg_src(&new_inst.Src[1], src1, SWIZ(Y, Y, Y, Y));
reg_src(&new_inst.Src[2], &ctx->tmp[A].src, SWIZ(X, X, X, X));
- if ((opcode == TGSI_OPCODE_DPH) ||
- (opcode == TGSI_OPCODE_DP3) ||
+ if ((opcode == TGSI_OPCODE_DP3) ||
(opcode == TGSI_OPCODE_DP4)) {
tctx->emit_instruction(tctx, &new_inst);
reg_src(&new_inst.Src[1], src1, SWIZ(Z, Z, Z, Z));
reg_src(&new_inst.Src[2], &ctx->tmp[A].src, SWIZ(X, X, X, X));
- if (opcode == TGSI_OPCODE_DPH) {
- tctx->emit_instruction(tctx, &new_inst);
-
- /* ADD tmpA.x, src1.w, tmpA.x */
- new_inst = tgsi_default_full_instruction();
- new_inst.Instruction.Opcode = TGSI_OPCODE_ADD;
- new_inst.Instruction.NumDstRegs = 1;
- reg_dst(&new_inst.Dst[0], &ctx->tmp[A].dst, TGSI_WRITEMASK_X);
- new_inst.Instruction.NumSrcRegs = 2;
- reg_src(&new_inst.Src[0], src1, SWIZ(W, W, W, W));
- reg_src(&new_inst.Src[1], &ctx->tmp[A].src, SWIZ(X, X, X, X));
- } else if (opcode == TGSI_OPCODE_DP4) {
+ if (opcode == TGSI_OPCODE_DP4) {
tctx->emit_instruction(tctx, &new_inst);
/* MAD tmpA.x, src0.w, src1.w, tmpA.x */
goto skip;
transform_dotp(tctx, inst);
break;
- case TGSI_OPCODE_DPH:
- if (!ctx->config->lower_DPH)
- goto skip;
- transform_dotp(tctx, inst);
- break;
case TGSI_OPCODE_DP2:
if (!ctx->config->lower_DP2)
goto skip;
OPCS(LOG) ||
OPCS(DP4) ||
OPCS(DP3) ||
- OPCS(DPH) ||
OPCS(DP2) ||
OPCS(FLR) ||
OPCS(CEIL) ||
newlen += DP3_GROW * OPCS(DP3);
numtmp = MAX2(numtmp, DOTP_TMP);
}
- if (OPCS(DPH)) {
- newlen += DPH_GROW * OPCS(DPH);
- numtmp = MAX2(numtmp, DOTP_TMP);
- }
if (OPCS(DP2)) {
newlen += DP2_GROW * OPCS(DP2);
numtmp = MAX2(numtmp, DOTP_TMP);
unsigned lower_LOG:1;
unsigned lower_DP4:1;
unsigned lower_DP3:1;
- unsigned lower_DPH:1;
unsigned lower_DP2:1;
unsigned lower_FLR:1;
unsigned lower_CEIL:1;
OP11(LG2)
OP12(POW)
OP12(XPD)
-OP12(DPH)
OP11(COS)
OP11(DDX)
OP11(DDY)
read_mask = TGSI_WRITEMASK_XYZW;
break;
- case TGSI_OPCODE_DPH:
- read_mask = src_idx == 0 ? TGSI_WRITEMASK_XYZ : TGSI_WRITEMASK_XYZW;
- break;
-
case TGSI_OPCODE_TEX:
case TGSI_OPCODE_TXD:
case TGSI_OPCODE_TXB:
dst.w = 1
-.. opcode:: DPH - Homogeneous Dot Product
-
-This instruction replicates its result.
-
-.. math::
-
- dst = src0.x \times src1.x + src0.y \times src1.y + src0.z \times src1.z + src1.w
-
-
.. opcode:: COS - Cosine
This instruction replicates its result.
should be set the same way for an entire pipeline. Note that this
applies not only to the literal MUL TGSI opcode, but all FP32
multiplications implied by other operations, such as MAD, FMA, DP2,
-DP3, DP4, DPH, DST, LOG, LRP, XPD, and possibly others. If there is a
+DP3, DP4, DST, LOG, LRP, XPD, and possibly others. If there is a
mismatch between shaders, then it is unspecified whether this behavior
will be enabled.
}
}
-static void
-trans_dph(const struct instr_translater *t, struct etna_compile *c,
- const struct tgsi_full_instruction *inst, struct etna_inst_src *src)
-{
- /*
- DP3 tmp.xyzw, src0.xyzw, src1,xyzw, void
- ADD dst.xyzw, tmp.xyzw, void, src1.wwww
- */
- struct etna_native_reg temp = etna_compile_get_inner_temp(c);
- struct etna_inst ins[2] = { };
-
- ins[0].opcode = INST_OPCODE_DP3;
- ins[0].dst = etna_native_to_dst(temp, INST_COMPS_X | INST_COMPS_Y |
- INST_COMPS_Z | INST_COMPS_W);
- ins[0].src[0] = src[0];
- ins[0].src[1] = src[1];
-
- ins[1].opcode = INST_OPCODE_ADD;
- ins[1].sat = inst->Instruction.Saturate;
- ins[1].dst = convert_dst(c, &inst->Dst[0]);
- ins[1].src[0] = etna_native_to_src(temp, INST_SWIZ_IDENTITY);
- ins[1].src[2] = swizzle(src[1], SWIZZLE(W, W, W, W));
-
- emit_inst(c, &ins[0]);
- emit_inst(c, &ins[1]);
-}
-
static void
trans_sampler(const struct instr_translater *t, struct etna_compile *c,
const struct tgsi_full_instruction *inst,
INSTR(LRP, trans_lrp),
INSTR(LIT, trans_lit),
INSTR(SSG, trans_ssg),
- INSTR(DPH, trans_dph),
INSTR(SIN, trans_trig),
INSTR(COS, trans_trig),
[ TGSI_OPCODE_DP2 ] = { false, true, TGSI_SWIZZLE_ONE, 1, 2 },
[ TGSI_OPCODE_DP3 ] = { false, true, TGSI_SWIZZLE_ONE, 1, 2 },
[ TGSI_OPCODE_DP4 ] = { false, true, TGSI_SWIZZLE_ONE, 1, 2 },
- [ TGSI_OPCODE_DPH ] = { false, false, 0, 1, 2 },
[ TGSI_OPCODE_DST ] = { false, false, 0, 1, 2 },
[ TGSI_OPCODE_END ] = { false, false, 0, 0, 0 },
[ TGSI_OPCODE_EX2 ] = { false, false, 0, 1, 1 },
emit_simple_arith(p, inst, A0_DP4, 2, fs);
break;
- case TGSI_OPCODE_DPH:
- src0 = src_vector(p, &inst->Src[0], fs);
- src1 = src_vector(p, &inst->Src[1], fs);
-
- i915_emit_arith(p,
- A0_DP4,
- get_result_vector(p, &inst->Dst[0]),
- get_result_flags(inst), 0,
- swizzle(src0, X, Y, Z, ONE), src1, 0);
- break;
-
case TGSI_OPCODE_DST:
src0 = src_vector(p, &inst->Src[0], fs);
src1 = src_vector(p, &inst->Src[1], fs);
case TGSI_OPCODE_DP3:
return 0x7;
case TGSI_OPCODE_DP4:
- case TGSI_OPCODE_DPH:
case TGSI_OPCODE_KILL_IF: /* WriteMask ignored */
return 0xf;
case TGSI_OPCODE_DST:
FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi)
mkMov(dst0[c], val0);
break;
- case TGSI_OPCODE_DPH:
- val0 = buildDot(3);
- src1 = fetchSrc(1, 3);
- mkOp2(OP_ADD, TYPE_F32, val0, val0, src1);
- FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi)
- mkMov(dst0[c], val0);
- break;
case TGSI_OPCODE_DST:
if (dst0[0])
loadImm(dst0[0], 1.0f);
case TGSI_OPCODE_DP4:
nvfx_fp_emit(fpc, arith(sat, DP4, dst, mask, src[0], src[1], none));
break;
- case TGSI_OPCODE_DPH:
- tmp = nvfx_src(temp(fpc));
- nvfx_fp_emit(fpc, arith(0, DP3, tmp.reg, NVFX_FP_MASK_X, src[0], src[1], none));
- nvfx_fp_emit(fpc, arith(sat, ADD, dst, mask, swz(tmp, X, X, X, X), swz(src[1], W, W, W, W), none));
- break;
case TGSI_OPCODE_DST:
nvfx_fp_emit(fpc, arith(sat, DST, dst, mask, src[0], src[1], none));
break;
case TGSI_OPCODE_DP4:
nvfx_vp_emit(vpc, arith(sat, VEC, DP4, dst, mask, src[0], src[1], none));
break;
- case TGSI_OPCODE_DPH:
- nvfx_vp_emit(vpc, arith(sat, VEC, DPH, dst, mask, src[0], src[1], none));
- break;
case TGSI_OPCODE_DST:
nvfx_vp_emit(vpc, arith(sat, VEC, DST, dst, mask, src[0], src[1], none));
break;
case TGSI_OPCODE_LG2: return RC_OPCODE_LG2;
case TGSI_OPCODE_POW: return RC_OPCODE_POW;
case TGSI_OPCODE_XPD: return RC_OPCODE_XPD;
- case TGSI_OPCODE_DPH: return RC_OPCODE_DPH;
case TGSI_OPCODE_COS: return RC_OPCODE_COS;
case TGSI_OPCODE_DDX: return RC_OPCODE_DDX;
case TGSI_OPCODE_DDY: return RC_OPCODE_DDY;
alu.src[0].chan = alu.src[1].chan = 0;
}
break;
- case TGSI_OPCODE_DPH:
- if (i == 3) {
- alu.src[0].sel = V_SQ_ALU_SRC_1;
- alu.src[0].chan = 0;
- alu.src[0].neg = 0;
- }
- break;
default:
break;
}
[32] = { ALU_OP0_NOP, tgsi_unsupported},
[33] = { ALU_OP0_NOP, tgsi_unsupported},
[34] = { ALU_OP0_NOP, tgsi_unsupported},
- [TGSI_OPCODE_DPH] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
+ [35] = { ALU_OP0_NOP, tgsi_unsupported},
[TGSI_OPCODE_COS] = { ALU_OP1_COS, tgsi_trig},
[TGSI_OPCODE_DDX] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
[TGSI_OPCODE_DDY] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
[32] = { ALU_OP0_NOP, tgsi_unsupported},
[33] = { ALU_OP0_NOP, tgsi_unsupported},
[34] = { ALU_OP0_NOP, tgsi_unsupported},
- [TGSI_OPCODE_DPH] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
+ [35] = { ALU_OP0_NOP, tgsi_unsupported},
[TGSI_OPCODE_COS] = { ALU_OP1_COS, tgsi_trig},
[TGSI_OPCODE_DDX] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
[TGSI_OPCODE_DDY] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
[32] = { ALU_OP0_NOP, tgsi_unsupported},
[33] = { ALU_OP0_NOP, tgsi_unsupported},
[34] = { ALU_OP0_NOP, tgsi_unsupported},
- [TGSI_OPCODE_DPH] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
+ [35] = { ALU_OP0_NOP, tgsi_unsupported},
[TGSI_OPCODE_COS] = { ALU_OP1_COS, cayman_trig},
[TGSI_OPCODE_DDX] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
[TGSI_OPCODE_DDY] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
}
-/**
- * Translate the following TGSI DPH instruction.
- * DPH DST, SRC1, SRC2
- * To the following SVGA3D instruction sequence.
- * DP3 TMP, SRC1, SRC2
- * ADD DST, TMP, SRC2.wwww
- */
-static boolean
-emit_dph(struct svga_shader_emitter *emit,
- const struct tgsi_full_instruction *insn )
-{
- SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
- const struct src_register src0 = translate_src_register(
- emit, &insn->Src[0] );
- struct src_register src1 =
- translate_src_register(emit, &insn->Src[1]);
- SVGA3dShaderDestToken temp = get_temp( emit );
-
- /* DP3 TMP, SRC1, SRC2 */
- if (!submit_op2( emit, inst_token( SVGA3DOP_DP3 ), temp, src0, src1 ))
- return FALSE;
-
- src1 = scalar(src1, TGSI_SWIZZLE_W);
-
- /* ADD DST, TMP, SRC2.wwww */
- if (!submit_op2( emit, inst_token( SVGA3DOP_ADD ), dst,
- src( temp ), src1 ))
- return FALSE;
-
- return TRUE;
-}
-
-
/**
* Sine / Cosine helper function.
*/
case TGSI_OPCODE_DP2:
return emit_dp2( emit, insn );
- case TGSI_OPCODE_DPH:
- return emit_dph( emit, insn );
-
case TGSI_OPCODE_COS:
return emit_cos( emit, insn );
}
-/**
- * Emit code for TGSI_OPCODE_DPH instruction.
- */
-static boolean
-emit_dph(struct svga_shader_emitter_v10 *emit,
- const struct tgsi_full_instruction *inst)
-{
- /*
- * DP3 tmp, s0, s1
- * ADD dst, tmp, s1.wwww
- */
-
- struct tgsi_full_src_register s1_wwww =
- swizzle_src(&inst->Src[1], TGSI_SWIZZLE_W, TGSI_SWIZZLE_W,
- TGSI_SWIZZLE_W, TGSI_SWIZZLE_W);
-
- unsigned tmp = get_temp_index(emit);
- struct tgsi_full_src_register tmp_src = make_src_temp_reg(tmp);
- struct tgsi_full_dst_register tmp_dst = make_dst_temp_reg(tmp);
-
- /* DP3 tmp, s0, s1 */
- emit_instruction_op2(emit, VGPU10_OPCODE_DP3, &tmp_dst, &inst->Src[0],
- &inst->Src[1], FALSE);
-
- /* ADD dst, tmp, s1.wwww */
- emit_instruction_op2(emit, VGPU10_OPCODE_ADD, &inst->Dst[0], &tmp_src,
- &s1_wwww, inst->Instruction.Saturate);
-
- free_temp_indexes(emit);
-
- return TRUE;
-}
-
-
/**
* Emit code for TGSI_OPCODE_DST instruction.
*/
return emit_cmp(emit, inst);
case TGSI_OPCODE_COS:
return emit_sincos(emit, inst);
- case TGSI_OPCODE_DPH:
- return emit_dph(emit, inst);
case TGSI_OPCODE_DST:
return emit_dst(emit, inst);
case TGSI_OPCODE_EX2:
#define TGSI_OPCODE_U2I64 32
#define TGSI_OPCODE_CLOCK 33
#define TGSI_OPCODE_I2I64 34
-#define TGSI_OPCODE_DPH 35
+/* gap */
#define TGSI_OPCODE_COS 36
#define TGSI_OPCODE_DDX 37
#define TGSI_OPCODE_DDY 38
return TGSI_OPCODE_DP3;
case OPCODE_DP4:
return TGSI_OPCODE_DP4;
- case OPCODE_DPH:
- return TGSI_OPCODE_DPH;
case OPCODE_DST:
return TGSI_OPCODE_DST;
case OPCODE_EX2:
ureg_ADD(ureg, dst[0], src[0], ureg_negate(src[1]));
break;
+ case OPCODE_DPH: {
+    /* TGSI no longer has a DPH opcode, so lower it here:
+     *    DPH(src0, src1) = DP4((src0.x, src0.y, src0.z, 1), src1)
+     */
+    struct ureg_dst temp = ureg_DECL_temporary(ureg);
+
+    /* temp.xyz = src0.xyz, temp.w = 1 */
+    ureg_MOV(ureg, ureg_writemask(temp, TGSI_WRITEMASK_XYZ), src[0]);
+    ureg_MOV(ureg, ureg_writemask(temp, TGSI_WRITEMASK_W),
+             ureg_imm1f(ureg, 1));
+    ureg_DP4(ureg, dst[0], ureg_src(temp), src[1]);
+
+    /* temp is not read after the DP4; release it so that every DPH
+     * instruction does not permanently consume a temporary register.
+     */
+    ureg_release_temporary(ureg, temp);
+    break;
+ }
+
default:
ureg_insn( ureg,
translate_opcode( inst->Opcode ),