--- /dev/null
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/** @file brw_vec4_vp.cpp
+ *
+ * A translator from Mesa IR to the i965 driver's Vec4 IR, used to implement
+ * ARB_vertex_program and fixed-function vertex processing.
+ */
+
+#include "brw_context.h"
+#include "brw_vec4.h"
+extern "C" {
+#include "program/prog_parameter.h"
+#include "program/prog_print.h"
+}
+using namespace brw;
+
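+/**
+ * Emit a "set on condition" sequence: write 1.0 to the channels of \p dst
+ * where (src0 <conditional_mod> src1) holds and 0.0 elsewhere, using a CMP
+ * into the flag register followed by a predicated SEL between \p one and an
+ * immediate 0.0f.
+ */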
+void
+vec4_visitor::emit_vp_sop(uint32_t conditional_mod,
+ dst_reg dst, src_reg src0, src_reg src1,
+ src_reg one)
+{
+ vec4_instruction *inst;
+
+ inst = emit(BRW_OPCODE_CMP, dst_null_d(), src0, src1);
+ inst->conditional_mod = conditional_mod;
+
+ inst = emit(BRW_OPCODE_SEL, dst, one, src_reg(0.0f));
+ inst->predicate = BRW_PREDICATE_NORMAL;
+}
+
+/**
+ * Return a copy of \p orig whose swizzle selects components (x, y, z, w) of
+ * the register's existing swizzle, i.e. composes the new selection with
+ * orig.swizzle.  For example, reswizzle(r, 0, 0, 0, 0) replicates whatever
+ * r currently reads in its X channel across all four channels.
+ * \sa brw_swizzle().
+ */
+static inline src_reg
+reswizzle(src_reg orig, unsigned x, unsigned y, unsigned z, unsigned w)
+{
+ src_reg t = orig;
+ t.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(orig.swizzle, x),
+ BRW_GET_SWZ(orig.swizzle, y),
+ BRW_GET_SWZ(orig.swizzle, z),
+ BRW_GET_SWZ(orig.swizzle, w));
+ return t;
+}
+
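+/**
+ * Translate the vertex program's Mesa IR instruction stream into vec4 IR,
+ * one instruction at a time, writing each result through a temporary and
+ * then copying it to the real destination register.
+ */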
+void
+vec4_visitor::emit_vertex_program_code()
+{
+ this->need_all_constants_in_pull_buffer = false;
+
+ setup_vp_regs();
+
+   /* Keep a reg with 1.0 around, for reuse by emit_vp_sop so that it can just
+ * be:
+ *
+ * sel.f0 dst 1.0 0.0
+ *
+ * instead of
+ *
+ * mov dst 0.0
+ * mov.f0 dst 1.0
+ */
+ src_reg one = src_reg(this, glsl_type::float_type);
+ emit(MOV(dst_reg(one), src_reg(1.0f)));
+
+ for (unsigned int insn = 0; insn < vp->Base.NumInstructions; insn++) {
+ const struct prog_instruction *vpi = &vp->Base.Instructions[insn];
+ base_ir = vpi;
+
+ dst_reg dst;
+ src_reg src[3];
+
+ /* We always emit into a temporary destination register to avoid
+ * aliasing issues.
+ */
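+      /* For example, OPCODE_DST writes its channels with separate
+       * instructions; if the destination aliased a source, an early channel
+       * write could clobber a component that a later instruction still reads.
+       */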
+ dst = dst_reg(this, glsl_type::vec4_type);
+
+ for (int i = 0; i < 3; i++)
+ src[i] = get_vp_src_reg(vpi->SrcReg[i]);
+
+ switch (vpi->Opcode) {
+ case OPCODE_ABS:
+ src[0].abs = true;
+ src[0].negate = false;
+ emit(MOV(dst, src[0]));
+ break;
+
+ case OPCODE_ADD:
+ emit(ADD(dst, src[0], src[1]));
+ break;
+
+ case OPCODE_ARL:
+ if (intel->gen >= 6) {
+ dst.writemask = WRITEMASK_X;
+ dst_reg dst_f = dst;
+ dst_f.type = BRW_REGISTER_TYPE_F;
+
+ emit(RNDD(dst_f, src[0]));
+ emit(MOV(dst, src_reg(dst_f)));
+ } else {
+ emit(RNDD(dst, src[0]));
+ }
+ break;
+
+ case OPCODE_DP3:
+ emit(DP3(dst, src[0], src[1]));
+ break;
+ case OPCODE_DP4:
+ emit(DP4(dst, src[0], src[1]));
+ break;
+ case OPCODE_DPH:
+ emit(DPH(dst, src[0], src[1]));
+ break;
+
+ case OPCODE_DST: {
+ dst_reg t = dst;
+ if (vpi->DstReg.WriteMask & WRITEMASK_X) {
+ t.writemask = WRITEMASK_X;
+ emit(MOV(t, src_reg(1.0f)));
+ }
+ if (vpi->DstReg.WriteMask & WRITEMASK_Y) {
+ t.writemask = WRITEMASK_Y;
+ emit(MUL(t, src[0], src[1]));
+ }
+ if (vpi->DstReg.WriteMask & WRITEMASK_Z) {
+ t.writemask = WRITEMASK_Z;
+ emit(MOV(t, src[0]));
+ }
+ if (vpi->DstReg.WriteMask & WRITEMASK_W) {
+ t.writemask = WRITEMASK_W;
+ emit(MOV(t, src[1]));
+ }
+ break;
+ }
+
+ case OPCODE_EXP: {
+ dst_reg result = dst;
+ if (vpi->DstReg.WriteMask & WRITEMASK_X) {
+ /* tmp_d = floor(src[0].x) */
+ src_reg tmp_d = src_reg(this, glsl_type::ivec4_type);
+ assert(tmp_d.type == BRW_REGISTER_TYPE_D);
+ emit(RNDD(dst_reg(tmp_d), reswizzle(src[0], 0, 0, 0, 0)));
+
+ /* result[0] = 2.0 ^ tmp */
+ /* Adjust exponent for floating point: exp += 127 */
+ dst_reg tmp_d_x(GRF, tmp_d.reg, glsl_type::int_type, WRITEMASK_X);
+ emit(ADD(tmp_d_x, tmp_d, src_reg(127)));
+
+ /* Install exponent and sign. Excess drops off the edge: */
+ dst_reg res_d_x(GRF, result.reg, glsl_type::int_type, WRITEMASK_X);
+ emit(BRW_OPCODE_SHL, res_d_x, tmp_d, src_reg(23));
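+         /* e.g. src.x = 3.7: floor gives 3, 3 + 127 = 130, and 130 << 23 is
+          * 0x41000000, which reinterpreted as a float is 2^3 = 8.0.
+          */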
+ }
+ if (vpi->DstReg.WriteMask & WRITEMASK_Y) {
+ result.writemask = WRITEMASK_Y;
+ emit(FRC(result, src[0]));
+ }
+ if (vpi->DstReg.WriteMask & WRITEMASK_Z) {
+ result.writemask = WRITEMASK_Z;
+ emit_math(SHADER_OPCODE_EXP2, result, src[0]);
+ }
+ if (vpi->DstReg.WriteMask & WRITEMASK_W) {
+ result.writemask = WRITEMASK_W;
+ emit(MOV(result, src_reg(1.0f)));
+ }
+ break;
+ }
+
+ case OPCODE_EX2:
+ emit_math(SHADER_OPCODE_EXP2, dst, src[0]);
+ break;
+
+ case OPCODE_FLR:
+ emit(RNDD(dst, src[0]));
+ break;
+
+ case OPCODE_FRC:
+ emit(FRC(dst, src[0]));
+ break;
+
+ case OPCODE_LG2:
+ emit_math(SHADER_OPCODE_LOG2, dst, src[0]);
+ break;
+
+ case OPCODE_LIT: {
+ dst_reg result = dst;
+ /* From the ARB_vertex_program spec:
+ *
+ * tmp = VectorLoad(op0);
+ * if (tmp.x < 0) tmp.x = 0;
+ * if (tmp.y < 0) tmp.y = 0;
+ * if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
+ * else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
+ * result.x = 1.0;
+ * result.y = tmp.x;
+ * result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
+ * result.w = 1.0;
+ *
+ * Note that we don't do the clamping to +/- 128. We didn't in
+ * brw_vs_emit.c either.
+ */
+ if (vpi->DstReg.WriteMask & WRITEMASK_XW) {
+ result.writemask = WRITEMASK_XW;
+ emit(MOV(result, src_reg(1.0f)));
+ }
+ if (vpi->DstReg.WriteMask & WRITEMASK_YZ) {
+ result.writemask = WRITEMASK_YZ;
+ emit(MOV(result, src_reg(0.0f)));
+
+ src_reg tmp_x = reswizzle(src[0], 0, 0, 0, 0);
+
+ emit(CMP(dst_null_d(), tmp_x, src_reg(0.0f), BRW_CONDITIONAL_G));
+ emit(IF(BRW_PREDICATE_NORMAL));
+
+ if (vpi->DstReg.WriteMask & WRITEMASK_Y) {
+ result.writemask = WRITEMASK_Y;
+ emit(MOV(result, tmp_x));
+ }
+
+ if (vpi->DstReg.WriteMask & WRITEMASK_Z) {
+ /* if (tmp.y < 0) tmp.y = 0; */
+ src_reg tmp_y = reswizzle(src[0], 1, 1, 1, 1);
+ result.writemask = WRITEMASK_Z;
+ emit_minmax(BRW_CONDITIONAL_G, result, tmp_y, src_reg(0.0f));
+
+ src_reg clamped_y(result);
+ clamped_y.swizzle = BRW_SWIZZLE_ZZZZ;
+
+ src_reg tmp_w = reswizzle(src[0], 3, 3, 3, 3);
+
+ emit_math(SHADER_OPCODE_POW, result, clamped_y, tmp_w);
+ }
+ emit(BRW_OPCODE_ENDIF);
+ }
+ break;
+ }
+
+ case OPCODE_LOG: {
+ dst_reg result = dst;
+ result.type = BRW_REGISTER_TYPE_UD;
+ src_reg result_src = src_reg(result);
+
+ src_reg arg0_ud = reswizzle(src[0], 0, 0, 0, 0);
+ arg0_ud.type = BRW_REGISTER_TYPE_UD;
+
+      /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mant
+       * according to the spec.
+       *
+       * These almost look like they could be joined up, but not really
+       * practical:
+       *
+       * result[0].f = ((x.i & ((1 << 31) - 1)) >> 23) - 127
+       * result[1].i = (x.i & ((1 << 23) - 1)) | (127 << 23)
+       */
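+      /* e.g. x = 12.0f (0x41400000): the exponent field is 130, so
+       * result[0] = 3; the mantissa bits ORed with 127 << 23 give 1.5f, and
+       * 3 + log2(1.5) is approximately 3.585 = log2(12).
+       */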
+ if (vpi->DstReg.WriteMask & WRITEMASK_XZ) {
+ result.writemask = WRITEMASK_X;
+ emit(AND(result, arg0_ud, src_reg((1u << 31) - 1)));
+ emit(BRW_OPCODE_SHR, result, result_src, src_reg(23u));
+ src_reg result_d(result_src);
+      /* D vs. UD makes no difference here: the value fits in 8 bits. */
+      result_d.type = BRW_REGISTER_TYPE_D;
+ result.type = BRW_REGISTER_TYPE_F;
+ emit(ADD(result, result_d, src_reg(-127)));
+ }
+
+ if (vpi->DstReg.WriteMask & WRITEMASK_YZ) {
+ result.writemask = WRITEMASK_Y;
+ result.type = BRW_REGISTER_TYPE_UD;
+ emit(AND(result, arg0_ud, src_reg((1u << 23) - 1)));
+ emit(OR(result, result_src, src_reg(127u << 23)));
+ }
+
+ if (vpi->DstReg.WriteMask & WRITEMASK_Z) {
+ /* result[2] = result[0] + LOG2(result[1]); */
+
+         /* Why bother? The above is just a hint at how to do this with a
+          * Taylor series. Maybe we *should* use a Taylor series: by the
+          * time all of the above has been done, it's almost certainly
+          * quicker than calling the mathbox, even with low precision.
+ *
+ * Options are:
+ * - result[0] + mathbox.LOG2(result[1])
+ * - mathbox.LOG2(arg0.x)
+ * - result[0] + inline_taylor_approx(result[1])
+ */
+ result.type = BRW_REGISTER_TYPE_F;
+ result.writemask = WRITEMASK_Z;
+ src_reg result_x(result), result_y(result), result_z(result);
+ result_x.swizzle = BRW_SWIZZLE_XXXX;
+ result_y.swizzle = BRW_SWIZZLE_YYYY;
+ result_z.swizzle = BRW_SWIZZLE_ZZZZ;
+ emit_math(SHADER_OPCODE_LOG2, result, result_y);
+ emit(ADD(result, result_z, result_x));
+ }
+
+ if (vpi->DstReg.WriteMask & WRITEMASK_W) {
+ result.type = BRW_REGISTER_TYPE_F;
+ result.writemask = WRITEMASK_W;
+ emit(MOV(result, src_reg(1.0f)));
+ }
+ break;
+ }
+
+ case OPCODE_MAD: {
+ src_reg temp = src_reg(this, glsl_type::vec4_type);
+ emit(MUL(dst_reg(temp), src[0], src[1]));
+ emit(ADD(dst, temp, src[2]));
+ break;
+ }
+
+ case OPCODE_MAX:
+ emit_minmax(BRW_CONDITIONAL_G, dst, src[0], src[1]);
+ break;
+
+ case OPCODE_MIN:
+ emit_minmax(BRW_CONDITIONAL_L, dst, src[0], src[1]);
+ break;
+
+ case OPCODE_MOV:
+ emit(MOV(dst, src[0]));
+ break;
+
+ case OPCODE_MUL:
+ emit(MUL(dst, src[0], src[1]));
+ break;
+
+ case OPCODE_POW:
+ emit_math(SHADER_OPCODE_POW, dst, src[0], src[1]);
+ break;
+
+ case OPCODE_RCP:
+ emit_math(SHADER_OPCODE_RCP, dst, src[0]);
+ break;
+
+ case OPCODE_RSQ:
+ emit_math(SHADER_OPCODE_RSQ, dst, src[0]);
+ break;
+
+ case OPCODE_SGE:
+ emit_vp_sop(BRW_CONDITIONAL_GE, dst, src[0], src[1], one);
+ break;
+
+ case OPCODE_SLT:
+ emit_vp_sop(BRW_CONDITIONAL_L, dst, src[0], src[1], one);
+ break;
+
+ case OPCODE_SUB: {
+ src_reg neg_src1 = src[1];
+ neg_src1.negate = !src[1].negate;
+ emit(ADD(dst, src[0], neg_src1));
+ break;
+ }
+
+ case OPCODE_SWZ:
+ /* Note that SWZ's extended swizzles are handled in the general
+ * get_src_reg() code.
+ */
+ emit(MOV(dst, src[0]));
+ break;
+
+ case OPCODE_XPD: {
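+      /* Cross product: dst = src0.yzx * src1.zxy - src0.zxy * src1.yzx */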
+ src_reg t1 = src_reg(this, glsl_type::vec4_type);
+ src_reg t2 = src_reg(this, glsl_type::vec4_type);
+
+ emit(MUL(dst_reg(t1),
+ reswizzle(src[0], 1, 2, 0, 3),
+ reswizzle(src[1], 2, 0, 1, 3)));
+ emit(MUL(dst_reg(t2),
+ reswizzle(src[0], 2, 0, 1, 3),
+ reswizzle(src[1], 1, 2, 0, 3)));
+ t2.negate = true;
+ emit(ADD(dst, t1, t2));
+ break;
+ }
+
+ case OPCODE_END:
+ break;
+
+ default:
+ _mesa_problem(ctx, "Unsupported opcode %s in vertex program\n",
+ _mesa_opcode_string(vpi->Opcode));
+ }
+
+ /* Copy the temporary back into the actual destination register. */
+ if (vpi->Opcode != OPCODE_END) {
+ emit(MOV(get_vp_dst_reg(vpi->DstReg), src_reg(dst)));
+ }
+ }
+
+ /* If we used relative addressing, we need to upload all constants as
+ * pull constants. Do that now.
+ */
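+   /* Push constants live at GRF offsets fixed at compile time, so an
+    * indirectly indexed parameter can only be resolved by fetching from the
+    * constant buffer at run time.
+    */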
+ if (this->need_all_constants_in_pull_buffer) {
+ const struct gl_program_parameter_list *params = c->vp->program.Base.Parameters;
+ unsigned i;
+ for (i = 0; i < params->NumParameters * 4; i++) {
+         c->prog_data.pull_param[i] = &params->ParameterValues[i / 4][i % 4].f;
+ }
+ c->prog_data.nr_pull_params = i;
+ }
+}
+
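+/**
+ * Allocate vec4 IR registers for each Mesa IR register file the program can
+ * touch: temporaries, program parameters (as uniforms), the outputs in the
+ * VUE map, and the address register.
+ */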
+void
+vec4_visitor::setup_vp_regs()
+{
+ /* PROGRAM_TEMPORARY */
+ int num_temp = vp->Base.NumTemporaries;
+ vp_temp_regs = rzalloc_array(mem_ctx, src_reg, num_temp);
+ for (int i = 0; i < num_temp; i++)
+ vp_temp_regs[i] = src_reg(this, glsl_type::vec4_type);
+
+ /* PROGRAM_STATE_VAR etc. */
+ struct gl_program_parameter_list *plist = c->vp->program.Base.Parameters;
+ for (unsigned p = 0; p < plist->NumParameters; p++) {
+ unsigned components = plist->Parameters[p].Size;
+
+ /* Parameters should be either vec4 uniforms or single component
+ * constants; matrices and other larger types should have been broken
+ * down earlier.
+ */
+ assert(components <= 4);
+
+ this->uniform_size[this->uniforms] = 1; /* 1 vec4 */
+ this->uniform_vector_size[this->uniforms] = components;
+ for (unsigned i = 0; i < 4; i++) {
+ c->prog_data.param[this->uniforms * 4 + i] = i >= components ? 0 :
+ &plist->ParameterValues[p][i].f;
+ }
+ this->uniforms++; /* counted in vec4 units */
+ }
+
+ /* PROGRAM_OUTPUT */
+ for (int slot = 0; slot < c->prog_data.vue_map.num_slots; slot++) {
+ int vert_result = c->prog_data.vue_map.slot_to_vert_result[slot];
+ if (vert_result == VERT_RESULT_PSIZ)
+ output_reg[vert_result] = dst_reg(this, glsl_type::float_type);
+ else
+ output_reg[vert_result] = dst_reg(this, glsl_type::vec4_type);
+ assert(output_reg[vert_result].type == BRW_REGISTER_TYPE_F);
+ }
+
+ /* PROGRAM_ADDRESS */
+ this->vp_addr_reg = src_reg(this, glsl_type::int_type);
+ assert(this->vp_addr_reg.type == BRW_REGISTER_TYPE_D);
+}
+
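+/**
+ * Translate a Mesa IR destination register into the corresponding vec4 IR
+ * register set up by setup_vp_regs(), applying its write mask.
+ */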
+dst_reg
+vec4_visitor::get_vp_dst_reg(const prog_dst_register &dst)
+{
+ dst_reg result;
+
+ assert(!dst.RelAddr);
+
+ switch (dst.File) {
+ case PROGRAM_TEMPORARY:
+ result = dst_reg(vp_temp_regs[dst.Index]);
+ break;
+
+ case PROGRAM_OUTPUT:
+ result = output_reg[dst.Index];
+ break;
+
+ case PROGRAM_ADDRESS: {
+ assert(dst.Index == 0);
+ result = dst_reg(this->vp_addr_reg);
+ break;
+ }
+
+ case PROGRAM_UNDEFINED:
+ return dst_null_f();
+
+ default:
+      assert(!"vec4_vp: bad destination register file");
+ return dst_reg(this, glsl_type::vec4_type);
+ }
+
+ result.writemask = dst.WriteMask;
+ return result;
+}
+
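+/**
+ * Translate a Mesa IR source register into vec4 IR, handling relative
+ * addressing of the parameter array, constant vs. state-variable parameters,
+ * and the swizzle/zero/one/negate options.
+ */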
+src_reg
+vec4_visitor::get_vp_src_reg(const prog_src_register &src)
+{
+ struct gl_program_parameter_list *plist = c->vp->program.Base.Parameters;
+
+ src_reg result;
+
+ assert(!src.Abs);
+
+ switch (src.File) {
+ case PROGRAM_UNDEFINED:
+ return src_reg(brw_null_reg());
+
+ case PROGRAM_TEMPORARY:
+ result = vp_temp_regs[src.Index];
+ break;
+
+ case PROGRAM_INPUT:
+ result = src_reg(ATTR, src.Index, glsl_type::vec4_type);
+ result.type = BRW_REGISTER_TYPE_F;
+ break;
+
+ case PROGRAM_ADDRESS: {
+ assert(src.Index == 0);
+ result = this->vp_addr_reg;
+ break;
+ }
+
+ case PROGRAM_STATE_VAR:
+ case PROGRAM_CONSTANT:
+ /* From the ARB_vertex_program specification:
+ * "Relative addressing can only be used for accessing program
+ * parameter arrays."
+ */
+ if (src.RelAddr) {
+         /* Since we have no idea what the base of the array is, we need to
+          * upload ALL constants as pull constants.
+          */
+ this->need_all_constants_in_pull_buffer = true;
+
+ /* Add the small constant index to the address register */
+ src_reg reladdr = src_reg(this, glsl_type::int_type);
+ dst_reg dst_reladdr = dst_reg(reladdr);
+ dst_reladdr.writemask = WRITEMASK_X;
+ emit(ADD(dst_reladdr, this->vp_addr_reg, src_reg(src.Index)));
+
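+         /* Pre-gen6, the pull constant load message takes a byte offset
+          * rather than a vec4 (16-byte) index.
+          */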
+ if (intel->gen < 6)
+ emit(MUL(dst_reladdr, reladdr, src_reg(16)));
+
+ #if 0
+ assert(src.Index < this->uniforms);
+ result = src_reg(dst_reg(UNIFORM, 0));
+ result.type = BRW_REGISTER_TYPE_F;
+ result.reladdr = new(mem_ctx) src_reg();
+ memcpy(result.reladdr, &reladdr, sizeof(src_reg));
+ #endif
+
+ result = src_reg(this, glsl_type::vec4_type);
+ src_reg surf_index = src_reg(unsigned(SURF_INDEX_VERT_CONST_BUFFER));
+ vec4_instruction *load =
+ new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
+ dst_reg(result), surf_index, reladdr);
+ load->base_mrf = 14;
+ load->mlen = 1;
+ emit(load);
+ break;
+ }
+
+      /* We actually want to look at the type in the Parameters list for this,
+       * because parameters that are literal constants can then be emitted as
+       * immediate values instead of uniforms.
+       */
+ switch (plist->Parameters[src.Index].Type) {
+ case PROGRAM_CONSTANT:
+ result = src_reg(this, glsl_type::vec4_type);
+ for (int i = 0; i < 4; i++) {
+ dst_reg t = dst_reg(result);
+ t.writemask = 1 << i;
+ emit(MOV(t, src_reg(plist->ParameterValues[src.Index][i].f)));
+ }
+ break;
+
+ case PROGRAM_STATE_VAR:
+ assert(src.Index < this->uniforms);
+ result = src_reg(dst_reg(UNIFORM, src.Index));
+ result.type = BRW_REGISTER_TYPE_F;
+ break;
+
+ default:
+ _mesa_problem(ctx, "bad uniform src register file: %s\n",
+ _mesa_register_file_name((gl_register_file)src.File));
+ return src_reg(this, glsl_type::vec4_type);
+ }
+ break;
+
+ default:
+ _mesa_problem(ctx, "bad src register file: %s\n",
+ _mesa_register_file_name((gl_register_file)src.File));
+ return src_reg(this, glsl_type::vec4_type);
+ }
+
+ if (src.Swizzle != SWIZZLE_NOOP || src.Negate) {
+ unsigned short zeros_mask = 0;
+ unsigned short ones_mask = 0;
+ unsigned short src_mask = 0;
+ unsigned short src_swiz[4];
+
+ for (int i = 0; i < 4; i++) {
+ src_swiz[i] = 0; /* initialize for safety */
+
+         /* The ZERO, ONE, and Negate options are only used for OPCODE_SWZ,
+          * but it's simplest to handle them here.
+          */
+ int s = GET_SWZ(src.Swizzle, i);
+ switch (s) {
+ case SWIZZLE_X:
+ case SWIZZLE_Y:
+ case SWIZZLE_Z:
+ case SWIZZLE_W:
+ src_mask |= 1 << i;
+ src_swiz[i] = s;
+ break;
+ case SWIZZLE_ZERO:
+ zeros_mask |= 1 << i;
+ break;
+ case SWIZZLE_ONE:
+ ones_mask |= 1 << i;
+ break;
+ }
+ }
+
+ result.swizzle =
+ BRW_SWIZZLE4(src_swiz[0], src_swiz[1], src_swiz[2], src_swiz[3]);
+
+ /* The hardware doesn't natively handle the SWZ instruction's zero/one
+ * swizzles or per-component negation, so we need to use a temporary.
+ */
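+      /* e.g. "SWZ dst, src, -x,y,0,1" yields src_mask = 0x3,
+       * zeros_mask = 0x4, ones_mask = 0x8, and src.Negate = 0x1.
+       */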
+ if (zeros_mask || ones_mask || src.Negate) {
+ src_reg temp_src(this, glsl_type::vec4_type);
+ dst_reg temp(temp_src);
+
+ if (src_mask) {
+ temp.writemask = src_mask;
+ emit(MOV(temp, result));
+ }
+
+ if (zeros_mask) {
+ temp.writemask = zeros_mask;
+ emit(MOV(temp, src_reg(0.0f)));
+ }
+
+ if (ones_mask) {
+ temp.writemask = ones_mask;
+ emit(MOV(temp, src_reg(1.0f)));
+ }
+
+ if (src.Negate) {
+ temp.writemask = src.Negate;
+ src_reg neg(temp_src);
+ neg.negate = true;
+ emit(MOV(temp, neg));
+ }
+ result = temp_src;
+ }
+ }
+
+ return result;
+}