case BRW_REGISTER_TYPE_UD:
brw_reg = brw_imm_ud(src[i].fixed_hw_reg.dw1.ud);
break;
+ case BRW_REGISTER_TYPE_VF:
+ brw_reg = brw_imm_vf(src[i].fixed_hw_reg.dw1.ud);
+ break;
default:
unreachable("not reached");
}
brw_math_function(inst->opcode),
inst->base_mrf,
src,
- BRW_MATH_DATA_VECTOR,
BRW_MATH_PRECISION_FULL);
}
brw_math_function(inst->opcode),
inst->base_mrf,
op0,
- BRW_MATH_DATA_VECTOR,
BRW_MATH_PRECISION_FULL);
}
* use an implied move from g0 to the first message register.
*/
if (inst->header_present) {
- if (brw->gen < 6 && !inst->texture_offset) {
+ if (brw->gen < 6 && !inst->offset) {
/* Set up an implied move from g0 to the MRF. */
src = brw_vec8_grf(0, 0);
} else {
brw_set_default_access_mode(p, BRW_ALIGN_1);
- if (inst->texture_offset) {
+ if (inst->offset) {
/* Set the texel offset bits in DWord 2. */
brw_MOV(p, get_element_ud(header, 2),
- brw_imm_ud(inst->texture_offset));
+ brw_imm_ud(inst->offset));
}
brw_adjust_sampler_state_pointer(p, header, sampler_index, dst);
BRW_URB_SWIZZLE_INTERLEAVE);
}
+/* Emit a URB write with ALLOCATE_COMPLETE set so the URB unit returns a
+ * freshly allocated URB handle in the writeback register (src0), then
+ * copy that handle into dst.0 for use by subsequent URB writes.
+ */
+void
+vec4_generator::generate_gs_urb_write_allocate(vec4_instruction *inst)
+{
+ struct brw_reg src = brw_message_reg(inst->base_mrf);
+
+ /* We pass the temporary passed in src0 as the writeback register */
+ brw_urb_WRITE(p,
+ inst->get_src(this->prog_data, 0), /* dest */
+ inst->base_mrf, /* starting mrf reg nr */
+ src,
+ BRW_URB_WRITE_ALLOCATE_COMPLETE,
+ inst->mlen,
+ 1, /* response len */
+ inst->offset, /* urb destination offset */
+ BRW_URB_SWIZZLE_INTERLEAVE);
+
+ /* Now put allocated urb handle in dst.0 */
+ brw_push_insn_state(p);
+ /* Align1 + WE_all: a single-channel scalar move with no execution mask. */
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+ brw_MOV(p, get_element_ud(inst->get_dst(), 0),
+ get_element_ud(inst->get_src(this->prog_data, 0), 0));
+ brw_set_default_access_mode(p, BRW_ALIGN_16);
+ brw_pop_insn_state(p);
+}
+
void
vec4_generator::generate_gs_thread_end(vec4_instruction *inst)
{
brw_null_reg(), /* dest */
inst->base_mrf, /* starting mrf reg nr */
src,
- BRW_URB_WRITE_EOT,
+ BRW_URB_WRITE_EOT | inst->urb_write_flags,
brw->gen >= 8 ? 2 : 1,/* message len */
0, /* response len */
0, /* urb destination offset */
*
* We can do this with the following EU instruction:
*
- * mul(2) dst.3<1>UD src0<8;2,4>UD src1 { Align1 WE_all }
+ * mul(2) dst.3<1>UD src0<8;2,4>UD src1<...>UW { Align1 WE_all }
*/
brw_push_insn_state(p);
brw_set_default_access_mode(p, BRW_ALIGN_1);
brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+ assert(brw->gen >= 7 &&
+ src1.file == BRW_IMMEDIATE_VALUE &&
+ src1.type == BRW_REGISTER_TYPE_UD &&
+ src1.dw1.ud <= USHRT_MAX);
brw_MUL(p, suboffset(stride(dst, 2, 2, 1), 3), stride(src0, 8, 2, 4),
- src1);
+ retype(src1, BRW_REGISTER_TYPE_UW));
brw_set_default_access_mode(p, BRW_ALIGN_16);
brw_pop_insn_state(p);
}
}
void
-vec4_generator::generate_gs_set_dword_2_immed(struct brw_reg dst,
- struct brw_reg src)
+/* Write one vertex's data (src0) out to the streamed vertex buffer
+ * binding selected by inst->sol_binding.  On the final write
+ * (inst->sol_final_write) we request a write commit and then stall on
+ * it so later code can safely assume the write has landed.
+ */
+vec4_generator::generate_gs_svb_write(vec4_instruction *inst,
+ struct brw_reg dst,
+ struct brw_reg src0,
+ struct brw_reg src1)
+{
+ int binding = inst->sol_binding;
+ bool final_write = inst->sol_final_write;
+
+ brw_push_insn_state(p);
+ /* Copy Vertex data into M0.x */
+ brw_MOV(p, stride(dst, 4, 4, 1),
+ stride(retype(src0, BRW_REGISTER_TYPE_UD), 4, 4, 1));
+
+ /* Send SVB Write.  Only the final write asks for a writeback (commit)
+ * message; earlier writes use the null register as destination.
+ */
+ brw_svb_write(p,
+ final_write ? src1 : brw_null_reg(), /* dest == src1 */
+ 1, /* msg_reg_nr */
+ dst, /* src0 == previous dst */
+ SURF_INDEX_GEN6_SOL_BINDING(binding), /* binding_table_index */
+ final_write); /* send_commit_msg */
+
+ /* Finally, wait for the write commit to occur so that we can proceed to
+ * other things safely.
+ *
+ * From the Sandybridge PRM, Volume 4, Part 1, Section 3.3:
+ *
+ * The write commit does not modify the destination register, but
+ * merely clears the dependency associated with the destination
+ * register. Thus, a simple "mov" instruction using the register as a
+ * source is sufficient to wait for the write commit to occur.
+ */
+ if (final_write) {
+ brw_MOV(p, src1, src1);
+ }
+ brw_pop_insn_state(p);
+}
+
+/* Select which SVB destination index to use for the next SVB write:
+ * copy element inst->sol_vertex of src into element 5 of dst as an
+ * unmasked scalar move.
+ */
+void
+vec4_generator::generate_gs_svb_set_destination_index(vec4_instruction *inst,
+ struct brw_reg dst,
+ struct brw_reg src)
{
- assert(src.file == BRW_IMMEDIATE_VALUE);
+ int vertex = inst->sol_vertex;
brw_push_insn_state(p);
brw_set_default_access_mode(p, BRW_ALIGN_1);
brw_set_default_mask_control(p, BRW_MASK_DISABLE);
- brw_MOV(p, suboffset(vec1(dst), 2), src);
- brw_set_default_access_mode(p, BRW_ALIGN_16);
+ brw_MOV(p, get_element_ud(dst, 5), get_element_ud(src, vertex));
+ brw_pop_insn_state(p);
+}
+
+/* Copy DWord 0 of src into DWord 2 of dst as a single unmasked scalar
+ * move (Align1 + WE_all).  Unlike the SET_DWORD_2_IMMED opcode this
+ * replaces, src may be a register rather than an immediate.
+ */
+void
+vec4_generator::generate_gs_set_dword_2(struct brw_reg dst, struct brw_reg src)
+{
+ brw_push_insn_state(p);
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+ brw_MOV(p, suboffset(vec1(dst), 2), suboffset(vec1(src), 0));
brw_pop_insn_state(p);
}
brw_pop_insn_state(p);
}
+/* Pack two 16-bit counts into dst.0: src0 in bits 31:16 and src1 in
+ * bits 15:0.  NOTE: src2 is used as a scratch register and its first
+ * element is overwritten.
+ */
+void
+vec4_generator::generate_gs_ff_sync_set_primitives(struct brw_reg dst,
+ struct brw_reg src0,
+ struct brw_reg src1,
+ struct brw_reg src2)
+{
+ brw_push_insn_state(p);
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+ /* Save src0 data in 16:31 bits of dst.0 */
+ brw_AND(p, suboffset(vec1(dst), 0), suboffset(vec1(src0), 0),
+ brw_imm_ud(0xffffu));
+ brw_SHL(p, suboffset(vec1(dst), 0), suboffset(vec1(dst), 0), brw_imm_ud(16));
+ /* Save src1 data in 0:15 bits of dst.0 (masked into src2 as scratch,
+ * then OR'd into the low half).
+ */
+ brw_AND(p, suboffset(vec1(src2), 0), suboffset(vec1(src1), 0),
+ brw_imm_ud(0xffffu));
+ brw_OR(p, suboffset(vec1(dst), 0),
+ suboffset(vec1(dst), 0),
+ suboffset(vec1(src2), 0));
+ brw_pop_insn_state(p);
+}
+
+/* Emit the FF_SYNC message: build its header in the implied MRF
+ * (expected to hold r0 on entry), allocate a URB handle into dst, then
+ * copy the allocated handle back into header.0 for the URB writes that
+ * follow.
+ */
+void
+vec4_generator::generate_gs_ff_sync(vec4_instruction *inst,
+ struct brw_reg dst,
+ struct brw_reg src0,
+ struct brw_reg src1)
+{
+ /* This opcode uses an implied MRF register for:
+ * - the header of the ff_sync message. And as such it is expected to be
+ * initialized to r0 before calling here.
+ * - the destination where we will write the allocated URB handle.
+ */
+ struct brw_reg header =
+ retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD);
+
+ /* Overwrite dword 0 of the header (SO vertices to write) and
+ * dword 1 (number of primitives written).
+ */
+ brw_push_insn_state(p);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+ brw_MOV(p, get_element_ud(header, 0), get_element_ud(src1, 0));
+ brw_MOV(p, get_element_ud(header, 1), get_element_ud(src0, 0));
+ brw_pop_insn_state(p);
+
+ /* Allocate URB handle in dst */
+ brw_ff_sync(p,
+ dst,
+ 0,
+ header,
+ 1, /* allocate */
+ 1, /* response length */
+ 0 /* eot */);
+
+ /* Now put allocated urb handle in header.0 */
+ brw_push_insn_state(p);
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+ brw_MOV(p, get_element_ud(header, 0), get_element_ud(dst, 0));
+
+ /* src1 is not an immediate when we use transform feedback; copy the
+ * writeback data from dst row 1 back into src1's register in that case.
+ */
+ if (src1.file != BRW_IMMEDIATE_VALUE)
+ brw_MOV(p, brw_vec4_grf(src1.nr, 0), brw_vec4_grf(dst.nr, 1));
+
+ brw_pop_insn_state(p);
+}
+
+/* Copy PrimitiveID, delivered in R0.1 of the thread payload on gen6,
+ * into dst.0 as a single unmasked scalar move.
+ */
+void
+vec4_generator::generate_gs_set_primitive_id(struct brw_reg dst)
+{
+ /* In gen6, PrimitiveID is delivered in R0.1 of the payload */
+ struct brw_reg src = brw_vec8_grf(0, 0);
+ brw_push_insn_state(p);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+ brw_MOV(p, get_element_ud(dst, 0), get_element_ud(src, 1));
+ brw_pop_insn_state(p);
+}
+
void
vec4_generator::generate_oword_dual_block_offsets(struct brw_reg m1,
struct brw_reg index)
{
struct annotation_info annotation;
memset(&annotation, 0, sizeof(annotation));
+ int loop_count = 0;
foreach_block_and_inst (block, vec4_instruction, inst, cfg) {
struct brw_reg src[3], dst;
}
switch (inst->opcode) {
+ case VEC4_OPCODE_UNPACK_UNIFORM:
case BRW_OPCODE_MOV:
brw_MOV(p, dst, src[0]);
break;
case BRW_OPCODE_WHILE:
brw_WHILE(p);
+ loop_count++;
break;
case SHADER_OPCODE_RCP:
case SHADER_OPCODE_LOG2:
case SHADER_OPCODE_SIN:
case SHADER_OPCODE_COS:
+ assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
if (brw->gen >= 7) {
gen6_math(p, dst, brw_math_function(inst->opcode), src[0],
brw_null_reg());
case SHADER_OPCODE_POW:
case SHADER_OPCODE_INT_QUOTIENT:
case SHADER_OPCODE_INT_REMAINDER:
+ assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
if (brw->gen >= 7) {
gen6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]);
} else if (brw->gen == 6) {
generate_gs_urb_write(inst);
break;
+ case GS_OPCODE_URB_WRITE_ALLOCATE:
+ generate_gs_urb_write_allocate(inst);
+ break;
+
+ case GS_OPCODE_SVB_WRITE:
+ generate_gs_svb_write(inst, dst, src[0], src[1]);
+ break;
+
+ case GS_OPCODE_SVB_SET_DST_INDEX:
+ generate_gs_svb_set_destination_index(inst, dst, src[0]);
+ break;
+
case GS_OPCODE_THREAD_END:
generate_gs_thread_end(inst);
break;
generate_gs_set_vertex_count(dst, src[0]);
break;
- case GS_OPCODE_SET_DWORD_2_IMMED:
- generate_gs_set_dword_2_immed(dst, src[0]);
+ case GS_OPCODE_FF_SYNC:
+ generate_gs_ff_sync(inst, dst, src[0], src[1]);
+ break;
+
+ case GS_OPCODE_FF_SYNC_SET_PRIMITIVES:
+ generate_gs_ff_sync_set_primitives(dst, src[0], src[1], src[2]);
+ break;
+
+ case GS_OPCODE_SET_PRIMITIVE_ID:
+ generate_gs_set_primitive_id(dst);
+ break;
+
+ case GS_OPCODE_SET_DWORD_2:
+ generate_gs_set_dword_2(dst, src[0]);
break;
case GS_OPCODE_PREPARE_CHANNEL_MASKS:
generate_unpack_flags(inst, dst);
break;
+ case VEC4_OPCODE_PACK_BYTES: {
+ /* Is effectively:
+ *
+ * mov(8) dst<16,4,1>:UB src<4,1,0>:UB
+ *
+ * but destinations' only regioning is horizontal stride, so instead we
+ * have to use two instructions:
+ *
+ * mov(4) dst<1>:UB src<4,1,0>:UB
+ * mov(4) dst.16<1>:UB src.16<4,1,0>:UB
+ *
+ * where they pack the four bytes from the low and high four DW.
+ */
+ assert(is_power_of_two(dst.dw1.bits.writemask) &&
+ dst.dw1.bits.writemask != 0);
+ unsigned offset = __builtin_ctz(dst.dw1.bits.writemask);
+
+ dst.type = BRW_REGISTER_TYPE_UB;
+
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+
+ src[0].type = BRW_REGISTER_TYPE_UB;
+ src[0].vstride = BRW_VERTICAL_STRIDE_4;
+ src[0].width = BRW_WIDTH_1;
+ src[0].hstride = BRW_HORIZONTAL_STRIDE_0;
+ dst.subnr = offset * 4;
+ struct brw_inst *insn = brw_MOV(p, dst, src[0]);
+ brw_inst_set_exec_size(brw, insn, BRW_EXECUTE_4);
+ brw_inst_set_no_dd_clear(brw, insn, true);
+ brw_inst_set_no_dd_check(brw, insn, inst->no_dd_check);
+
+ src[0].subnr = 16;
+ dst.subnr = 16 + offset * 4;
+ insn = brw_MOV(p, dst, src[0]);
+ brw_inst_set_exec_size(brw, insn, BRW_EXECUTE_4);
+ brw_inst_set_no_dd_clear(brw, insn, inst->no_dd_clear);
+ brw_inst_set_no_dd_check(brw, insn, true);
+
+ brw_set_default_access_mode(p, BRW_ALIGN_16);
+ break;
+ }
+
default:
if (inst->opcode < (int) ARRAY_SIZE(opcode_descs)) {
_mesa_problem(&brw->ctx, "Unsupported opcode in `%s' in vec4\n",
abort();
}
- if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) {
+ if (inst->opcode == VEC4_OPCODE_PACK_BYTES) {
+ /* Handled dependency hints in the generator. */
+
+ assert(!inst->conditional_mod);
+ } else if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) {
assert(p->nr_insn == pre_emit_nr_insn + 1 ||
!"conditional_mod, no_dd_check, or no_dd_clear set for IR "
"emitting more than 1 instruction");
brw_inst *last = &p->store[pre_emit_nr_insn];
- brw_inst_set_cond_modifier(brw, last, inst->conditional_mod);
+ if (inst->conditional_mod)
+ brw_inst_set_cond_modifier(brw, last, inst->conditional_mod);
brw_inst_set_no_dd_clear(brw, last, inst->no_dd_clear);
brw_inst_set_no_dd_check(brw, last, inst->no_dd_check);
}
} else {
fprintf(stderr, "Native code for vertex program %d:\n", prog->Id);
}
- fprintf(stderr, "vec4 shader: %d instructions. Compacted %d to %d"
+ fprintf(stderr, "vec4 shader: %d instructions. %d loops. Compacted %d to %d"
" bytes (%.0f%%)\n",
- before_size / 16, before_size, after_size,
+ before_size / 16, loop_count, before_size, after_size,
100.0f * (before_size - after_size) / before_size);
dump_assembly(p->store, annotation.ann_count, annotation.ann, brw, prog);