diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
index 07e29a5d32f..898a6db4abf 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
@@ -20,6 +20,7 @@
  * IN THE SOFTWARE.
  */
 
+#include <limits.h>
 #include "brw_vec4.h"
 #include "brw_cfg.h"
 
@@ -66,7 +67,7 @@ vec4_instruction::get_dst(void)
 }
 
 struct brw_reg
-vec4_instruction::get_src(const struct brw_vec4_prog_data *prog_data, int i)
+vec4_instruction::get_src(const struct brw_vue_prog_data *prog_data, int i)
 {
    struct brw_reg brw_reg;
 
@@ -84,14 +85,17 @@ vec4_instruction::get_src(const struct brw_vec4_prog_data *prog_data, int i)
    case IMM:
       switch (src[i].type) {
       case BRW_REGISTER_TYPE_F:
-         brw_reg = brw_imm_f(src[i].imm.f);
+         brw_reg = brw_imm_f(src[i].fixed_hw_reg.dw1.f);
          break;
       case BRW_REGISTER_TYPE_D:
-         brw_reg = brw_imm_d(src[i].imm.i);
+         brw_reg = brw_imm_d(src[i].fixed_hw_reg.dw1.d);
          break;
       case BRW_REGISTER_TYPE_UD:
-         brw_reg = brw_imm_ud(src[i].imm.u);
+         brw_reg = brw_imm_ud(src[i].fixed_hw_reg.dw1.ud);
          break;
+      case BRW_REGISTER_TYPE_VF:
+         brw_reg = brw_imm_vf(src[i].fixed_hw_reg.dw1.ud);
+         break;
       default:
          unreachable("not reached");
       }
@@ -133,11 +137,14 @@ vec4_instruction::get_src(const struct brw_vec4_prog_data *prog_data, int i)
 vec4_generator::vec4_generator(struct brw_context *brw,
                                struct gl_shader_program *shader_prog,
                                struct gl_program *prog,
-                               struct brw_vec4_prog_data *prog_data,
+                               struct brw_vue_prog_data *prog_data,
                                void *mem_ctx,
-                               bool debug_flag)
+                               bool debug_flag,
+                               const char *stage_name,
+                               const char *stage_abbrev)
    : brw(brw), shader_prog(shader_prog), prog(prog), prog_data(prog_data),
-     mem_ctx(mem_ctx), debug_flag(debug_flag)
+     mem_ctx(mem_ctx), stage_name(stage_name), stage_abbrev(stage_abbrev),
+     debug_flag(debug_flag)
 {
    p = rzalloc(mem_ctx, struct brw_compile);
    brw_init_compile(brw, p, mem_ctx);
@@ -157,7 +164,6 @@ vec4_generator::generate_math1_gen4(vec4_instruction *inst,
             brw_math_function(inst->opcode),
             inst->base_mrf,
             src,
-            BRW_MATH_DATA_VECTOR,
             BRW_MATH_PRECISION_FULL);
 }
 
@@ -218,14 +224,14 @@ vec4_generator::generate_math2_gen4(vec4_instruction *inst,
             brw_math_function(inst->opcode),
             inst->base_mrf,
             op0,
-            BRW_MATH_DATA_VECTOR,
             BRW_MATH_PRECISION_FULL);
 }
 
 void
 vec4_generator::generate_tex(vec4_instruction *inst,
                              struct brw_reg dst,
-                             struct brw_reg src)
+                             struct brw_reg src,
+                             struct brw_reg sampler_index)
 {
    int msg_type = -1;
 
@@ -313,17 +319,20 @@ vec4_generator::generate_tex(vec4_instruction *inst,
 
    assert(msg_type != -1);
 
+   assert(sampler_index.type == BRW_REGISTER_TYPE_UD);
+
    /* Load the message header if present.  If there's a texture offset, we need
    * to set it up explicitly and load the offset bitfield.  Otherwise, we can
    * use an implied move from g0 to the first message register.
    */
   if (inst->header_present) {
-     if (brw->gen < 6 && !inst->texture_offset) {
+     if (brw->gen < 6 && !inst->offset) {
         /* Set up an implied move from g0 to the MRF. */
         src = brw_vec8_grf(0, 0);
      } else {
         struct brw_reg header =
           retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD);
+        uint32_t dw2 = 0;
 
        /* Explicitly set up the message header by copying g0 to the MRF. */
        brw_push_insn_state(p);
@@ -332,28 +341,20 @@ vec4_generator::generate_tex(vec4_instruction *inst,
         brw_set_default_access_mode(p, BRW_ALIGN_1);
 
-        if (inst->texture_offset) {
+        if (inst->offset)
            /* Set the texel offset bits in DWord 2. */
-           brw_MOV(p, get_element_ud(header, 2),
-                   brw_imm_ud(inst->texture_offset));
-        }
+           dw2 = inst->offset;
 
-        if (inst->sampler >= 16) {
-           /* The "Sampler Index" field can only store values between 0 and 15.
-            * However, we can add an offset to the "Sampler State Pointer"
-            * field, effectively selecting a different set of 16 samplers.
-            *
-            * The "Sampler State Pointer" needs to be aligned to a 32-byte
-            * offset, and each sampler state is only 16-bytes, so we can't
-            * exclusively use the offset - we have to use both.
+        if (brw->gen >= 9)
+           /* SKL+ overloads BRW_SAMPLER_SIMD_MODE_SIMD4X2 to also do SIMD8D,
+            * based on bit 22 in the header.
             */
-           assert(brw->gen >= 8 || brw->is_haswell);
-           brw_ADD(p,
-                   get_element_ud(header, 3),
-                   get_element_ud(brw_vec8_grf(0, 0), 3),
-                   brw_imm_ud(16 * (inst->sampler / 16) *
-                              sizeof(gen7_sampler_state)));
-        }
+           dw2 |= GEN9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2;
+
+        if (dw2)
+           brw_MOV(p, get_element_ud(header, 2), brw_imm_ud(dw2));
+
+        brw_adjust_sampler_state_pointer(p, header, sampler_index);
         brw_pop_insn_state(p);
      }
   }
@@ -372,25 +373,81 @@ vec4_generator::generate_tex(vec4_instruction *inst,
       break;
    }
 
-   uint32_t surface_index = ((inst->opcode == SHADER_OPCODE_TG4 ||
-                              inst->opcode == SHADER_OPCODE_TG4_OFFSET)
-                             ? prog_data->base.binding_table.gather_texture_start
-                             : prog_data->base.binding_table.texture_start) + inst->sampler;
-
-   brw_SAMPLE(p,
-              dst,
-              inst->base_mrf,
-              src,
-              surface_index,
-              inst->sampler % 16,
-              msg_type,
-              1, /* response length */
-              inst->mlen,
-              inst->header_present,
-              BRW_SAMPLER_SIMD_MODE_SIMD4X2,
-              return_format);
-
-   brw_mark_surface_used(&prog_data->base, surface_index);
+   uint32_t base_binding_table_index = (inst->opcode == SHADER_OPCODE_TG4 ||
+         inst->opcode == SHADER_OPCODE_TG4_OFFSET)
+         ? prog_data->base.binding_table.gather_texture_start
+         : prog_data->base.binding_table.texture_start;
+
+   if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
+      uint32_t sampler = sampler_index.dw1.ud;
+
+      brw_SAMPLE(p,
+                 dst,
+                 inst->base_mrf,
+                 src,
+                 sampler + base_binding_table_index,
+                 sampler % 16,
+                 msg_type,
+                 1, /* response length */
+                 inst->mlen,
+                 inst->header_present,
+                 BRW_SAMPLER_SIMD_MODE_SIMD4X2,
+                 return_format);
+
+      brw_mark_surface_used(&prog_data->base, sampler + base_binding_table_index);
+   } else {
+      /* Non-constant sampler index. */
+      /* Note: this clobbers `dst` as a temporary before emitting the send */
+
+      struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
+      struct brw_reg temp = vec1(retype(dst, BRW_REGISTER_TYPE_UD));
+
+      struct brw_reg sampler_reg = vec1(retype(sampler_index, BRW_REGISTER_TYPE_UD));
+
+      brw_push_insn_state(p);
+      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+      brw_set_default_access_mode(p, BRW_ALIGN_1);
+
+      /* Some care required: `sampler` and `temp` may alias:
+       *    addr = sampler & 0xff
+       *    temp = (sampler << 8) & 0xf00
+       *    addr = addr | temp
+       */
+      brw_ADD(p, addr, sampler_reg, brw_imm_ud(base_binding_table_index));
+      brw_SHL(p, temp, sampler_reg, brw_imm_ud(8u));
+      brw_AND(p, temp, temp, brw_imm_ud(0x0f00));
+      brw_AND(p, addr, addr, brw_imm_ud(0x0ff));
+      brw_OR(p, addr, addr, temp);
+
+      /* a0.0 |= <descriptor> */
+      brw_inst *insn_or = brw_next_insn(p, BRW_OPCODE_OR);
+      brw_set_sampler_message(p, insn_or,
+                              0 /* surface */,
+                              0 /* sampler */,
+                              msg_type,
+                              1 /* rlen */,
+                              inst->mlen /* mlen */,
+                              inst->header_present /* header */,
+                              BRW_SAMPLER_SIMD_MODE_SIMD4X2,
+                              return_format);
+      brw_inst_set_exec_size(p->brw, insn_or, BRW_EXECUTE_1);
+      brw_inst_set_src1_reg_type(p->brw, insn_or, BRW_REGISTER_TYPE_UD);
+      brw_set_src0(p, insn_or, addr);
+      brw_set_dest(p, insn_or, addr);
+
+
+      /* dst = send(offset, a0.0) */
+      brw_inst *insn_send = brw_next_insn(p, BRW_OPCODE_SEND);
+      brw_set_dest(p, insn_send, dst);
+      brw_set_src0(p, insn_send, src);
+      brw_set_indirect_send_descriptor(p, insn_send, BRW_SFID_SAMPLER, addr);
+
+      brw_pop_insn_state(p);
+
+      /* visitor knows more than we do about the surface limit required,
+       * so has already done marking.
+       */
+   }
 }
 
 void
@@ -422,6 +479,32 @@ vec4_generator::generate_gs_urb_write(vec4_instruction *inst)
                  BRW_URB_SWIZZLE_INTERLEAVE);
 }
 
+void
+vec4_generator::generate_gs_urb_write_allocate(vec4_instruction *inst)
+{
+   struct brw_reg src = brw_message_reg(inst->base_mrf);
+
+   /* We pass the temporary passed in src0 as the writeback register */
+   brw_urb_WRITE(p,
+                 inst->get_src(this->prog_data, 0), /* dest */
+                 inst->base_mrf, /* starting mrf reg nr */
+                 src,
+                 BRW_URB_WRITE_ALLOCATE_COMPLETE,
+                 inst->mlen,
+                 1, /* response len */
+                 inst->offset,  /* urb destination offset */
+                 BRW_URB_SWIZZLE_INTERLEAVE);
+
+   /* Now put allocated urb handle in dst.0 */
+   brw_push_insn_state(p);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+   brw_MOV(p, get_element_ud(inst->get_dst(), 0),
+           get_element_ud(inst->get_src(this->prog_data, 0), 0));
+   brw_set_default_access_mode(p, BRW_ALIGN_16);
+   brw_pop_insn_state(p);
+}
+
 void
 vec4_generator::generate_gs_thread_end(vec4_instruction *inst)
 {
@@ -430,8 +513,8 @@ vec4_generator::generate_gs_thread_end(vec4_instruction *inst)
                  brw_null_reg(), /* dest */
                  inst->base_mrf, /* starting mrf reg nr */
                  src,
-                 BRW_URB_WRITE_EOT,
-                 1, /* message len */
+                 BRW_URB_WRITE_EOT | inst->urb_write_flags,
+                 brw->gen >= 8 ? 2 : 1, /* message len */
                  0, /* response len */
                  0, /* urb destination offset */
                  BRW_URB_SWIZZLE_INTERLEAVE);
@@ -458,13 +541,17 @@ vec4_generator::generate_gs_set_write_offset(struct brw_reg dst,
     *
     * We can do this with the following EU instruction:
     *
-    *     mul(2) dst.3<1>UD src0<8;2,4>UD src1   { Align1 WE_all }
+    *     mul(2) dst.3<1>UD src0<8;2,4>UD src1<...>UW { Align1 WE_all }
     */
    brw_push_insn_state(p);
    brw_set_default_access_mode(p, BRW_ALIGN_1);
    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+   assert(brw->gen >= 7 &&
+          src1.file == BRW_IMMEDIATE_VALUE &&
+          src1.type == BRW_REGISTER_TYPE_UD &&
+          src1.dw1.ud <= USHRT_MAX);
    brw_MUL(p, suboffset(stride(dst, 2, 2, 1), 3), stride(src0, 8, 2, 4),
-           src1);
+           retype(src1, BRW_REGISTER_TYPE_UW));
    brw_set_default_access_mode(p, BRW_ALIGN_16);
    brw_pop_insn_state(p);
 }
@@ -474,39 +561,94 @@ vec4_generator::generate_gs_set_vertex_count(struct brw_reg dst,
                                              struct brw_reg src)
 {
    brw_push_insn_state(p);
-   brw_set_default_access_mode(p, BRW_ALIGN_1);
    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
 
-   /* If we think of the src and dst registers as composed of 8 DWORDs each,
-    * we want to pick up the contents of DWORDs 0 and 4 from src, truncate
-    * them to WORDs, and then pack them into DWORD 2 of dst.
-    *
-    * It's easier to get the EU to do this if we think of the src and dst
-    * registers as composed of 16 WORDS each; then, we want to pick up the
-    * contents of WORDs 0 and 8 from src, and pack them into WORDs 4 and 5 of
-    * dst.
+   if (brw->gen >= 8) {
+      /* Move the vertex count into the second MRF for the EOT write. */
+      brw_MOV(p, retype(brw_message_reg(dst.nr + 1), BRW_REGISTER_TYPE_UD),
+              src);
+   } else {
+      /* If we think of the src and dst registers as composed of 8 DWORDs each,
+       * we want to pick up the contents of DWORDs 0 and 4 from src, truncate
+       * them to WORDs, and then pack them into DWORD 2 of dst.
+       *
+       * It's easier to get the EU to do this if we think of the src and dst
+       * registers as composed of 16 WORDS each; then, we want to pick up the
+       * contents of WORDs 0 and 8 from src, and pack them into WORDs 4 and 5
+       * of dst.
+       *
+       * We can do that by the following EU instruction:
+       *
+       *    mov (2) dst.4<1>:uw src<8;1,0>:uw   { Align1, Q1, NoMask }
+       */
+      brw_set_default_access_mode(p, BRW_ALIGN_1);
+      brw_MOV(p,
+              suboffset(stride(retype(dst, BRW_REGISTER_TYPE_UW), 2, 2, 1), 4),
+              stride(retype(src, BRW_REGISTER_TYPE_UW), 8, 1, 0));
+      brw_set_default_access_mode(p, BRW_ALIGN_16);
+   }
+   brw_pop_insn_state(p);
+}
+
+void
+vec4_generator::generate_gs_svb_write(vec4_instruction *inst,
+                                      struct brw_reg dst,
+                                      struct brw_reg src0,
+                                      struct brw_reg src1)
+{
+   int binding = inst->sol_binding;
+   bool final_write = inst->sol_final_write;
+
+   brw_push_insn_state(p);
+   /* Copy Vertex data into M0.x */
+   brw_MOV(p, stride(dst, 4, 4, 1),
+           stride(retype(src0, BRW_REGISTER_TYPE_UD), 4, 4, 1));
+
+   /* Send SVB Write */
+   brw_svb_write(p,
+                 final_write ? src1 : brw_null_reg(), /* dest == src1 */
+                 1, /* msg_reg_nr */
+                 dst, /* src0 == previous dst */
+                 SURF_INDEX_GEN6_SOL_BINDING(binding), /* binding_table_index */
+                 final_write); /* send_commit_msg */
+
+   /* Finally, wait for the write commit to occur so that we can proceed to
+    * other things safely.
     *
-    * We can do that by the following EU instruction:
+    * From the Sandybridge PRM, Volume 4, Part 1, Section 3.3:
     *
-    *     mov (2) dst.4<1>:uw src<8;1,0>:uw   { Align1, Q1, NoMask }
+    *   The write commit does not modify the destination register, but
+    *   merely clears the dependency associated with the destination
+    *   register. Thus, a simple “mov” instruction using the register as a
+    *   source is sufficient to wait for the write commit to occur.
     */
-   brw_MOV(p, suboffset(stride(retype(dst, BRW_REGISTER_TYPE_UW), 2, 2, 1), 4),
-           stride(retype(src, BRW_REGISTER_TYPE_UW), 8, 1, 0));
-   brw_set_default_access_mode(p, BRW_ALIGN_16);
+   if (final_write) {
+      brw_MOV(p, src1, src1);
+   }
    brw_pop_insn_state(p);
 }
 
 void
-vec4_generator::generate_gs_set_dword_2_immed(struct brw_reg dst,
-                                              struct brw_reg src)
+vec4_generator::generate_gs_svb_set_destination_index(vec4_instruction *inst,
+                                                      struct brw_reg dst,
+                                                      struct brw_reg src)
 {
-   assert(src.file == BRW_IMMEDIATE_VALUE);
+   int vertex = inst->sol_vertex;
    brw_push_insn_state(p);
    brw_set_default_access_mode(p, BRW_ALIGN_1);
    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
-   brw_MOV(p, suboffset(vec1(dst), 2), src);
-   brw_set_default_access_mode(p, BRW_ALIGN_16);
+   brw_MOV(p, get_element_ud(dst, 5), get_element_ud(src, vertex));
+   brw_pop_insn_state(p);
+}
+
+void
+vec4_generator::generate_gs_set_dword_2(struct brw_reg dst, struct brw_reg src)
+{
+   brw_push_insn_state(p);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+   brw_MOV(p, suboffset(vec1(dst), 2), suboffset(vec1(src), 0));
    brw_pop_insn_state(p);
 }
 
@@ -607,6 +749,85 @@ vec4_generator::generate_gs_get_instance_id(struct brw_reg dst)
    brw_pop_insn_state(p);
 }
 
+void
+vec4_generator::generate_gs_ff_sync_set_primitives(struct brw_reg dst,
+                                                   struct brw_reg src0,
+                                                   struct brw_reg src1,
+                                                   struct brw_reg src2)
+{
+   brw_push_insn_state(p);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   /* Save src0 data in 16:31 bits of dst.0 */
+   brw_AND(p, suboffset(vec1(dst), 0), suboffset(vec1(src0), 0),
+           brw_imm_ud(0xffffu));
+   brw_SHL(p, suboffset(vec1(dst), 0), suboffset(vec1(dst), 0), brw_imm_ud(16));
+   /* Save src1 data in 0:15 bits of dst.0 */
+   brw_AND(p, suboffset(vec1(src2), 0), suboffset(vec1(src1), 0),
+           brw_imm_ud(0xffffu));
+   brw_OR(p, suboffset(vec1(dst), 0),
+          suboffset(vec1(dst), 0),
+          suboffset(vec1(src2), 0));
+   brw_pop_insn_state(p);
+}
+
+void
+vec4_generator::generate_gs_ff_sync(vec4_instruction *inst,
+                                    struct brw_reg dst,
+                                    struct brw_reg src0,
+                                    struct brw_reg src1)
+{
+   /* This opcode uses an implied MRF register for:
+    *  - the header of the ff_sync message. And as such it is expected to be
+    *    initialized to r0 before calling here.
+    *  - the destination where we will write the allocated URB handle.
+    */
+   struct brw_reg header =
+      retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD);
+
+   /* Overwrite dword 0 of the header (SO vertices to write) and
+    * dword 1 (number of primitives written).
+    */
+   brw_push_insn_state(p);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_MOV(p, get_element_ud(header, 0), get_element_ud(src1, 0));
+   brw_MOV(p, get_element_ud(header, 1), get_element_ud(src0, 0));
+   brw_pop_insn_state(p);
+
+   /* Allocate URB handle in dst */
+   brw_ff_sync(p,
+               dst,
+               0,
+               header,
+               1, /* allocate */
+               1, /* response length */
+               0 /* eot */);
+
+   /* Now put allocated urb handle in header.0 */
+   brw_push_insn_state(p);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+   brw_MOV(p, get_element_ud(header, 0), get_element_ud(dst, 0));
+
+   /* src1 is not an immediate when we use transform feedback */
+   if (src1.file != BRW_IMMEDIATE_VALUE)
+      brw_MOV(p, brw_vec4_grf(src1.nr, 0), brw_vec4_grf(dst.nr, 1));
+
+   brw_pop_insn_state(p);
+}
+
+void
+vec4_generator::generate_gs_set_primitive_id(struct brw_reg dst)
+{
+   /* In gen6, PrimitiveID is delivered in R0.1 of the payload */
+   struct brw_reg src = brw_vec8_grf(0, 0);
+   brw_push_insn_state(p);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_MOV(p, get_element_ud(dst, 0), get_element_ud(src, 1));
+   brw_pop_insn_state(p);
+}
+
 void
 vec4_generator::generate_oword_dual_block_offsets(struct brw_reg m1,
                                                   struct brw_reg index)
@@ -781,7 +1002,6 @@ vec4_generator::generate_pull_constant_load(vec4_instruction *inst,
                                             struct brw_reg index,
                                             struct brw_reg offset)
 {
-   assert(brw->gen <= 7);
    assert(index.file == BRW_IMMEDIATE_VALUE &&
          index.type == BRW_REGISTER_TYPE_UD);
    uint32_t surf_index = index.dw1.ud;
@@ -828,23 +1048,70 @@ vec4_generator::generate_pull_constant_load_gen7(vec4_instruction *inst,
                                                  struct brw_reg surf_index,
                                                  struct brw_reg offset)
 {
-   assert(surf_index.file == BRW_IMMEDIATE_VALUE &&
-          surf_index.type == BRW_REGISTER_TYPE_UD);
+   assert(surf_index.type == BRW_REGISTER_TYPE_UD);
 
-   brw_inst *insn = brw_next_insn(p, BRW_OPCODE_SEND);
-   brw_set_dest(p, insn, dst);
-   brw_set_src0(p, insn, offset);
-   brw_set_sampler_message(p, insn,
-                           surf_index.dw1.ud,
-                           0, /* LD message ignores sampler unit */
-                           GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
-                           1, /* rlen */
-                           1, /* mlen */
-                           false, /* no header */
-                           BRW_SAMPLER_SIMD_MODE_SIMD4X2,
-                           0);
+   if (surf_index.file == BRW_IMMEDIATE_VALUE) {
 
-   brw_mark_surface_used(&prog_data->base, surf_index.dw1.ud);
+      brw_inst *insn = brw_next_insn(p, BRW_OPCODE_SEND);
+      brw_set_dest(p, insn, dst);
+      brw_set_src0(p, insn, offset);
+      brw_set_sampler_message(p, insn,
+                              surf_index.dw1.ud,
+                              0, /* LD message ignores sampler unit */
+                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
+                              1, /* rlen */
+                              1, /* mlen */
+                              false, /* no header */
+                              BRW_SAMPLER_SIMD_MODE_SIMD4X2,
+                              0);
+
+      brw_mark_surface_used(&prog_data->base, surf_index.dw1.ud);
+
+   } else {
+
+      struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
+
+      brw_push_insn_state(p);
+      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+      brw_set_default_access_mode(p, BRW_ALIGN_1);
+
+      /* a0.0 = surf_index & 0xff */
+      brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
+      brw_inst_set_exec_size(p->brw, insn_and, BRW_EXECUTE_1);
+      brw_set_dest(p, insn_and, addr);
+      brw_set_src0(p, insn_and, vec1(retype(surf_index, BRW_REGISTER_TYPE_UD)));
+      brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));
+
+
+      /* a0.0 |= <descriptor> */
+      brw_inst *insn_or = brw_next_insn(p, BRW_OPCODE_OR);
+      brw_set_sampler_message(p, insn_or,
+                              0 /* surface */,
+                              0 /* sampler */,
+                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
+                              1 /* rlen */,
+                              1 /* mlen */,
+                              false /* header */,
+                              BRW_SAMPLER_SIMD_MODE_SIMD4X2,
+                              0);
+      brw_inst_set_exec_size(p->brw, insn_or, BRW_EXECUTE_1);
+      brw_inst_set_src1_reg_type(p->brw, insn_or, BRW_REGISTER_TYPE_UD);
+      brw_set_src0(p, insn_or, addr);
+      brw_set_dest(p, insn_or, addr);
+
+
+      /* dst = send(offset, a0.0) */
+      brw_inst *insn_send = brw_next_insn(p, BRW_OPCODE_SEND);
+      brw_set_dest(p, insn_send, dst);
+      brw_set_src0(p, insn_send, offset);
+      brw_set_indirect_send_descriptor(p, insn_send, BRW_SFID_SAMPLER, addr);
+
+      brw_pop_insn_state(p);
+
+      /* visitor knows more than we do about the surface limit required,
+       * so has already done marking.
+       */
+   }
 }
 
 void
@@ -880,385 +1147,436 @@ vec4_generator::generate_untyped_surface_read(vec4_instruction *inst,
    brw_mark_surface_used(&prog_data->base, surf_index.dw1.ud);
 }
 
-/**
- * Generate assembly for a Vec4 IR instruction.
- *
- * \param instruction The Vec4 IR instruction to generate code for.
- * \param dst The destination register.
- * \param src An array of up to three source registers.
- */
 void
-vec4_generator::generate_vec4_instruction(vec4_instruction *instruction,
-                                          struct brw_reg dst,
-                                          struct brw_reg *src)
+vec4_generator::generate_code(const cfg_t *cfg)
 {
-   vec4_instruction *inst = (vec4_instruction *) instruction;
-
-   if (dst.width == BRW_WIDTH_4) {
-      /* This happens in attribute fixups for "dual instanced" geometry
-       * shaders, since they use attributes that are vec4's.  Since the exec
-       * width is only 4, it's essential that the caller set
-       * force_writemask_all in order to make sure the instruction is executed
-       * regardless of which channels are enabled.
-       */
-      assert(inst->force_writemask_all);
+   struct annotation_info annotation;
+   memset(&annotation, 0, sizeof(annotation));
+   int loop_count = 0;
 
-      /* Fix up any <8;8,1> or <0;4,1> source registers to <4;4,1> to satisfy
-       * the following register region restrictions (from Graphics BSpec:
-       * 3D-Media-GPGPU Engine > EU Overview > Registers and Register Regions
-       * > Register Region Restrictions)
-       *
-       * 1. ExecSize must be greater than or equal to Width.
-       *
-       * 2. If ExecSize = Width and HorzStride != 0, VertStride must be set
-       *    to Width * HorzStride."
-       */
-      for (int i = 0; i < 3; i++) {
-         if (src[i].file == BRW_GENERAL_REGISTER_FILE)
-            src[i] = stride(src[i], 4, 4, 1);
+   foreach_block_and_inst (block, vec4_instruction, inst, cfg) {
+      struct brw_reg src[3], dst;
+
+      if (unlikely(debug_flag))
+         annotate(brw, &annotation, cfg, inst, p->next_insn_offset);
+
+      for (unsigned int i = 0; i < 3; i++) {
+         src[i] = inst->get_src(this->prog_data, i);
       }
-   }
+      dst = inst->get_dst();
 
-   switch (inst->opcode) {
-   case BRW_OPCODE_MOV:
-      brw_MOV(p, dst, src[0]);
-      break;
-   case BRW_OPCODE_ADD:
-      brw_ADD(p, dst, src[0], src[1]);
-      break;
-   case BRW_OPCODE_MUL:
-      brw_MUL(p, dst, src[0], src[1]);
-      break;
-   case BRW_OPCODE_MACH:
-      brw_MACH(p, dst, src[0], src[1]);
-      break;
+      brw_set_default_predicate_control(p, inst->predicate);
+      brw_set_default_predicate_inverse(p, inst->predicate_inverse);
+      brw_set_default_saturate(p, inst->saturate);
+      brw_set_default_mask_control(p, inst->force_writemask_all);
+      brw_set_default_acc_write_control(p, inst->writes_accumulator);
 
-   case BRW_OPCODE_MAD:
-      assert(brw->gen >= 6);
-      brw_MAD(p, dst, src[0], src[1], src[2]);
-      break;
+      unsigned pre_emit_nr_insn = p->nr_insn;
 
-   case BRW_OPCODE_FRC:
-      brw_FRC(p, dst, src[0]);
-      break;
-   case BRW_OPCODE_RNDD:
-      brw_RNDD(p, dst, src[0]);
-      break;
-   case BRW_OPCODE_RNDE:
-      brw_RNDE(p, dst, src[0]);
-      break;
-   case BRW_OPCODE_RNDZ:
-      brw_RNDZ(p, dst, src[0]);
-      break;
+      if (dst.width == BRW_WIDTH_4) {
+         /* This happens in attribute fixups for "dual instanced" geometry
+          * shaders, since they use attributes that are vec4's.  Since the exec
+          * width is only 4, it's essential that the caller set
+          * force_writemask_all in order to make sure the instruction is executed
+          * regardless of which channels are enabled.
+          */
+         assert(inst->force_writemask_all);
+
+         /* Fix up any <8;8,1> or <0;4,1> source registers to <4;4,1> to satisfy
+          * the following register region restrictions (from Graphics BSpec:
+          * 3D-Media-GPGPU Engine > EU Overview > Registers and Register Regions
+          * > Register Region Restrictions)
+          *
+          * 1. ExecSize must be greater than or equal to Width.
+          *
+          * 2. If ExecSize = Width and HorzStride != 0, VertStride must be set
+          *    to Width * HorzStride."
+          */
+         for (int i = 0; i < 3; i++) {
+            if (src[i].file == BRW_GENERAL_REGISTER_FILE)
+               src[i] = stride(src[i], 4, 4, 1);
+         }
+      }
 
-   case BRW_OPCODE_AND:
-      brw_AND(p, dst, src[0], src[1]);
-      break;
-   case BRW_OPCODE_OR:
-      brw_OR(p, dst, src[0], src[1]);
-      break;
-   case BRW_OPCODE_XOR:
-      brw_XOR(p, dst, src[0], src[1]);
-      break;
-   case BRW_OPCODE_NOT:
-      brw_NOT(p, dst, src[0]);
-      break;
-   case BRW_OPCODE_ASR:
-      brw_ASR(p, dst, src[0], src[1]);
-      break;
-   case BRW_OPCODE_SHR:
-      brw_SHR(p, dst, src[0], src[1]);
-      break;
-   case BRW_OPCODE_SHL:
-      brw_SHL(p, dst, src[0], src[1]);
-      break;
+      switch (inst->opcode) {
+      case VEC4_OPCODE_UNPACK_UNIFORM:
+      case BRW_OPCODE_MOV:
+         brw_MOV(p, dst, src[0]);
+         break;
+      case BRW_OPCODE_ADD:
+         brw_ADD(p, dst, src[0], src[1]);
+         break;
+      case BRW_OPCODE_MUL:
+         brw_MUL(p, dst, src[0], src[1]);
+         break;
+      case BRW_OPCODE_MACH:
+         brw_MACH(p, dst, src[0], src[1]);
+         break;
 
-   case BRW_OPCODE_CMP:
-      brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
-      break;
-   case BRW_OPCODE_SEL:
-      brw_SEL(p, dst, src[0], src[1]);
-      break;
+      case BRW_OPCODE_MAD:
+         assert(brw->gen >= 6);
+         brw_MAD(p, dst, src[0], src[1], src[2]);
+         break;
 
-   case BRW_OPCODE_DPH:
-      brw_DPH(p, dst, src[0], src[1]);
-      break;
+      case BRW_OPCODE_FRC:
+         brw_FRC(p, dst, src[0]);
+         break;
+      case BRW_OPCODE_RNDD:
+         brw_RNDD(p, dst, src[0]);
+         break;
+      case BRW_OPCODE_RNDE:
+         brw_RNDE(p, dst, src[0]);
+         break;
+      case BRW_OPCODE_RNDZ:
+         brw_RNDZ(p, dst, src[0]);
+         break;
 
-   case BRW_OPCODE_DP4:
-      brw_DP4(p, dst, src[0], src[1]);
-      break;
+      case BRW_OPCODE_AND:
+         brw_AND(p, dst, src[0], src[1]);
+         break;
+      case BRW_OPCODE_OR:
+         brw_OR(p, dst, src[0], src[1]);
+         break;
+      case BRW_OPCODE_XOR:
+         brw_XOR(p, dst, src[0], src[1]);
+         break;
+      case BRW_OPCODE_NOT:
+         brw_NOT(p, dst, src[0]);
+         break;
+      case BRW_OPCODE_ASR:
+         brw_ASR(p, dst, src[0], src[1]);
+         break;
+      case BRW_OPCODE_SHR:
+         brw_SHR(p, dst, src[0], src[1]);
+         break;
+      case BRW_OPCODE_SHL:
+         brw_SHL(p, dst, src[0], src[1]);
+         break;
 
-   case BRW_OPCODE_DP3:
-      brw_DP3(p, dst, src[0], src[1]);
-      break;
+      case BRW_OPCODE_CMP:
+         brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
+         break;
+      case BRW_OPCODE_SEL:
+         brw_SEL(p, dst, src[0], src[1]);
+         break;
 
-   case BRW_OPCODE_DP2:
-      brw_DP2(p, dst, src[0], src[1]);
-      break;
+      case BRW_OPCODE_DPH:
+         brw_DPH(p, dst, src[0], src[1]);
+         break;
 
-   case BRW_OPCODE_F32TO16:
-      assert(brw->gen >= 7);
-      brw_F32TO16(p, dst, src[0]);
-      break;
+      case BRW_OPCODE_DP4:
+         brw_DP4(p, dst, src[0], src[1]);
+         break;
 
-   case BRW_OPCODE_F16TO32:
-      assert(brw->gen >= 7);
-      brw_F16TO32(p, dst, src[0]);
-      break;
+      case BRW_OPCODE_DP3:
+         brw_DP3(p, dst, src[0], src[1]);
+         break;
 
-   case BRW_OPCODE_LRP:
-      assert(brw->gen >= 6);
-      brw_LRP(p, dst, src[0], src[1], src[2]);
-      break;
+      case BRW_OPCODE_DP2:
+         brw_DP2(p, dst, src[0], src[1]);
+         break;
+
+      case BRW_OPCODE_F32TO16:
+         assert(brw->gen >= 7);
+         brw_F32TO16(p, dst, src[0]);
+         break;
 
-   case BRW_OPCODE_BFREV:
-      assert(brw->gen >= 7);
-      /* BFREV only supports UD type for src and dst. */
-      brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD),
+      case BRW_OPCODE_F16TO32:
+         assert(brw->gen >= 7);
+         brw_F16TO32(p, dst, src[0]);
+         break;
+
+      case BRW_OPCODE_LRP:
+         assert(brw->gen >= 6);
+         brw_LRP(p, dst, src[0], src[1], src[2]);
+         break;
+
+      case BRW_OPCODE_BFREV:
+         assert(brw->gen >= 7);
+         /* BFREV only supports UD type for src and dst. */
+         brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD),
                retype(src[0], BRW_REGISTER_TYPE_UD));
-      break;
-   case BRW_OPCODE_FBH:
-      assert(brw->gen >= 7);
-      /* FBH only supports UD type for dst. */
-      brw_FBH(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
-      break;
-   case BRW_OPCODE_FBL:
-      assert(brw->gen >= 7);
-      /* FBL only supports UD type for dst. */
-      brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
-      break;
-   case BRW_OPCODE_CBIT:
-      assert(brw->gen >= 7);
-      /* CBIT only supports UD type for dst. */
-      brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
-      break;
-   case BRW_OPCODE_ADDC:
-      assert(brw->gen >= 7);
-      brw_ADDC(p, dst, src[0], src[1]);
-      break;
-   case BRW_OPCODE_SUBB:
-      assert(brw->gen >= 7);
-      brw_SUBB(p, dst, src[0], src[1]);
-      break;
-   case BRW_OPCODE_MAC:
-      brw_MAC(p, dst, src[0], src[1]);
-      break;
+         break;
+      case BRW_OPCODE_FBH:
+         assert(brw->gen >= 7);
+         /* FBH only supports UD type for dst. */
+         brw_FBH(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
+         break;
+      case BRW_OPCODE_FBL:
+         assert(brw->gen >= 7);
+         /* FBL only supports UD type for dst. */
+         brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
+         break;
+      case BRW_OPCODE_CBIT:
+         assert(brw->gen >= 7);
+         /* CBIT only supports UD type for dst. */
+         brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
+         break;
+      case BRW_OPCODE_ADDC:
+         assert(brw->gen >= 7);
+         brw_ADDC(p, dst, src[0], src[1]);
+         break;
+      case BRW_OPCODE_SUBB:
+         assert(brw->gen >= 7);
+         brw_SUBB(p, dst, src[0], src[1]);
+         break;
+      case BRW_OPCODE_MAC:
+         brw_MAC(p, dst, src[0], src[1]);
+         break;
 
-   case BRW_OPCODE_BFE:
-      assert(brw->gen >= 7);
-      brw_BFE(p, dst, src[0], src[1], src[2]);
-      break;
+      case BRW_OPCODE_BFE:
+         assert(brw->gen >= 7);
+         brw_BFE(p, dst, src[0], src[1], src[2]);
+         break;
 
-   case BRW_OPCODE_BFI1:
-      assert(brw->gen >= 7);
-      brw_BFI1(p, dst, src[0], src[1]);
-      break;
-   case BRW_OPCODE_BFI2:
-      assert(brw->gen >= 7);
-      brw_BFI2(p, dst, src[0], src[1], src[2]);
-      break;
+      case BRW_OPCODE_BFI1:
+         assert(brw->gen >= 7);
+         brw_BFI1(p, dst, src[0], src[1]);
+         break;
+      case BRW_OPCODE_BFI2:
+         assert(brw->gen >= 7);
+         brw_BFI2(p, dst, src[0], src[1], src[2]);
+         break;
 
-   case BRW_OPCODE_IF:
-      if (inst->src[0].file != BAD_FILE) {
-         /* The instruction has an embedded compare (only allowed on gen6) */
-         assert(brw->gen == 6);
-         gen6_IF(p, inst->conditional_mod, src[0], src[1]);
-      } else {
-         brw_inst *if_inst = brw_IF(p, BRW_EXECUTE_8);
-         brw_inst_set_pred_control(brw, if_inst, inst->predicate);
-      }
-      break;
+      case BRW_OPCODE_IF:
+         if (inst->src[0].file != BAD_FILE) {
+            /* The instruction has an embedded compare (only allowed on gen6) */
+            assert(brw->gen == 6);
+            gen6_IF(p, inst->conditional_mod, src[0], src[1]);
+         } else {
+            brw_inst *if_inst = brw_IF(p, BRW_EXECUTE_8);
+            brw_inst_set_pred_control(brw, if_inst, inst->predicate);
+         }
+         break;
 
-   case BRW_OPCODE_ELSE:
-      brw_ELSE(p);
-      break;
-   case BRW_OPCODE_ENDIF:
-      brw_ENDIF(p);
-      break;
+      case BRW_OPCODE_ELSE:
+         brw_ELSE(p);
+         break;
+      case BRW_OPCODE_ENDIF:
+         brw_ENDIF(p);
+         break;
 
-   case BRW_OPCODE_DO:
-      brw_DO(p, BRW_EXECUTE_8);
-      break;
+      case BRW_OPCODE_DO:
+         brw_DO(p, BRW_EXECUTE_8);
+         break;
 
-   case BRW_OPCODE_BREAK:
-      brw_BREAK(p);
-      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
-      break;
-   case BRW_OPCODE_CONTINUE:
-      /* FINISHME: We need to write the loop instruction support still. */
-      if (brw->gen >= 6)
-         gen6_CONT(p);
-      else
+      case BRW_OPCODE_BREAK:
+         brw_BREAK(p);
+         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
+         break;
+      case BRW_OPCODE_CONTINUE:
          brw_CONT(p);
-      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
-      break;
+         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
+         break;
 
-   case BRW_OPCODE_WHILE:
-      brw_WHILE(p);
-      break;
+      case BRW_OPCODE_WHILE:
+         brw_WHILE(p);
+         loop_count++;
+         break;
 
-   case SHADER_OPCODE_RCP:
-   case SHADER_OPCODE_RSQ:
-   case SHADER_OPCODE_SQRT:
-   case SHADER_OPCODE_EXP2:
-   case SHADER_OPCODE_LOG2:
-   case SHADER_OPCODE_SIN:
-   case SHADER_OPCODE_COS:
-      if (brw->gen >= 7) {
-         gen6_math(p, dst, brw_math_function(inst->opcode), src[0],
-                   brw_null_reg());
-      } else if (brw->gen == 6) {
-         generate_math_gen6(inst, dst, src[0], brw_null_reg());
-      } else {
-         generate_math1_gen4(inst, dst, src[0]);
-      }
-      break;
+      case SHADER_OPCODE_RCP:
+      case SHADER_OPCODE_RSQ:
+      case SHADER_OPCODE_SQRT:
+      case SHADER_OPCODE_EXP2:
+      case SHADER_OPCODE_LOG2:
+      case SHADER_OPCODE_SIN:
+      case SHADER_OPCODE_COS:
+         assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
+         if (brw->gen >= 7) {
+            gen6_math(p, dst, brw_math_function(inst->opcode), src[0],
+                      brw_null_reg());
+         } else if (brw->gen == 6) {
+            generate_math_gen6(inst, dst, src[0], brw_null_reg());
+         } else {
+            generate_math1_gen4(inst, dst, src[0]);
+         }
+         break;
 
-   case SHADER_OPCODE_POW:
-   case SHADER_OPCODE_INT_QUOTIENT:
-   case SHADER_OPCODE_INT_REMAINDER:
-      if (brw->gen >= 7) {
-         gen6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]);
-      } else if (brw->gen == 6) {
-         generate_math_gen6(inst, dst, src[0], src[1]);
-      } else {
-         generate_math2_gen4(inst, dst, src[0], src[1]);
-      }
-      break;
+      case SHADER_OPCODE_POW:
+      case SHADER_OPCODE_INT_QUOTIENT:
+      case SHADER_OPCODE_INT_REMAINDER:
+         assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
+         if (brw->gen >= 7) {
+            gen6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]);
+         } else if (brw->gen == 6) {
+            generate_math_gen6(inst, dst, src[0], src[1]);
+         } else {
+            generate_math2_gen4(inst, dst, src[0], src[1]);
+         }
+         break;
 
-   case SHADER_OPCODE_TEX:
-   case SHADER_OPCODE_TXD:
-   case SHADER_OPCODE_TXF:
-   case SHADER_OPCODE_TXF_CMS:
-   case SHADER_OPCODE_TXF_MCS:
-   case SHADER_OPCODE_TXL:
-   case SHADER_OPCODE_TXS:
-   case SHADER_OPCODE_TG4:
-   case SHADER_OPCODE_TG4_OFFSET:
-      generate_tex(inst, dst, src[0]);
-      break;
+      case SHADER_OPCODE_TEX:
+      case SHADER_OPCODE_TXD:
+      case SHADER_OPCODE_TXF:
+      case SHADER_OPCODE_TXF_CMS:
+      case SHADER_OPCODE_TXF_MCS:
+      case SHADER_OPCODE_TXL:
+      case SHADER_OPCODE_TXS:
+      case SHADER_OPCODE_TG4:
+      case SHADER_OPCODE_TG4_OFFSET:
+         generate_tex(inst, dst, src[0], src[1]);
+         break;
 
-   case VS_OPCODE_URB_WRITE:
-      generate_vs_urb_write(inst);
-      break;
+      case VS_OPCODE_URB_WRITE:
+         generate_vs_urb_write(inst);
+         break;
 
-   case SHADER_OPCODE_GEN4_SCRATCH_READ:
-      generate_scratch_read(inst, dst, src[0]);
-      break;
+      case SHADER_OPCODE_GEN4_SCRATCH_READ:
+         generate_scratch_read(inst, dst, src[0]);
+         break;
 
-   case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
-      generate_scratch_write(inst, dst, src[0], src[1]);
-      break;
+      case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
+         generate_scratch_write(inst, dst, src[0], src[1]);
+         break;
 
-   case VS_OPCODE_PULL_CONSTANT_LOAD:
-      generate_pull_constant_load(inst, dst, src[0], src[1]);
-      break;
+      case VS_OPCODE_PULL_CONSTANT_LOAD:
+         generate_pull_constant_load(inst, dst, src[0], src[1]);
+         break;
 
-   case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
-      generate_pull_constant_load_gen7(inst, dst, src[0], src[1]);
-      break;
+      case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
+         generate_pull_constant_load_gen7(inst, dst, src[0], src[1]);
+         break;
 
-   case GS_OPCODE_URB_WRITE:
-      generate_gs_urb_write(inst);
-      break;
+      case GS_OPCODE_URB_WRITE:
+         generate_gs_urb_write(inst);
+         break;
 
-   case GS_OPCODE_THREAD_END:
-      generate_gs_thread_end(inst);
-      break;
+      case GS_OPCODE_URB_WRITE_ALLOCATE:
+         generate_gs_urb_write_allocate(inst);
+         break;
 
-   case GS_OPCODE_SET_WRITE_OFFSET:
-      generate_gs_set_write_offset(dst, src[0], src[1]);
-      break;
+      case GS_OPCODE_SVB_WRITE:
+         generate_gs_svb_write(inst, dst, src[0], src[1]);
+         break;
 
-   case GS_OPCODE_SET_VERTEX_COUNT:
-      generate_gs_set_vertex_count(dst, src[0]);
-      break;
+      case GS_OPCODE_SVB_SET_DST_INDEX:
+         generate_gs_svb_set_destination_index(inst, dst, src[0]);
+         break;
 
-   case GS_OPCODE_SET_DWORD_2_IMMED:
-      generate_gs_set_dword_2_immed(dst, src[0]);
-      break;
+      case GS_OPCODE_THREAD_END:
+         generate_gs_thread_end(inst);
+         break;
 
-   case GS_OPCODE_PREPARE_CHANNEL_MASKS:
-      generate_gs_prepare_channel_masks(dst);
-      break;
+      case GS_OPCODE_SET_WRITE_OFFSET:
+         generate_gs_set_write_offset(dst, src[0], src[1]);
+         break;
 
-   case GS_OPCODE_SET_CHANNEL_MASKS:
-      generate_gs_set_channel_masks(dst, src[0]);
-      break;
+      case GS_OPCODE_SET_VERTEX_COUNT:
+         generate_gs_set_vertex_count(dst, src[0]);
+         break;
 
-   case GS_OPCODE_GET_INSTANCE_ID:
-      generate_gs_get_instance_id(dst);
-      break;
+      case GS_OPCODE_FF_SYNC:
+         generate_gs_ff_sync(inst, dst, src[0], src[1]);
+         break;
 
-   case SHADER_OPCODE_SHADER_TIME_ADD:
-      brw_shader_time_add(p, src[0],
-                          prog_data->base.binding_table.shader_time_start);
-      brw_mark_surface_used(&prog_data->base,
-                            prog_data->base.binding_table.shader_time_start);
-      break;
+      case GS_OPCODE_FF_SYNC_SET_PRIMITIVES:
+         generate_gs_ff_sync_set_primitives(dst, src[0], src[1], src[2]);
+         break;
 
-   case SHADER_OPCODE_UNTYPED_ATOMIC:
-      generate_untyped_atomic(inst, dst, src[0], src[1]);
-      break;
+      case GS_OPCODE_SET_PRIMITIVE_ID:
+         generate_gs_set_primitive_id(dst);
+         break;
 
-   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
-      generate_untyped_surface_read(inst, dst, src[0]);
-      break;
+      case GS_OPCODE_SET_DWORD_2:
+         generate_gs_set_dword_2(dst, src[0]);
+         break;
 
-   case VS_OPCODE_UNPACK_FLAGS_SIMD4X2:
-      generate_unpack_flags(inst, dst);
-      break;
+      case GS_OPCODE_PREPARE_CHANNEL_MASKS:
+         generate_gs_prepare_channel_masks(dst);
+         break;
 
-   default:
-      if (inst->opcode < (int) ARRAY_SIZE(opcode_descs)) {
-         _mesa_problem(&brw->ctx, "Unsupported opcode in `%s' in vec4\n",
-                       opcode_descs[inst->opcode].name);
-      } else {
-         _mesa_problem(&brw->ctx, "Unsupported opcode %d in vec4", inst->opcode);
-      }
-      abort();
-   }
-}
+      case GS_OPCODE_SET_CHANNEL_MASKS:
+         generate_gs_set_channel_masks(dst, src[0]);
+         break;
 
-void
-vec4_generator::generate_code(exec_list *instructions)
-{
-   struct annotation_info annotation;
-   memset(&annotation, 0, sizeof(annotation));
+      case GS_OPCODE_GET_INSTANCE_ID:
+         generate_gs_get_instance_id(dst);
+         break;
 
-   cfg_t *cfg = NULL;
-   if (unlikely(debug_flag))
-      cfg = new(mem_ctx) cfg_t(instructions);
+      case SHADER_OPCODE_SHADER_TIME_ADD:
+         brw_shader_time_add(p, src[0],
+                             prog_data->base.binding_table.shader_time_start);
+         brw_mark_surface_used(&prog_data->base,
+                               prog_data->base.binding_table.shader_time_start);
+         break;
 
-   foreach_in_list(vec4_instruction, inst, instructions) {
-      struct brw_reg src[3], dst;
+      case SHADER_OPCODE_UNTYPED_ATOMIC:
+         generate_untyped_atomic(inst, dst, src[0], src[1]);
+         break;
 
-      if (unlikely(debug_flag))
-         annotate(brw, &annotation, cfg, inst, p->next_insn_offset);
+      case SHADER_OPCODE_UNTYPED_SURFACE_READ:
+         generate_untyped_surface_read(inst, dst, src[0]);
+         break;
 
-      for (unsigned int i = 0; i < 3; i++) {
-         src[i] = inst->get_src(this->prog_data, i);
-      }
-      dst = inst->get_dst();
+      case VS_OPCODE_UNPACK_FLAGS_SIMD4X2:
+         generate_unpack_flags(inst, dst);
+         break;
 
-      brw_set_default_predicate_control(p, inst->predicate);
-      brw_set_default_predicate_inverse(p, inst->predicate_inverse);
-      brw_set_default_saturate(p, inst->saturate);
-      brw_set_default_mask_control(p, inst->force_writemask_all);
-      brw_set_default_acc_write_control(p, inst->writes_accumulator);
+      case VEC4_OPCODE_PACK_BYTES: {
+         /* Is effectively:
+          *
+          *   mov(8) dst<16,4,1>:UB src<4,1,0>:UB
+          *
+          * but destinations' only regioning is horizontal stride, so instead we
+          * have to use two instructions:
+          *
+          *   mov(4) dst<1>:UB     src<4,1,0>:UB
+          *   mov(4) dst.16<1>:UB  src.16<4,1,0>:UB
+          *
+          * where they pack the four bytes from the low and high four DW.
+          */
+         assert(is_power_of_two(dst.dw1.bits.writemask) &&
+                dst.dw1.bits.writemask != 0);
+         unsigned offset = __builtin_ctz(dst.dw1.bits.writemask);
+
+         dst.type = BRW_REGISTER_TYPE_UB;
 
-      unsigned pre_emit_nr_insn = p->nr_insn;
+         brw_set_default_access_mode(p, BRW_ALIGN_1);
+
+         src[0].type = BRW_REGISTER_TYPE_UB;
+         src[0].vstride = BRW_VERTICAL_STRIDE_4;
+         src[0].width = BRW_WIDTH_1;
+         src[0].hstride = BRW_HORIZONTAL_STRIDE_0;
+         dst.subnr = offset * 4;
+         struct brw_inst *insn = brw_MOV(p, dst, src[0]);
+         brw_inst_set_exec_size(brw, insn, BRW_EXECUTE_4);
+         brw_inst_set_no_dd_clear(brw, insn, true);
+         brw_inst_set_no_dd_check(brw, insn, inst->no_dd_check);
+
+         src[0].subnr = 16;
+         dst.subnr = 16 + offset * 4;
+         insn = brw_MOV(p, dst, src[0]);
+         brw_inst_set_exec_size(brw, insn, BRW_EXECUTE_4);
+         brw_inst_set_no_dd_clear(brw, insn, inst->no_dd_clear);
+         brw_inst_set_no_dd_check(brw, insn, true);
+
+         brw_set_default_access_mode(p, BRW_ALIGN_16);
+         break;
+      }
+
+      default:
+         if (inst->opcode < (int) ARRAY_SIZE(opcode_descs)) {
+            _mesa_problem(&brw->ctx, "Unsupported opcode in `%s' in vec4\n",
+                          opcode_descs[inst->opcode].name);
+         } else {
+            _mesa_problem(&brw->ctx, "Unsupported opcode %d in vec4", inst->opcode);
+         }
+         abort();
+      }
 
-      generate_vec4_instruction(inst, dst, src);
+      if (inst->opcode == VEC4_OPCODE_PACK_BYTES) {
+         /* Handled dependency hints in the generator. */
 
-      if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) {
+         assert(!inst->conditional_mod);
+      } else if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) {
          assert(p->nr_insn == pre_emit_nr_insn + 1 ||
                 !"conditional_mod, no_dd_check, or no_dd_clear set for IR "
                  "emitting more than 1 instruction");
 
         brw_inst *last = &p->store[pre_emit_nr_insn];
 
-         brw_inst_set_cond_modifier(brw, last, inst->conditional_mod);
+         if (inst->conditional_mod)
+            brw_inst_set_cond_modifier(brw, last, inst->conditional_mod);
          brw_inst_set_no_dd_clear(brw, last, inst->no_dd_clear);
          brw_inst_set_no_dd_check(brw, last, inst->no_dd_check);
       }
@@ -1273,28 +1591,41 @@ vec4_generator::generate_code(exec_list *instructions)
 
    if (unlikely(debug_flag)) {
       if (shader_prog) {
-         fprintf(stderr, "Native code for %s vertex shader %d:\n",
+         fprintf(stderr, "Native code for %s %s shader %d:\n",
                  shader_prog->Label ? shader_prog->Label : "unnamed",
-                 shader_prog->Name);
+                 stage_name, shader_prog->Name);
       } else {
-         fprintf(stderr, "Native code for vertex program %d:\n", prog->Id);
+         fprintf(stderr, "Native code for %s program %d:\n", stage_name,
+                 prog->Id);
       }
 
-      fprintf(stderr, "vec4 shader: %d instructions. Compacted %d to %d"
+      fprintf(stderr, "%s vec4 shader: %d instructions. %d loops. Compacted %d to %d"
              " bytes (%.0f%%)\n",
-             before_size / 16, before_size, after_size,
+             stage_abbrev,
+             before_size / 16, loop_count, before_size, after_size,
              100.0f * (before_size - after_size) / before_size);
 
       dump_assembly(p->store, annotation.ann_count, annotation.ann, brw, prog);
       ralloc_free(annotation.ann);
    }
+
+   static GLuint msg_id = 0;
+   _mesa_gl_debug(&brw->ctx, &msg_id,
+                  MESA_DEBUG_SOURCE_SHADER_COMPILER,
+                  MESA_DEBUG_TYPE_OTHER,
+                  MESA_DEBUG_SEVERITY_NOTIFICATION,
+                  "%s vec4 shader: %d inst, %d loops, "
+                  "compacted %d to %d bytes.\n",
+                  stage_abbrev,
+                  before_size / 16, loop_count,
+                  before_size, after_size);
 }
 
 const unsigned *
-vec4_generator::generate_assembly(exec_list *instructions,
+vec4_generator::generate_assembly(const cfg_t *cfg,
                                   unsigned *assembly_size)
 {
    brw_set_default_access_mode(p, BRW_ALIGN_16);
-   generate_code(instructions);
+   generate_code(cfg);
 
    return brw_get_program(p, assembly_size);
 }
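
Reference notes (not part of the patch):

The variable-index sampler path added to generate_tex() and
generate_pull_constant_load_gen7() computes the SEND message descriptor in
address register a0.0 at run time. Below is a minimal host-side sketch of
that arithmetic, assuming the bit layout the emitted AND/SHL/OR sequence
relies on (surface index in descriptor bits 0..7, sampler index in bits
8..11); the helper name and the base value of 16 are illustrative, not
driver API:

   #include <assert.h>
   #include <stdint.h>
   #include <stdio.h>

   /* Mirrors: addr = (sampler + base) & 0xff; temp = (sampler << 8) & 0xf00;
    * addr |= temp; as emitted for a non-constant sampler_index. */
   static uint32_t
   dynamic_sampler_descriptor_bits(uint32_t sampler, uint32_t base_binding_table_index)
   {
      uint32_t addr = (sampler + base_binding_table_index) & 0x0ff; /* surface index */
      uint32_t temp = (sampler << 8) & 0x0f00;                      /* sampler index */
      return addr | temp;
   }

   int
   main(void)
   {
      uint32_t bits = dynamic_sampler_descriptor_bits(3, 16); /* hypothetical base */
      assert(bits == ((3 + 16) | (3u << 8)));
      printf("surface=%u sampler=%u\n", bits & 0xff, (bits >> 8) & 0xf);
      return 0;
   }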
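Similarly, GS_OPCODE_FF_SYNC_SET_PRIMITIVES packs two counters into the
single header dword that generate_gs_ff_sync() then consumes. A sketch of
the same packing on the host, per the "Save src0 data in 16:31 bits" /
"Save src1 data in 0:15 bits" comments above (the function name is
illustrative):

   #include <stdint.h>
   #include <stdio.h>

   /* Mirrors the brw_AND/brw_SHL/brw_OR sequence: src0 lands in bits 16..31
    * of the dword, src1 in bits 0..15. */
   static uint32_t
   pack_ff_sync_dw0(uint32_t src0, uint32_t src1)
   {
      return ((src0 & 0xffffu) << 16) | (src1 & 0xffffu);
   }

   int
   main(void)
   {
      printf("dw0 = 0x%08x\n", pack_ff_sync_dw0(5, 12)); /* prints dw0 = 0x0005000c */
      return 0;
   }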
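Finally, for VEC4_OPCODE_PACK_BYTES: a <4,1,0>:UB source region reads the
low byte of each dword, so the two 4-wide MOVs gather the low bytes of the
low and high four dwords into four bytes of each destination half, starting
at byte offset ctz(writemask) * 4. A rough host-side data-flow model of
what the two instructions store (not the EU regioning semantics themselves):

   #include <stdint.h>

   /* writemask must be a single enabled channel (a power of two), matching
    * the assert in the generator. */
   static void
   pack_bytes_model(const uint32_t src[8], uint8_t dst[32], unsigned writemask)
   {
      unsigned offset = (unsigned)__builtin_ctz(writemask);
      for (int i = 0; i < 4; i++) {
         dst[offset * 4 + i] = (uint8_t)src[i];          /* mov(4) dst<1>:UB    */
         dst[16 + offset * 4 + i] = (uint8_t)src[4 + i]; /* mov(4) dst.16<1>:UB */
      }
   }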