diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
index 08a55a3fab9..704644e7429 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
@@ -28,35 +28,50 @@
  */
 
 #include "brw_vec4_gs_visitor.h"
+#include "gen6_gs_visitor.h"
 
 const unsigned MAX_GS_INPUT_VERTICES = 6;
 
 namespace brw {
 
-vec4_gs_visitor::vec4_gs_visitor(struct brw_context *brw,
+vec4_gs_visitor::vec4_gs_visitor(const struct brw_compiler *compiler,
+                                 void *log_data,
                                  struct brw_gs_compile *c,
                                  struct gl_shader_program *prog,
-                                 struct brw_shader *shader,
-                                 void *mem_ctx)
-   : vec4_visitor(brw, &c->base, &c->gp->program.Base, &c->key.base,
-                  &c->prog_data.base, prog, shader, mem_ctx,
-                  INTEL_DEBUG & DEBUG_GS),
+                                 void *mem_ctx,
+                                 bool no_spills,
+                                 int shader_time_index)
+   : vec4_visitor(compiler, log_data,
+                  &c->gp->program.Base, &c->key.base,
+                  &c->prog_data.base, prog, MESA_SHADER_GEOMETRY, mem_ctx,
+                  no_spills, shader_time_index),
      c(c)
 {
 }
 
 
 dst_reg *
-vec4_gs_visitor::make_reg_for_system_value(ir_variable *ir)
+vec4_gs_visitor::make_reg_for_system_value(int location,
+                                           const glsl_type *type)
 {
-   /* Geometry shaders don't use any system values. */
-   assert(!"Unreached");
-   return NULL;
+   dst_reg *reg = new(mem_ctx) dst_reg(this, type);
+
+   switch (location) {
+   case SYSTEM_VALUE_INVOCATION_ID:
+      this->current_annotation = "initialize gl_InvocationID";
+      emit(GS_OPCODE_GET_INSTANCE_ID, *reg);
+      break;
+   default:
+      unreachable("not reached");
+   }
+
+   return reg;
 }
 
 
 int
-vec4_gs_visitor::setup_varying_inputs(int payload_reg, int *attribute_map)
+vec4_gs_visitor::setup_varying_inputs(int payload_reg, int *attribute_map,
+                                      int attributes_per_reg)
 {
    /* For geometry shaders there are N copies of the input attributes, where N
    * is the number of input vertices. attribute_map[BRW_VARYING_SLOT_COUNT *
@@ -74,11 +89,14 @@ vec4_gs_visitor::setup_varying_inputs(int payload_reg, int *attribute_map)
      int varying = c->input_vue_map.slot_to_varying[slot];
      for (unsigned vertex = 0; vertex < num_input_vertices; vertex++) {
         attribute_map[BRW_VARYING_SLOT_COUNT * vertex + varying] =
-            payload_reg + input_array_stride * vertex + slot;
+            attributes_per_reg * payload_reg + input_array_stride * vertex +
+            slot;
      }
   }
 
-   return payload_reg + input_array_stride * num_input_vertices;
+   int regs_used = ALIGN(input_array_stride * num_input_vertices,
+                         attributes_per_reg) / attributes_per_reg;
+   return payload_reg + regs_used;
 }
 
 
@@ -87,6 +105,12 @@ vec4_gs_visitor::setup_payload()
 {
    int attribute_map[BRW_VARYING_SLOT_COUNT * MAX_GS_INPUT_VERTICES];
 
+   /* If we are in dual instanced or single mode, then attributes are going
+    * to be interleaved, so one register contains two attribute slots.
+    */
+   int attributes_per_reg =
+      c->prog_data.base.dispatch_mode == DISPATCH_MODE_4X2_DUAL_OBJECT ? 1 : 2;
+
    /* If a geometry shader tries to read from an input that wasn't written by
    * the vertex shader, that produces undefined results, but it shouldn't
    * crash anything. So initialize attribute_map to zeros--that ensures that
@@ -104,13 +128,13 @@ vec4_gs_visitor::setup_payload()
    /* If the shader uses gl_PrimitiveIDIn, that goes in r1.
    */
    if (c->prog_data.include_primitive_id)
-      attribute_map[VARYING_SLOT_PRIMITIVE_ID] = reg++;
+      attribute_map[VARYING_SLOT_PRIMITIVE_ID] = attributes_per_reg * reg++;
 
    reg = setup_uniforms(reg);
 
-   reg = setup_varying_inputs(reg, attribute_map);
+   reg = setup_varying_inputs(reg, attribute_map, attributes_per_reg);
 
-   lower_attributes_to_hw_regs(attribute_map, false /* interleaved */);
+   lower_attributes_to_hw_regs(attribute_map, attributes_per_reg > 1);
 
    this->first_non_payload_grf = reg;
 }
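
The two hunks above change how input attributes are addressed: in DUAL_OBJECT dispatch mode each attribute slot occupies a full GRF, while in DUAL_INSTANCE and SINGLE modes two slots share one register. A standalone sketch of that arithmetic, using illustrative names (align_up stands in for Mesa's ALIGN macro; this is not part of the patch):

   #include <cassert>

   // Round value up to a multiple of a (a power of two), like Mesa's ALIGN.
   static unsigned align_up(unsigned value, unsigned a)
   {
      return (value + a - 1) & ~(a - 1);
   }

   // Value stored in attribute_map for (vertex, slot): an index in units of
   // attribute slots, which lower_attributes_to_hw_regs() later converts to
   // GRFs (two slots per GRF when interleaved).
   static int map_attribute(int payload_reg, int attributes_per_reg,
                            int input_array_stride, int vertex, int slot)
   {
      return attributes_per_reg * payload_reg +
             input_array_stride * vertex + slot;
   }

   // Number of payload GRFs consumed by the varying inputs.
   static int varying_regs_used(int input_array_stride, int num_input_vertices,
                                int attributes_per_reg)
   {
      assert(attributes_per_reg == 1 || attributes_per_reg == 2);
      return align_up(input_array_stride * num_input_vertices,
                      attributes_per_reg) / attributes_per_reg;
   }
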
@@ -129,7 +153,7 @@ vec4_gs_visitor::emit_prolog()
    */
   this->current_annotation = "clear r0.2";
   dst_reg r0(retype(brw_vec4_grf(0, 0), BRW_REGISTER_TYPE_UD));
-   vec4_instruction *inst = emit(GS_OPCODE_SET_DWORD_2_IMMED, r0, 0u);
+   vec4_instruction *inst = emit(GS_OPCODE_SET_DWORD_2, r0, 0u);
   inst->force_writemask_all = true;
 
   /* Create a virtual register to hold the vertex count */
@@ -170,7 +194,13 @@ vec4_gs_visitor::emit_prolog()
         src_reg src(dst);
         dst.writemask = WRITEMASK_X;
         src.swizzle = BRW_SWIZZLE_WWWW;
-         emit(MOV(dst, src));
+         inst = emit(MOV(dst, src));
+
+         /* In dual instanced dispatch mode, dst has a width of 4, so we need
+          * to make sure the MOV happens regardless of which channels are
+          * enabled.
+          */
+         inst->force_writemask_all = true;
      }
   }
 
@@ -182,7 +212,7 @@ void
 vec4_gs_visitor::emit_program_code()
 {
   /* We don't support NV_geometry_program4. */
-   assert(!"Unreached");
+   unreachable("Unreached");
 }
 
 
@@ -250,6 +280,13 @@ vec4_gs_visitor::emit_urb_write_opcode(bool complete)
   vec4_instruction *inst = emit(GS_OPCODE_URB_WRITE);
   inst->offset = c->prog_data.control_data_header_size_hwords;
+
+   /* We need to increment Global Offset by 1 to make room for Broadwell's
+    * extra "Vertex Count" payload at the beginning of the URB entry.
+    */
+   if (devinfo->gen >= 8)
+      inst->offset++;
+
   inst->urb_write_flags = BRW_URB_WRITE_PER_SLOT_OFFSET;
   return inst;
 }
@@ -266,7 +303,7 @@ vec4_gs_visitor::compute_array_stride(ir_dereference_array *ir)
    * setup_attributes() will remap our accesses to the actual input array.
    */
   ir_dereference_variable *deref_var = ir->array->as_dereference_variable();
-   if (deref_var && deref_var->var->mode == ir_var_shader_in)
+   if (deref_var && deref_var->var->data.mode == ir_var_shader_in)
      return BRW_VARYING_SLOT_COUNT;
   else
      return vec4_visitor::compute_array_stride(ir);
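
Both the emit_urb_write_opcode() hunk above and the control-data write in the next hunk skip past Broadwell's extra "Vertex Count" field, but they express the same 256 bits in different units. A sketch of the resulting offsets, derived from the patch's own comments (illustrative helper, not part of the patch):

   // Vertex-data URB writes take their Global Offset in 256-bit HWords;
   // the control-data write below is an OWord message, so its offset is
   // counted in 128-bit OWords.  Gen8+ prepends one 256-bit "Vertex Count"
   // field to each URB entry, shifting both writes by the same 256 bits.
   struct gs_urb_offsets {
      unsigned vertex_data;   // in HWords (256 bits)
      unsigned control_data;  // in OWords (128 bits)
   };

   static gs_urb_offsets
   compute_gs_urb_offsets(int gen, unsigned control_data_header_size_hwords)
   {
      gs_urb_offsets o;
      o.vertex_data = control_data_header_size_hwords + (gen >= 8 ? 1 : 0);
      o.control_data = (gen >= 8) ? 2 : 0;
      return o;
   }
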
@@ -312,90 +349,141 @@ vec4_gs_visitor::emit_control_data_bits()
   if (c->control_data_header_size_bits > 128)
      urb_write_flags = urb_write_flags | BRW_URB_WRITE_PER_SLOT_OFFSET;
 
-   /* If vertex_count is 0, then no control data bits have been accumulated
-    * yet, so we should do nothing.
+   /* If we are using either channel masks or a per-slot offset, then we
+    * need to figure out which DWORD we are trying to write to, using the
+    * formula:
+    *
+    *    dword_index = (vertex_count - 1) * bits_per_vertex / 32
+    *
+    * Since bits_per_vertex is a power of two, and is known at compile
+    * time, this can be optimized to:
+    *
+    *    dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex))
    */
-   emit(CMP(dst_null_d(), this->vertex_count, 0u, BRW_CONDITIONAL_NEQ));
-   emit(IF(BRW_PREDICATE_NORMAL));
-   {
-      /* If we are using either channel masks or a per-slot offset, then we
-       * need to figure out which DWORD we are trying to write to, using the
-       * formula:
-       *
-       *    dword_index = (vertex_count - 1) * bits_per_vertex / 32
-       *
-       * Since bits_per_vertex is a power of two, and is known at compile
-       * time, this can be optimized to:
-       *
-       *    dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex))
+   src_reg dword_index(this, glsl_type::uint_type);
+   if (urb_write_flags) {
+      src_reg prev_count(this, glsl_type::uint_type);
+      emit(ADD(dst_reg(prev_count), this->vertex_count, 0xffffffffu));
+      unsigned log2_bits_per_vertex =
+         _mesa_fls(c->control_data_bits_per_vertex);
+      emit(SHR(dst_reg(dword_index), prev_count,
+               (uint32_t) (6 - log2_bits_per_vertex)));
+   }
+
+   /* Start building the URB write message. The first MRF gets a copy of
+    * R0.
+    */
+   int base_mrf = 1;
+   dst_reg mrf_reg(MRF, base_mrf);
+   src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
+   vec4_instruction *inst = emit(MOV(mrf_reg, r0));
+   inst->force_writemask_all = true;
+
+   if (urb_write_flags & BRW_URB_WRITE_PER_SLOT_OFFSET) {
+      /* Set the per-slot offset to dword_index / 4, so that we'll write to
+       * the appropriate OWORD within the control data header.
       */
-      src_reg dword_index(this, glsl_type::uint_type);
-      if (urb_write_flags) {
-         src_reg prev_count(this, glsl_type::uint_type);
-         emit(ADD(dst_reg(prev_count), this->vertex_count, 0xffffffffu));
-         unsigned log2_bits_per_vertex =
-            _mesa_fls(c->control_data_bits_per_vertex);
-         emit(SHR(dst_reg(dword_index), prev_count,
-                  (uint32_t) (6 - log2_bits_per_vertex)));
-      }
+      src_reg per_slot_offset(this, glsl_type::uint_type);
+      emit(SHR(dst_reg(per_slot_offset), dword_index, 2u));
+      emit(GS_OPCODE_SET_WRITE_OFFSET, mrf_reg, per_slot_offset, 1u);
+   }
 
-      /* Start building the URB write message. The first MRF gets a copy of
-       * R0.
+   if (urb_write_flags & BRW_URB_WRITE_USE_CHANNEL_MASKS) {
+      /* Set the channel masks to 1 << (dword_index % 4), so that we'll
+       * write to the appropriate DWORD within the OWORD. We need to do
+       * this computation with force_writemask_all, otherwise garbage data
+       * from invocation 0 might clobber the mask for invocation 1 when
+       * GS_OPCODE_PREPARE_CHANNEL_MASKS tries to OR the two masks
+       * together.
       */
-      int base_mrf = 1;
-      dst_reg mrf_reg(MRF, base_mrf);
-      src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
-      vec4_instruction *inst = emit(MOV(mrf_reg, r0));
+      src_reg channel(this, glsl_type::uint_type);
+      inst = emit(AND(dst_reg(channel), dword_index, 3u));
      inst->force_writemask_all = true;
-
-      if (urb_write_flags & BRW_URB_WRITE_PER_SLOT_OFFSET) {
-         /* Set the per-slot offset to dword_index / 4, so that we'll write to
-          * the appropriate OWORD within the control data header.
-          */
-         src_reg per_slot_offset(this, glsl_type::uint_type);
-         emit(SHR(dst_reg(per_slot_offset), dword_index, 2u));
-         emit(GS_OPCODE_SET_WRITE_OFFSET, mrf_reg, per_slot_offset, 1u);
-      }
-
-      if (urb_write_flags & BRW_URB_WRITE_USE_CHANNEL_MASKS) {
-         /* Set the channel masks to 1 << (dword_index % 4), so that we'll
-          * write to the appropriate DWORD within the OWORD. We need to do
-          * this computation with force_writemask_all, otherwise garbage data
-          * from invocation 0 might clobber the mask for invocation 1 when
-          * GS_OPCODE_PREPARE_CHANNEL_MASKS tries to OR the two masks
-          * together.
-          */
-         src_reg channel(this, glsl_type::uint_type);
-         inst = emit(AND(dst_reg(channel), dword_index, 3u));
-         inst->force_writemask_all = true;
-         src_reg one(this, glsl_type::uint_type);
-         inst = emit(MOV(dst_reg(one), 1u));
-         inst->force_writemask_all = true;
-         src_reg channel_mask(this, glsl_type::uint_type);
-         inst = emit(SHL(dst_reg(channel_mask), one, channel));
-         inst->force_writemask_all = true;
-         emit(GS_OPCODE_PREPARE_CHANNEL_MASKS, dst_reg(channel_mask));
-         emit(GS_OPCODE_SET_CHANNEL_MASKS, mrf_reg, channel_mask);
-      }
-
-      /* Store the control data bits in the message payload and send it. */
-      dst_reg mrf_reg2(MRF, base_mrf + 1);
-      inst = emit(MOV(mrf_reg2, this->control_data_bits));
+      src_reg one(this, glsl_type::uint_type);
+      inst = emit(MOV(dst_reg(one), 1u));
+      inst->force_writemask_all = true;
+      src_reg channel_mask(this, glsl_type::uint_type);
+      inst = emit(SHL(dst_reg(channel_mask), one, channel));
      inst->force_writemask_all = true;
-      inst = emit(GS_OPCODE_URB_WRITE);
-      inst->urb_write_flags = urb_write_flags;
-      inst->base_mrf = base_mrf;
-      inst->mlen = 2;
+      emit(GS_OPCODE_PREPARE_CHANNEL_MASKS, dst_reg(channel_mask),
+           channel_mask);
+      emit(GS_OPCODE_SET_CHANNEL_MASKS, mrf_reg, channel_mask);
   }
-   emit(BRW_OPCODE_ENDIF);
+
+   /* Store the control data bits in the message payload and send it. */
+   dst_reg mrf_reg2(MRF, base_mrf + 1);
+   inst = emit(MOV(mrf_reg2, this->control_data_bits));
+   inst->force_writemask_all = true;
+   inst = emit(GS_OPCODE_URB_WRITE);
+   inst->urb_write_flags = urb_write_flags;
+   /* We need to increment Global Offset by 256-bits to make room for
+    * Broadwell's extra "Vertex Count" payload at the beginning of the
+    * URB entry. Since this is an OWord message, Global Offset is counted
+    * in 128-bit units, so we must set it to 2.
+    */
+   if (devinfo->gen >= 8)
+      inst->offset = 2;
+   inst->base_mrf = base_mrf;
+   inst->mlen = 2;
 }
 
+void
+vec4_gs_visitor::set_stream_control_data_bits(unsigned stream_id)
+{
+   /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */
+
+   /* Note: we are calling this *before* increasing vertex_count, so
+    * this->vertex_count == vertex_count - 1 in the formula above.
+    */
+
+   /* Stream mode uses 2 bits per vertex */
+   assert(c->control_data_bits_per_vertex == 2);
+
+   /* Must be a valid stream */
+   assert(stream_id >= 0 && stream_id < MAX_VERTEX_STREAMS);
+
+   /* Control data bits are initialized to 0 so we don't have to set any
+    * bits when sending vertices to stream 0.
+    */
+   if (stream_id == 0)
+      return;
+
+   /* reg::sid = stream_id */
+   src_reg sid(this, glsl_type::uint_type);
+   emit(MOV(dst_reg(sid), stream_id));
+
+   /* reg:shift_count = 2 * (vertex_count - 1) */
+   src_reg shift_count(this, glsl_type::uint_type);
+   emit(SHL(dst_reg(shift_count), this->vertex_count, 1u));
+
+   /* Note: we're relying on the fact that the GEN SHL instruction only pays
+    * attention to the lower 5 bits of its second source argument, so on this
+    * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to
+    * stream_id << ((2 * (vertex_count - 1)) % 32).
+    */
+   src_reg mask(this, glsl_type::uint_type);
+   emit(SHL(dst_reg(mask), sid, shift_count));
+   emit(OR(dst_reg(this->control_data_bits), this->control_data_bits, mask));
+}
 
 void
-vec4_gs_visitor::visit(ir_emit_vertex *)
+vec4_gs_visitor::visit(ir_emit_vertex *ir)
 {
   this->current_annotation = "emit vertex: safety check";
 
+   /* Haswell and later hardware ignores the "Render Stream Select" bits
+    * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled,
+    * and instead sends all primitives down the pipeline for rasterization.
+    * If the SOL stage is enabled, "Render Stream Select" is honored and
+    * primitives bound to non-zero streams are discarded after stream output.
+    *
+    * Since the only purpose of primitives sent to non-zero streams is to
+    * be recorded by transform feedback, we can simply discard all geometry
+    * bound to these streams when transform feedback is disabled.
+    */
+   if (ir->stream_id() > 0 && shader_prog->TransformFeedback.NumVarying == 0)
+      return;
+
   /* To ensure that we don't output more vertices than the shader specified
    * using max_vertices, do the logic inside a conditional of the form "if
    * (vertex_count < MAX)"
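
The rewritten emit_control_data_bits() above locates a vertex's bits entirely with shift arithmetic, since bits_per_vertex is a power of two. The same bookkeeping as plain C++ (an illustrative sketch, not part of the patch; the generated shader code does this with ADD/SHR/AND on registers, and the channel mask is further combined across invocations by GS_OPCODE_PREPARE_CHANNEL_MASKS):

   #include <cstdint>

   struct control_data_slot {
      uint32_t dword_index;      // DWord of the control data header
      uint32_t per_slot_offset;  // dword_index / 4: which OWord to write
      uint32_t channel_mask;     // 1 << (dword_index % 4): DWord in the OWord
   };

   // vertex_count is the 1-based count including the vertex being emitted;
   // bits_per_vertex is 1 (cut bits) or 2 (StreamID bits).
   static control_data_slot
   locate_control_data_bits(uint32_t vertex_count, uint32_t bits_per_vertex)
   {
      control_data_slot s;
      s.dword_index = (vertex_count - 1) * bits_per_vertex / 32;
      s.per_slot_offset = s.dword_index >> 2;
      s.channel_mask = 1u << (s.dword_index & 3);
      return s;
   }
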
+ */ + if (stream_id == 0) + return; + + /* reg::sid = stream_id */ + src_reg sid(this, glsl_type::uint_type); + emit(MOV(dst_reg(sid), stream_id)); + + /* reg:shift_count = 2 * (vertex_count - 1) */ + src_reg shift_count(this, glsl_type::uint_type); + emit(SHL(dst_reg(shift_count), this->vertex_count, 1u)); + + /* Note: we're relying on the fact that the GEN SHL instruction only pays + * attention to the lower 5 bits of its second source argument, so on this + * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to + * stream_id << ((2 * (vertex_count - 1)) % 32). + */ + src_reg mask(this, glsl_type::uint_type); + emit(SHL(dst_reg(mask), sid, shift_count)); + emit(OR(dst_reg(this->control_data_bits), this->control_data_bits, mask)); +} void -vec4_gs_visitor::visit(ir_emit_vertex *) +vec4_gs_visitor::visit(ir_emit_vertex *ir) { this->current_annotation = "emit vertex: safety check"; + /* Haswell and later hardware ignores the "Render Stream Select" bits + * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled, + * and instead sends all primitives down the pipeline for rasterization. + * If the SOL stage is enabled, "Render Stream Select" is honored and + * primitives bound to non-zero streams are discarded after stream output. + * + * Since the only purpose of primives sent to non-zero streams is to + * be recorded by transform feedback, we can simply discard all geometry + * bound to these streams when transform feedback is disabled. + */ + if (ir->stream_id() > 0 && shader_prog->TransformFeedback.NumVarying == 0) + return; + /* To ensure that we don't output more vertices than the shader specified * using max_vertices, do the logic inside a conditional of the form "if * (vertex_count < MAX)" @@ -436,9 +524,17 @@ vec4_gs_visitor::visit(ir_emit_vertex *) emit(AND(dst_null_d(), this->vertex_count, (uint32_t) (32 / c->control_data_bits_per_vertex - 1))); inst->conditional_mod = BRW_CONDITIONAL_Z; + emit(IF(BRW_PREDICATE_NORMAL)); { + /* If vertex_count is 0, then no control data bits have been + * accumulated yet, so we skip emitting them. + */ + emit(CMP(dst_null_d(), this->vertex_count, 0u, + BRW_CONDITIONAL_NEQ)); + emit(IF(BRW_PREDICATE_NORMAL)); emit_control_data_bits(); + emit(BRW_OPCODE_ENDIF); /* Reset control_data_bits to 0 so we can start accumulating a new * batch. @@ -456,6 +552,17 @@ vec4_gs_visitor::visit(ir_emit_vertex *) this->current_annotation = "emit vertex: vertex data"; emit_vertex(); + /* In stream mode we have to set control data bits for all vertices + * unless we have disabled control data bits completely (which we do + * do for GL_POINTS outputs that don't use streams). 
+ */ + if (c->control_data_header_size_bits > 0 && + c->prog_data.control_data_format == + GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) { + this->current_annotation = "emit vertex: Stream control data bits"; + set_stream_control_data_bits(ir->stream_id()); + } + this->current_annotation = "emit vertex: increment vertex count"; emit(ADD(dst_reg(this->vertex_count), this->vertex_count, src_reg(1u))); @@ -516,6 +623,20 @@ vec4_gs_visitor::visit(ir_end_primitive *) emit(OR(dst_reg(this->control_data_bits), this->control_data_bits, mask)); } +static const unsigned * +generate_assembly(struct brw_context *brw, + struct gl_shader_program *shader_prog, + struct gl_program *prog, + struct brw_vue_prog_data *prog_data, + void *mem_ctx, + const cfg_t *cfg, + unsigned *final_assembly_size) +{ + vec4_generator g(brw->intelScreen->compiler, brw, + shader_prog, prog, prog_data, mem_ctx, + INTEL_DEBUG & DEBUG_GS, "geometry", "GS"); + return g.generate_assembly(cfg, final_assembly_size); +} extern "C" const unsigned * brw_gs_emit(struct brw_context *brw, @@ -524,28 +645,87 @@ brw_gs_emit(struct brw_context *brw, void *mem_ctx, unsigned *final_assembly_size) { - struct brw_shader *shader = - (brw_shader *) prog->_LinkedShaders[MESA_SHADER_GEOMETRY]; - if (unlikely(INTEL_DEBUG & DEBUG_GS)) { - printf("GLSL IR for native geometry shader %d:\n", prog->Name); - _mesa_print_ir(shader->ir, NULL); - printf("\n\n"); + struct brw_shader *shader = + (brw_shader *) prog->_LinkedShaders[MESA_SHADER_GEOMETRY]; + + brw_dump_ir("geometry", prog, &shader->base, NULL); } - vec4_gs_visitor v(brw, c, prog, shader, mem_ctx); - if (!v.run()) { - prog->LinkStatus = false; - ralloc_strcat(&prog->InfoLog, v.fail_msg); - return NULL; + int st_index = -1; + if (INTEL_DEBUG & DEBUG_SHADER_TIME) + st_index = brw_get_shader_time_index(brw, prog, NULL, ST_GS); + + if (brw->gen >= 7) { + /* Compile the geometry shader in DUAL_OBJECT dispatch mode, if we can do + * so without spilling. If the GS invocations count > 1, then we can't use + * dual object mode. + */ + if (c->prog_data.invocations <= 1 && + likely(!(INTEL_DEBUG & DEBUG_NO_DUAL_OBJECT_GS))) { + c->prog_data.base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT; + + vec4_gs_visitor v(brw->intelScreen->compiler, brw, + c, prog, mem_ctx, true /* no_spills */, st_index); + if (v.run(NULL /* clip planes */)) { + return generate_assembly(brw, prog, &c->gp->program.Base, + &c->prog_data.base, mem_ctx, v.cfg, + final_assembly_size); + } + } } - vec4_generator g(brw, prog, &c->gp->program.Base, &c->prog_data.base, - mem_ctx, INTEL_DEBUG & DEBUG_GS); - const unsigned *generated = - g.generate_assembly(&v.instructions, final_assembly_size); + /* Either we failed to compile in DUAL_OBJECT mode (probably because it + * would have required spilling) or DUAL_OBJECT mode is disabled. So fall + * back to DUAL_INSTANCED or SINGLE mode, which consumes fewer registers. + * + * FIXME: Single dispatch mode requires that the driver can handle + * interleaving of input registers, but this is already supported (dual + * instance mode has the same requirement). However, to take full advantage + * of single dispatch mode to reduce register pressure we would also need to + * do interleaved outputs, but currently, the vec4 visitor and generator + * classes do not support this, so at the moment register pressure in + * single and dual instance modes is the same. + * + * From the Ivy Bridge PRM, Vol2 Part1 7.2.1.1 "3DSTATE_GS" + * "If InstanceCount>1, DUAL_OBJECT mode is invalid. 
+    * want to use DUAL_INSTANCE mode for higher performance, but SINGLE mode
+    * is also supported. When InstanceCount=1 (one instance per object) software
+    * can decide which dispatch mode to use. DUAL_OBJECT mode would likely be
+    * the best choice for performance, followed by SINGLE mode."
+    *
+    * So SINGLE mode is more performant when invocations == 1 and DUAL_INSTANCE
+    * mode is more performant when invocations > 1. Gen6 only supports
+    * SINGLE mode.
+    */
+   if (c->prog_data.invocations <= 1 || brw->gen < 7)
+      c->prog_data.base.dispatch_mode = DISPATCH_MODE_4X1_SINGLE;
+   else
+      c->prog_data.base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_INSTANCE;
+
+   vec4_gs_visitor *gs = NULL;
+   const unsigned *ret = NULL;
+
+   if (brw->gen >= 7)
+      gs = new vec4_gs_visitor(brw->intelScreen->compiler, brw,
+                               c, prog, mem_ctx, false /* no_spills */,
+                               st_index);
+   else
+      gs = new gen6_gs_visitor(brw->intelScreen->compiler, brw,
+                               c, prog, mem_ctx, false /* no_spills */,
+                               st_index);
+
+   if (!gs->run(NULL /* clip planes */)) {
+      prog->LinkStatus = false;
+      ralloc_strcat(&prog->InfoLog, gs->fail_msg);
+   } else {
+      ret = generate_assembly(brw, prog, &c->gp->program.Base,
+                              &c->prog_data.base, mem_ctx, gs->cfg,
+                              final_assembly_size);
+   }
 
-   return generated;
+   delete gs;
+   return ret;
 }
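
The brw_gs_emit() rework in the final hunk boils down to a small dispatch-mode decision. A condensed sketch (illustrative only; the real code also re-runs the visitor for the fallback path and threads the shader-time index through):

   enum gs_dispatch_mode { GS_SINGLE, GS_DUAL_INSTANCE, GS_DUAL_OBJECT };

   // Gen7+ first tries DUAL_OBJECT when there is a single invocation and the
   // no-spill compile succeeds (and DEBUG_NO_DUAL_OBJECT_GS is not set);
   // otherwise it falls back to the cheaper modes.  Gen6 only supports SINGLE.
   static gs_dispatch_mode
   choose_gs_dispatch_mode(int gen, unsigned invocations, bool dual_object_ok)
   {
      if (gen >= 7 && invocations <= 1 && dual_object_ok)
         return GS_DUAL_OBJECT;
      return (invocations <= 1 || gen < 7) ? GS_SINGLE : GS_DUAL_INSTANCE;
   }
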