X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;ds=sidebyside;f=src%2Fmesa%2Fdrivers%2Fdri%2Fi965%2Fbrw_gs.c;h=8f5dcf359e960bd13594ef1e805ddc6ab2910a2a;hb=c78edcea8b256743fb38c7cd519b3324e4716143;hp=4d0b125ffe4024b651bd06ebfb086f16fe2b42d2;hpb=ef56cf7738ecb25e8c668c509097fc714ca71c96;p=mesa.git diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c index 4d0b125ffe4..8f5dcf359e9 100644 --- a/src/mesa/drivers/dri/i965/brw_gs.c +++ b/src/mesa/drivers/dri/i965/brw_gs.c @@ -33,6 +33,46 @@ #include "brw_state.h" #include "brw_ff_gs.h" #include "brw_nir.h" +#include "brw_program.h" +#include "compiler/glsl/ir_uniform.h" + +static void +brw_gs_debug_recompile(struct brw_context *brw, + struct gl_shader_program *shader_prog, + const struct brw_gs_prog_key *key) +{ + struct brw_cache_item *c = NULL; + const struct brw_gs_prog_key *old_key = NULL; + bool found = false; + + perf_debug("Recompiling geometry shader for program %d\n", + shader_prog->Name); + + for (unsigned int i = 0; i < brw->cache.size; i++) { + for (c = brw->cache.items[i]; c; c = c->next) { + if (c->cache_id == BRW_CACHE_GS_PROG) { + old_key = c->key; + + if (old_key->program_string_id == key->program_string_id) + break; + } + } + if (c) + break; + } + + if (!c) { + perf_debug(" Didn't find previous compile in the shader cache for " + "debug\n"); + return; + } + + found |= brw_debug_recompile_sampler_key(brw, &old_key->tex, &key->tex); + + if (!found) { + perf_debug(" Something else\n"); + } +} static void assign_gs_binding_table_offsets(const struct brw_device_info *devinfo, @@ -52,29 +92,22 @@ assign_gs_binding_table_offsets(const struct brw_device_info *devinfo, } bool -brw_compile_gs_prog(struct brw_context *brw, +brw_codegen_gs_prog(struct brw_context *brw, struct gl_shader_program *prog, struct brw_geometry_program *gp, - struct brw_gs_prog_key *key, - struct brw_gs_compile_output *output) + struct brw_gs_prog_key *key) { - struct brw_gs_compile c; - memset(&c, 0, sizeof(c)); - c.key = *key; - c.gp = gp; - - /* We get the bind map as input in the output struct...*/ - c.prog_data.base.base.map_entries = output->prog_data.base.base.map_entries; - memcpy(c.prog_data.base.base.bind_map, output->prog_data.base.base.bind_map, - sizeof(c.prog_data.base.base.bind_map)); - - c.prog_data.include_primitive_id = - (gp->program.Base.InputsRead & VARYING_BIT_PRIMITIVE_ID) != 0; + struct brw_compiler *compiler = brw->intelScreen->compiler; + struct gl_shader *shader = prog->_LinkedShaders[MESA_SHADER_GEOMETRY]; + struct brw_stage_state *stage_state = &brw->gs.base; + struct brw_gs_prog_data prog_data; + bool start_busy = false; + double start_time = 0; - c.prog_data.invocations = gp->program.Invocations; + memset(&prog_data, 0, sizeof(prog_data)); assign_gs_binding_table_offsets(brw->intelScreen->devinfo, prog, - &gp->program.Base, &c.prog_data); + &gp->program.Base, &prog_data); /* Allocate the references to the uniforms that will end up in the * prog_data associated with the compiled program, and which will be freed @@ -85,265 +118,88 @@ brw_compile_gs_prog(struct brw_context *brw, * every uniform is a float which gets padded to the size of a vec4. */ struct gl_shader *gs = prog->_LinkedShaders[MESA_SHADER_GEOMETRY]; - int param_count = gp->program.Base.nir->num_uniforms * 4; + struct brw_shader *bgs = (struct brw_shader *) gs; + int param_count = gp->program.Base.nir->num_uniforms; + if (!compiler->scalar_stage[MESA_SHADER_GEOMETRY]) + param_count *= 4; - c.prog_data.base.base.param = + prog_data.base.base.param = rzalloc_array(NULL, const gl_constant_value *, param_count); - c.prog_data.base.base.pull_param = + prog_data.base.base.pull_param = rzalloc_array(NULL, const gl_constant_value *, param_count); - c.prog_data.base.base.image_param = + prog_data.base.base.image_param = rzalloc_array(NULL, struct brw_image_param, gs->NumImages); - c.prog_data.base.base.nr_params = param_count; - c.prog_data.base.base.nr_image_params = gs->NumImages; + prog_data.base.base.nr_params = param_count; + prog_data.base.base.nr_image_params = gs->NumImages; brw_nir_setup_glsl_uniforms(gp->program.Base.nir, prog, &gp->program.Base, - &c.prog_data.base.base, false); - - if (brw->gen >= 8) { - c.prog_data.static_vertex_count = !gp->program.Base.nir ? -1 : - nir_gs_count_vertices(gp->program.Base.nir); - } - - if (brw->gen >= 7) { - if (gp->program.OutputType == GL_POINTS) { - /* When the output type is points, the geometry shader may output data - * to multiple streams, and EndPrimitive() has no effect. So we - * configure the hardware to interpret the control data as stream ID. - */ - c.prog_data.control_data_format = GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID; - - /* We only have to emit control bits if we are using streams */ - if (prog->Geom.UsesStreams) - c.control_data_bits_per_vertex = 2; - else - c.control_data_bits_per_vertex = 0; - } else { - /* When the output type is triangle_strip or line_strip, EndPrimitive() - * may be used to terminate the current strip and start a new one - * (similar to primitive restart), and outputting data to multiple - * streams is not supported. So we configure the hardware to interpret - * the control data as EndPrimitive information (a.k.a. "cut bits"). - */ - c.prog_data.control_data_format = GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT; - - /* We only need to output control data if the shader actually calls - * EndPrimitive(). - */ - c.control_data_bits_per_vertex = gp->program.UsesEndPrimitive ? 1 : 0; - } - } else { - /* There are no control data bits in gen6. */ - c.control_data_bits_per_vertex = 0; - - /* If it is using transform feedback, enable it */ - if (prog->TransformFeedback.NumVarying) - c.prog_data.gen6_xfb_enabled = true; - else - c.prog_data.gen6_xfb_enabled = false; - } - c.control_data_header_size_bits = - gp->program.VerticesOut * c.control_data_bits_per_vertex; - - /* 1 HWORD = 32 bytes = 256 bits */ - c.prog_data.control_data_header_size_hwords = - ALIGN(c.control_data_header_size_bits, 256) / 256; + &prog_data.base.base, + compiler->scalar_stage[MESA_SHADER_GEOMETRY]); GLbitfield64 outputs_written = gp->program.Base.OutputsWritten; - brw_compute_vue_map(brw->intelScreen->devinfo, - &c.prog_data.base.vue_map, outputs_written, - prog ? prog->SeparateShader : false); - - /* Compute the output vertex size. - * - * From the Ivy Bridge PRM, Vol2 Part1 7.2.1.1 STATE_GS - Output Vertex - * Size (p168): - * - * [0,62] indicating [1,63] 16B units - * - * Specifies the size of each vertex stored in the GS output entry - * (following any Control Header data) as a number of 128-bit units - * (minus one). - * - * Programming Restrictions: The vertex size must be programmed as a - * multiple of 32B units with the following exception: Rendering is - * disabled (as per SOL stage state) and the vertex size output by the - * GS thread is 16B. - * - * If rendering is enabled (as per SOL state) the vertex size must be - * programmed as a multiple of 32B units. In other words, the only time - * software can program a vertex size with an odd number of 16B units - * is when rendering is disabled. - * - * Note: B=bytes in the above text. - * - * It doesn't seem worth the extra trouble to optimize the case where the - * vertex size is 16B (especially since this would require special-casing - * the GEN assembly that writes to the URB). So we just set the vertex - * size to a multiple of 32B (2 vec4's) in all cases. - * - * The maximum output vertex size is 62*16 = 992 bytes (31 hwords). We - * budget that as follows: - * - * 512 bytes for varyings (a varying component is 4 bytes and - * gl_MaxGeometryOutputComponents = 128) - * 16 bytes overhead for VARYING_SLOT_PSIZ (each varying slot is 16 - * bytes) - * 16 bytes overhead for gl_Position (we allocate it a slot in the VUE - * even if it's not used) - * 32 bytes overhead for gl_ClipDistance (we allocate it 2 VUE slots - * whenever clip planes are enabled, even if the shader doesn't - * write to gl_ClipDistance) - * 16 bytes overhead since the VUE size must be a multiple of 32 bytes - * (see below)--this causes up to 1 VUE slot to be wasted - * 400 bytes available for varying packing overhead - * - * Worst-case varying packing overhead is 3/4 of a varying slot (12 bytes) - * per interpolation type, so this is plenty. - * - */ - unsigned output_vertex_size_bytes = c.prog_data.base.vue_map.num_slots * 16; - assert(brw->gen == 6 || - output_vertex_size_bytes <= GEN7_MAX_GS_OUTPUT_VERTEX_SIZE_BYTES); - c.prog_data.output_vertex_size_hwords = - ALIGN(output_vertex_size_bytes, 32) / 32; - - /* Compute URB entry size. The maximum allowed URB entry size is 32k. - * That divides up as follows: - * - * 64 bytes for the control data header (cut indices or StreamID bits) - * 4096 bytes for varyings (a varying component is 4 bytes and - * gl_MaxGeometryTotalOutputComponents = 1024) - * 4096 bytes overhead for VARYING_SLOT_PSIZ (each varying slot is 16 - * bytes/vertex and gl_MaxGeometryOutputVertices is 256) - * 4096 bytes overhead for gl_Position (we allocate it a slot in the VUE - * even if it's not used) - * 8192 bytes overhead for gl_ClipDistance (we allocate it 2 VUE slots - * whenever clip planes are enabled, even if the shader doesn't - * write to gl_ClipDistance) - * 4096 bytes overhead since the VUE size must be a multiple of 32 - * bytes (see above)--this causes up to 1 VUE slot to be wasted - * 8128 bytes available for varying packing overhead - * - * Worst-case varying packing overhead is 3/4 of a varying slot per - * interpolation type, which works out to 3072 bytes, so this would allow - * us to accommodate 2 interpolation types without any danger of running - * out of URB space. - * - * In practice, the risk of running out of URB space is very small, since - * the above figures are all worst-case, and most of them scale with the - * number of output vertices. So we'll just calculate the amount of space - * we need, and if it's too large, fail to compile. - * - * The above is for gen7+ where we have a single URB entry that will hold - * all the output. In gen6, we will have to allocate URB entries for every - * vertex we emit, so our URB entries only need to be large enough to hold - * a single vertex. Also, gen6 does not have a control data header. - */ - unsigned output_size_bytes; - if (brw->gen >= 7) { - output_size_bytes = - c.prog_data.output_vertex_size_hwords * 32 * gp->program.VerticesOut; - output_size_bytes += 32 * c.prog_data.control_data_header_size_hwords; - } else { - output_size_bytes = c.prog_data.output_vertex_size_hwords * 32; - } - - /* Broadwell stores "Vertex Count" as a full 8 DWord (32 byte) URB output, - * which comes before the control header. - */ - if (brw->gen >= 8) - output_size_bytes += 32; - - assert(output_size_bytes >= 1); - int max_output_size_bytes = GEN7_MAX_GS_URB_ENTRY_SIZE_BYTES; - if (brw->gen == 6) - max_output_size_bytes = GEN6_MAX_GS_URB_ENTRY_SIZE_BYTES; - if (output_size_bytes > max_output_size_bytes) - return false; - + prog_data.base.cull_distance_mask = + ((1 << gp->program.Base.CullDistanceArraySize) - 1) << + gp->program.Base.ClipDistanceArraySize; - /* URB entry sizes are stored as a multiple of 64 bytes in gen7+ and - * a multiple of 128 bytes in gen6. - */ - if (brw->gen >= 7) - c.prog_data.base.urb_entry_size = ALIGN(output_size_bytes, 64) / 64; - else - c.prog_data.base.urb_entry_size = ALIGN(output_size_bytes, 128) / 128; - - c.prog_data.output_topology = - get_hw_prim_for_gl_prim(gp->program.OutputType); - - /* The GLSL linker will have already matched up GS inputs and the outputs - * of prior stages. The driver does extend VS outputs in some cases, but - * only for legacy OpenGL or Gen4-5 hardware, neither of which offer - * geometry shader support. So we can safely ignore that. - * - * For SSO pipelines, we use a fixed VUE map layout based on variable - * locations, so we can rely on rendezvous-by-location making this work. - * - * However, we need to ignore VARYING_SLOT_PRIMITIVE_ID, as it's not - * written by previous stages and shows up via payload magic. - */ - GLbitfield64 inputs_read = - gp->program.Base.InputsRead & ~VARYING_BIT_PRIMITIVE_ID; brw_compute_vue_map(brw->intelScreen->devinfo, - &c.input_vue_map, inputs_read, + &prog_data.base.vue_map, outputs_written, prog->SeparateShader); - /* GS inputs are read from the VUE 256 bits (2 vec4's) at a time, so we - * need to program a URB read length of ceiling(num_slots / 2). - */ - c.prog_data.base.urb_read_length = (c.input_vue_map.num_slots + 1) / 2; + if (unlikely(INTEL_DEBUG & DEBUG_GS)) + brw_dump_ir("geometry", prog, gs, NULL); + + int st_index = -1; + if (INTEL_DEBUG & DEBUG_SHADER_TIME) + st_index = brw_get_shader_time_index(brw, prog, NULL, ST_GS); + + if (unlikely(brw->perf_debug)) { + start_busy = brw->batch.last_bo && drm_intel_bo_busy(brw->batch.last_bo); + start_time = get_time(); + } void *mem_ctx = ralloc_context(NULL); unsigned program_size; + char *error_str; const unsigned *program = - brw_gs_emit(brw, prog, &c, mem_ctx, &program_size); + brw_compile_gs(brw->intelScreen->compiler, brw, mem_ctx, key, + &prog_data, shader->Program->nir, prog, + st_index, &program_size, &error_str); if (program == NULL) { ralloc_free(mem_ctx); return false; } - output->mem_ctx = mem_ctx; - output->program = program; - output->program_size = program_size; - memcpy(&output->prog_data, &c.prog_data, - sizeof(output->prog_data)); - - return true; -} - -bool -brw_codegen_gs_prog(struct brw_context *brw, - struct gl_shader_program *prog, - struct brw_geometry_program *gp, - struct brw_gs_prog_key *key) -{ - struct brw_gs_compile_output output; - struct brw_stage_state *stage_state = &brw->gs.base; - - if (brw_compile_gs_prog(brw, prog, gp, key, &output)) - return false; + if (unlikely(brw->perf_debug)) { + if (bgs->compiled_once) { + brw_gs_debug_recompile(brw, prog, key); + } + if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) { + perf_debug("GS compile took %.03f ms and stalled the GPU\n", + (get_time() - start_time) * 1000); + } + bgs->compiled_once = true; + } - if (output.prog_data.base.base.total_scratch) { + /* Scratch space is used for register spilling */ + if (prog_data.base.base.total_scratch) { brw_get_scratch_bo(brw, &stage_state->scratch_bo, - output.prog_data.base.base.total_scratch * + prog_data.base.base.total_scratch * brw->max_gs_threads); } brw_upload_cache(&brw->cache, BRW_CACHE_GS_PROG, key, sizeof(*key), - output.program, output.program_size, - &output.prog_data, sizeof(output.prog_data), + program, program_size, + &prog_data, sizeof(prog_data), &stage_state->prog_offset, &brw->gs.prog_data); - ralloc_free(output.mem_ctx); + ralloc_free(mem_ctx); return true; } static bool -brw_gs_state_dirty(struct brw_context *brw) +brw_gs_state_dirty(const struct brw_context *brw) { return brw_state_dirty(brw, _NEW_TEXTURE,