From: Kenneth Graunke Date: Thu, 26 Nov 2015 01:54:22 +0000 (-0800) Subject: i965: Relase input URB Handles on Gen7/7.5 when TCS threads finish. X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=b7793783b3df94880655234bc2a9054eddf01913;p=mesa.git i965: Relase input URB Handles on Gen7/7.5 when TCS threads finish. Pre-Broadwell hardware requires us to manually release the ICP Handles by issuing URB read messages with the "Complete" bit set. We can do this in pairs to use fewer URB read messages. Based heavily on work from Chris Forbes. Signed-off-by: Kenneth Graunke Reviewed-by: Edward O'Callaghan Reviewed-by: Jordan Justen --- diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h index 61bcebdbc4b..d0137481c6c 100644 --- a/src/mesa/drivers/dri/i965/brw_defines.h +++ b/src/mesa/drivers/dri/i965/brw_defines.h @@ -1313,6 +1313,8 @@ enum opcode { TCS_OPCODE_SET_OUTPUT_URB_OFFSETS, TCS_OPCODE_GET_PRIMITIVE_ID, TCS_OPCODE_CREATE_BARRIER_HEADER, + TCS_OPCODE_SRC0_010_IS_ZERO, + TCS_OPCODE_RELEASE_INPUT, TES_OPCODE_GET_PRIMITIVE_ID, TES_OPCODE_CREATE_INPUT_READ_HEADER, diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp index 3a36678e8d5..f692bc2de35 100644 --- a/src/mesa/drivers/dri/i965/brw_shader.cpp +++ b/src/mesa/drivers/dri/i965/brw_shader.cpp @@ -568,6 +568,10 @@ brw_instruction_name(enum opcode op) return "tcs_get_primitive_id"; case TCS_OPCODE_CREATE_BARRIER_HEADER: return "tcs_create_barrier_header"; + case TCS_OPCODE_SRC0_010_IS_ZERO: + return "tcs_src0<0,1,0>_is_zero"; + case TCS_OPCODE_RELEASE_INPUT: + return "tcs_release_input"; case TES_OPCODE_CREATE_INPUT_READ_HEADER: return "tes_create_input_read_header"; case TES_OPCODE_ADD_INDIRECT_URB_OFFSET: @@ -1009,6 +1013,7 @@ backend_instruction::has_side_effects() const case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT: case FS_OPCODE_FB_WRITE: case SHADER_OPCODE_BARRIER: + case TCS_OPCODE_RELEASE_INPUT: return true; default: return false; diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp index 116dd353016..f1c3d37ce1c 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp @@ -157,6 +157,7 @@ vec4_instruction::is_send_from_grf() case SHADER_OPCODE_TYPED_SURFACE_WRITE: case VEC4_OPCODE_URB_READ: case TCS_OPCODE_URB_WRITE: + case TCS_OPCODE_RELEASE_INPUT: case SHADER_OPCODE_BARRIER: return true; default: diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp index cbf8b1d0bd0..cce2b4d1f4c 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp @@ -932,6 +932,42 @@ generate_vec4_urb_read(struct brw_codegen *p, brw_inst_set_urb_global_offset(devinfo, send, inst->offset); } +static void +generate_tcs_release_input(struct brw_codegen *p, + struct brw_reg header, + struct brw_reg vertex, + struct brw_reg is_unpaired) +{ + const struct brw_device_info *devinfo = p->devinfo; + + assert(vertex.file == BRW_IMMEDIATE_VALUE); + assert(vertex.type == BRW_REGISTER_TYPE_UD); + + /* m0.0-0.1: URB handles */ + struct brw_reg urb_handles = + retype(brw_vec2_grf(1 + (vertex.ud >> 3), vertex.ud & 7), + BRW_REGISTER_TYPE_UD); + + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_MOV(p, header, brw_imm_ud(0)); + brw_MOV(p, vec2(get_element_ud(header, 0)), urb_handles); + brw_pop_insn_state(p); + + brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); + brw_set_dest(p, send, brw_null_reg()); + brw_set_src0(p, send, header); + brw_set_message_descriptor(p, send, BRW_SFID_URB, + 1 /* mlen */, 0 /* rlen */, + true /* header */, false /* eot */); + brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_READ_OWORD); + brw_inst_set_urb_complete(devinfo, send, 1); + brw_inst_set_urb_swizzle_control(devinfo, send, is_unpaired.ud ? + BRW_URB_SWIZZLE_NONE : + BRW_URB_SWIZZLE_INTERLEAVE); +} + static void generate_tes_get_primitive_id(struct brw_codegen *p, struct brw_reg dst) { @@ -1846,6 +1882,16 @@ generate_code(struct brw_codegen *p, generate_tes_get_primitive_id(p, dst); break; + case TCS_OPCODE_SRC0_010_IS_ZERO: + /* If src_reg had stride like fs_reg, we wouldn't need this. */ + brw_MOV(p, brw_null_reg(), stride(src[0], 0, 1, 0)); + brw_inst_set_cond_modifier(devinfo, brw_last_inst, BRW_CONDITIONAL_Z); + break; + + case TCS_OPCODE_RELEASE_INPUT: + generate_tcs_release_input(p, dst, src[0], src[1]); + break; + case SHADER_OPCODE_BARRIER: brw_barrier(p, src[0]); brw_WAIT(p); diff --git a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp index 507db749e63..7693f095a52 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp @@ -156,16 +156,54 @@ vec4_tcs_visitor::emit_prolog() void vec4_tcs_visitor::emit_thread_end() { + vec4_instruction *inst; current_annotation = "thread end"; if (nir->info.tcs.vertices_out % 2) { emit(BRW_OPCODE_ENDIF); } + if (devinfo->gen == 7) { + struct brw_tcs_prog_data *tcs_prog_data = + (struct brw_tcs_prog_data *) prog_data; + + current_annotation = "release input vertices"; + + /* Synchronize all threads, so we know that no one is still + * using the input URB handles. + */ + if (tcs_prog_data->instances > 1) { + dst_reg header = dst_reg(this, glsl_type::uvec4_type); + emit(TCS_OPCODE_CREATE_BARRIER_HEADER, header); + emit(SHADER_OPCODE_BARRIER, dst_null_ud(), src_reg(header)); + } + + /* Make thread 0 (invocations <1, 0>) release pairs of ICP handles. + * We want to compare the bottom half of invocation_id with 0, but + * use that truth value for the top half as well. Unfortunately, + * we don't have stride in the vec4 world, nor UV immediates in + * align16, so we need an opcode to get invocation_id<0,4,0>. + */ + emit(TCS_OPCODE_SRC0_010_IS_ZERO, dst_null_d(), invocation_id); + emit(IF(BRW_PREDICATE_NORMAL)); + for (unsigned i = 0; i < key->input_vertices; i += 2) { + /* If we have an odd number of input vertices, the last will be + * unpaired. We don't want to use an interleaved URB write in + * that case. + */ + const bool is_unpaired = i == key->input_vertices - 1; + + dst_reg header(this, glsl_type::uvec4_type); + emit(TCS_OPCODE_RELEASE_INPUT, header, brw_imm_ud(i), + brw_imm_ud(is_unpaired)); + } + emit(BRW_OPCODE_ENDIF); + } + if (unlikely(INTEL_DEBUG & DEBUG_SHADER_TIME)) emit_shader_time_end(); - vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE); + inst = emit(VS_OPCODE_URB_WRITE); inst->mlen = 1; /* just the header, no data. */ inst->urb_write_flags = BRW_URB_WRITE_EOT_COMPLETE; }