i965: Release input URB Handles on Gen7/7.5 when TCS threads finish.
author Kenneth Graunke <kenneth@whitecape.org>
Thu, 26 Nov 2015 01:54:22 +0000 (17:54 -0800)
committer Kenneth Graunke <kenneth@whitecape.org>
Mon, 28 Dec 2015 21:17:00 +0000 (13:17 -0800)
Pre-Broadwell hardware requires us to manually release the ICP Handles
by issuing URB read messages with the "Complete" bit set.  We can do
this in pairs to use fewer URB read messages.

Based heavily on work from Chris Forbes.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
src/mesa/drivers/dri/i965/brw_defines.h
src/mesa/drivers/dri/i965/brw_shader.cpp
src/mesa/drivers/dri/i965/brw_vec4.cpp
src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp

index 61bcebdbc4b8f9d03f644d5143fa5f7f9a4482f5..d0137481c6c658a2b139bed47bba55cd5f9ef3fd 100644 (file)
@@ -1313,6 +1313,8 @@ enum opcode {
    TCS_OPCODE_SET_OUTPUT_URB_OFFSETS,
    TCS_OPCODE_GET_PRIMITIVE_ID,
    TCS_OPCODE_CREATE_BARRIER_HEADER,
+   TCS_OPCODE_SRC0_010_IS_ZERO,
+   TCS_OPCODE_RELEASE_INPUT,
 
    TES_OPCODE_GET_PRIMITIVE_ID,
    TES_OPCODE_CREATE_INPUT_READ_HEADER,
index 3a36678e8d5f18c133b0300cdb84cdfb929354a1..f692bc2de3598b85a5df9f8ff708972b409b10db 100644 (file)
@@ -568,6 +568,10 @@ brw_instruction_name(enum opcode op)
       return "tcs_get_primitive_id";
    case TCS_OPCODE_CREATE_BARRIER_HEADER:
       return "tcs_create_barrier_header";
+   case TCS_OPCODE_SRC0_010_IS_ZERO:
+      return "tcs_src0<0,1,0>_is_zero";
+   case TCS_OPCODE_RELEASE_INPUT:
+      return "tcs_release_input";
    case TES_OPCODE_CREATE_INPUT_READ_HEADER:
       return "tes_create_input_read_header";
    case TES_OPCODE_ADD_INDIRECT_URB_OFFSET:
@@ -1009,6 +1013,7 @@ backend_instruction::has_side_effects() const
    case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
    case FS_OPCODE_FB_WRITE:
    case SHADER_OPCODE_BARRIER:
+   case TCS_OPCODE_RELEASE_INPUT:
       return true;
    default:
       return false;
index 116dd353016b6504d648c3ddbf9ece7a939ee3f5..f1c3d37ce1c4ea534cb537a6187328d2a17df080 100644 (file)
@@ -157,6 +157,7 @@ vec4_instruction::is_send_from_grf()
    case SHADER_OPCODE_TYPED_SURFACE_WRITE:
    case VEC4_OPCODE_URB_READ:
    case TCS_OPCODE_URB_WRITE:
+   case TCS_OPCODE_RELEASE_INPUT:
    case SHADER_OPCODE_BARRIER:
       return true;
    default:
index cbf8b1d0bd0100cd26693448421523563f1fa6cd..cce2b4d1f4ccb02731955630d0ca7deeabbc3ca2 100644 (file)
@@ -932,6 +932,42 @@ generate_vec4_urb_read(struct brw_codegen *p,
    brw_inst_set_urb_global_offset(devinfo, send, inst->offset);
 }
 
+static void
+generate_tcs_release_input(struct brw_codegen *p,
+                           struct brw_reg header,
+                           struct brw_reg vertex,
+                           struct brw_reg is_unpaired)
+{
+   const struct brw_device_info *devinfo = p->devinfo;
+
+   assert(vertex.file == BRW_IMMEDIATE_VALUE);
+   assert(vertex.type == BRW_REGISTER_TYPE_UD);
+
+   /* m0.0-0.1: URB handles */
+   struct brw_reg urb_handles =
+      retype(brw_vec2_grf(1 + (vertex.ud >> 3), vertex.ud & 7),
+             BRW_REGISTER_TYPE_UD);
+
+   brw_push_insn_state(p);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+   brw_MOV(p, header, brw_imm_ud(0));
+   brw_MOV(p, vec2(get_element_ud(header, 0)), urb_handles);
+   brw_pop_insn_state(p);
+
+   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
+   brw_set_dest(p, send, brw_null_reg());
+   brw_set_src0(p, send, header);
+   brw_set_message_descriptor(p, send, BRW_SFID_URB,
+                              1 /* mlen */, 0 /* rlen */,
+                              true /* header */, false /* eot */);
+   brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_READ_OWORD);
+   brw_inst_set_urb_complete(devinfo, send, 1);
+   brw_inst_set_urb_swizzle_control(devinfo, send, is_unpaired.ud ?
+                                    BRW_URB_SWIZZLE_NONE :
+                                    BRW_URB_SWIZZLE_INTERLEAVE);
+}
+
 static void
 generate_tes_get_primitive_id(struct brw_codegen *p, struct brw_reg dst)
 {
@@ -1846,6 +1882,16 @@ generate_code(struct brw_codegen *p,
          generate_tes_get_primitive_id(p, dst);
          break;
 
+      case TCS_OPCODE_SRC0_010_IS_ZERO:
+         /* If src_reg had stride like fs_reg, we wouldn't need this. */
+         brw_MOV(p, brw_null_reg(), stride(src[0], 0, 1, 0));
+         brw_inst_set_cond_modifier(devinfo, brw_last_inst, BRW_CONDITIONAL_Z);
+         break;
+
+      case TCS_OPCODE_RELEASE_INPUT:
+         generate_tcs_release_input(p, dst, src[0], src[1]);
+         break;
+
       case SHADER_OPCODE_BARRIER:
          brw_barrier(p, src[0]);
          brw_WAIT(p);
index 507db749e63e4b4f7ad120ca1a4353cc297492c2..7693f095a52c5836697a17b5e488df6a3278d712 100644 (file)
@@ -156,16 +156,54 @@ vec4_tcs_visitor::emit_prolog()
 void
 vec4_tcs_visitor::emit_thread_end()
 {
+   vec4_instruction *inst;
    current_annotation = "thread end";
 
    if (nir->info.tcs.vertices_out % 2) {
       emit(BRW_OPCODE_ENDIF);
    }
 
+   if (devinfo->gen == 7) {
+      struct brw_tcs_prog_data *tcs_prog_data =
+         (struct brw_tcs_prog_data *) prog_data;
+
+      current_annotation = "release input vertices";
+
+      /* Synchronize all threads, so we know that no one is still
+       * using the input URB handles.
+       */
+      if (tcs_prog_data->instances > 1) {
+         dst_reg header = dst_reg(this, glsl_type::uvec4_type);
+         emit(TCS_OPCODE_CREATE_BARRIER_HEADER, header);
+         emit(SHADER_OPCODE_BARRIER, dst_null_ud(), src_reg(header));
+      }
+
+      /* Make thread 0 (invocations <1, 0>) release pairs of ICP handles.
+       * We want to compare the bottom half of invocation_id with 0, but
+       * use that truth value for the top half as well.  Unfortunately,
+       * we don't have stride in the vec4 world, nor UV immediates in
+       * align16, so we need an opcode to get invocation_id<0,1,0>.
+       */
+      emit(TCS_OPCODE_SRC0_010_IS_ZERO, dst_null_d(), invocation_id);
+      emit(IF(BRW_PREDICATE_NORMAL));
+      for (unsigned i = 0; i < key->input_vertices; i += 2) {
+         /* If we have an odd number of input vertices, the last will be
+          * unpaired.  We don't want to use an interleaved URB read in
+          * that case.
+          */
+         const bool is_unpaired = i == key->input_vertices - 1;
+
+         dst_reg header(this, glsl_type::uvec4_type);
+         emit(TCS_OPCODE_RELEASE_INPUT, header, brw_imm_ud(i),
+              brw_imm_ud(is_unpaired));
+      }
+      emit(BRW_OPCODE_ENDIF);
+   }
+
    if (unlikely(INTEL_DEBUG & DEBUG_SHADER_TIME))
       emit_shader_time_end();
 
-   vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
+   inst = emit(VS_OPCODE_URB_WRITE);
    inst->mlen = 1;   /* just the header, no data. */
    inst->urb_write_flags = BRW_URB_WRITE_EOT_COMPLETE;
 }