From: Kenneth Graunke <kenneth@whitecape.org>
Date: Sun, 15 Nov 2015 01:40:43 +0000 (-0800)
Subject: i965: Write a scalar TCS backend that runs in SINGLE_PATCH mode.
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=7d9143ad885752184156b3a0d3e492aef09af3b0;p=mesa.git

i965: Write a scalar TCS backend that runs in SINGLE_PATCH mode.

Unlike most shader stages, the Hull Shader hardware makes us explicitly
tell it how many threads to dispatch and manually configure the channel
mask.  One perk of this is that we have a lot of flexibility - we can
run it in either SIMD4x2 or SIMD8 mode.

Treating it as SIMD8 means that shaders with 8 or fewer output vertices
(which is overwhelmingly the common case) can be handled by a single
thread.  This has several intriguing properties:

- Accessing input arrays with gl_InvocationID as the index is a simple
  SIMD8 URB read with g1 as the header.  No indirect addressing required.
- Barriers are no-ops.
- We could potentially do output shadowing to combine writes, as the
  concurrency concerns are gone.  (We don't do this yet, though.)

v2: Drop first_non_payload_grf change, as it was always adding 0
    (caught by Jordan Justen).

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---

diff --git a/src/mesa/drivers/dri/i965/brw_compiler.c b/src/mesa/drivers/dri/i965/brw_compiler.c
index c8a38e3145c..61bb5ade282 100644
--- a/src/mesa/drivers/dri/i965/brw_compiler.c
+++ b/src/mesa/drivers/dri/i965/brw_compiler.c
@@ -152,7 +152,8 @@ brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo)
 
    compiler->scalar_stage[MESA_SHADER_VERTEX] =
       devinfo->gen >= 8 && !(INTEL_DEBUG & DEBUG_VEC4VS);
-   compiler->scalar_stage[MESA_SHADER_TESS_CTRL] = false;
+   compiler->scalar_stage[MESA_SHADER_TESS_CTRL] =
+      devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_TCS", false);
    compiler->scalar_stage[MESA_SHADER_TESS_EVAL] =
       devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_TES", true);
    compiler->scalar_stage[MESA_SHADER_GEOMETRY] =
@@ -194,6 +195,7 @@ brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo)
 
    compiler->glsl_compiler_options[MESA_SHADER_TESS_CTRL].EmitNoIndirectInput = false;
    compiler->glsl_compiler_options[MESA_SHADER_TESS_EVAL].EmitNoIndirectInput = false;
+   compiler->glsl_compiler_options[MESA_SHADER_TESS_CTRL].EmitNoIndirectOutput = false;
 
    if (compiler->scalar_stage[MESA_SHADER_GEOMETRY])
       compiler->glsl_compiler_options[MESA_SHADER_GEOMETRY].EmitNoIndirectInput = false;
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 78f7d40a84e..4b6aa678d54 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -1757,6 +1757,19 @@ fs_visitor::assign_vs_urb_setup()
    }
 }
 
+void
+fs_visitor::assign_tcs_single_patch_urb_setup()
+{
+   assert(stage == MESA_SHADER_TESS_CTRL);
+
+   /* Rewrite all ATTR file references to HW_REGs.  That is the only work
+    * done here, so no prog_data lookup is needed.
+    */
+   foreach_block_and_inst(block, fs_inst, inst, cfg) {
+      convert_attr_sources_to_hw_regs(inst);
+   }
+}
+
 void
 fs_visitor::assign_tes_urb_setup()
 {
@@ -5473,6 +5486,88 @@ fs_visitor::run_vs(gl_clip_plane *clip_planes)
    return !failed;
 }
 
+bool
+fs_visitor::run_tcs_single_patch()
+{
+   assert(stage == MESA_SHADER_TESS_CTRL);
+   /* Build, optimize and register-allocate the SIMD8 single-patch TCS. */
+   struct brw_tcs_prog_data *tcs_prog_data =
+      (struct brw_tcs_prog_data *) prog_data;
+
+   /* r1-r4 contain the ICP handles, so the payload spans 5 registers. */
+   payload.num_regs = 5;
+
+   if (shader_time_index >= 0)
+      emit_shader_time_begin();
+
+   /* Initialize gl_InvocationID: start from the channel numbers 0..7 */
+   fs_reg channels_uw = bld.vgrf(BRW_REGISTER_TYPE_UW);
+   fs_reg channels_ud = bld.vgrf(BRW_REGISTER_TYPE_UD);
+   bld.MOV(channels_uw, fs_reg(brw_imm_uv(0x76543210)));
+   bld.MOV(channels_ud, channels_uw);
+
+   if (tcs_prog_data->instances == 1) {
+      invocation_id = channels_ud;
+   } else {
+      invocation_id = bld.vgrf(BRW_REGISTER_TYPE_UD);
+
+      /* Get instance number from g0.2 bits 23:17, and multiply it by 8. */
+      fs_reg t = bld.vgrf(BRW_REGISTER_TYPE_UD);
+      fs_reg instance_times_8 = bld.vgrf(BRW_REGISTER_TYPE_UD);
+      bld.AND(t, fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD)),
+              brw_imm_ud(INTEL_MASK(23, 17)));
+      bld.SHR(instance_times_8, t, brw_imm_ud(17 - 3)); /* >> 17, << 3 */
+
+      bld.ADD(invocation_id, instance_times_8, channels_ud);
+   }
+
+   /* Fix the dispatch mask: skip channels past the last output vertex */
+   if (nir->info.tcs.vertices_out % 8) {
+      bld.CMP(bld.null_reg_ud(), invocation_id,
+              brw_imm_ud(nir->info.tcs.vertices_out), BRW_CONDITIONAL_L);
+      bld.IF(BRW_PREDICATE_NORMAL);
+   }
+
+   emit_nir_code();
+
+   if (nir->info.tcs.vertices_out % 8) {
+      bld.emit(BRW_OPCODE_ENDIF);
+   }
+
+   /* Emit EOT write; set TR DS Cache bit */
+   fs_reg srcs[3] = {
+      fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
+      fs_reg(brw_imm_ud(WRITEMASK_X << 16)),
+      fs_reg(brw_imm_ud(0)),
+   };
+   fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 3); /* shadows member */
+   bld.LOAD_PAYLOAD(payload, srcs, 3, 2);
+
+   fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_SIMD8_MASKED,
+                            bld.null_reg_ud(), payload);
+   inst->mlen = 3;
+   inst->base_mrf = -1;
+   inst->eot = true;
+
+   if (shader_time_index >= 0)
+      emit_shader_time_end();
+
+   if (failed)
+      return false;
+
+   calculate_cfg();
+
+   optimize();
+
+   assign_curb_setup();
+   assign_tcs_single_patch_urb_setup();
+
+   fixup_3src_null_dest();
+   allocate_registers();
+
+   return !failed;
+}
+
 bool
 fs_visitor::run_tes()
 {
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index a5c3297e5a1..ba6bd3f5725 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -110,6 +110,7 @@ public:
 
    bool run_fs(bool do_rep_send);
    bool run_vs(gl_clip_plane *clip_planes);
+   bool run_tcs_single_patch();
    bool run_tes();
    bool run_gs();
    bool run_cs();
@@ -126,6 +127,7 @@ public:
    void assign_urb_setup();
    void convert_attr_sources_to_hw_regs(fs_inst *inst);
    void assign_vs_urb_setup();
+   void assign_tcs_single_patch_urb_setup();
    void assign_tes_urb_setup();
    void assign_gs_urb_setup();
    bool assign_regs(bool allow_spilling);
@@ -250,6 +252,8 @@ public:
                        nir_ssa_undef_instr *instr);
    void nir_emit_vs_intrinsic(const brw::fs_builder &bld,
                               nir_intrinsic_instr *instr);
+   void nir_emit_tcs_intrinsic(const brw::fs_builder &bld,
+                               nir_intrinsic_instr *instr);
    void nir_emit_gs_intrinsic(const brw::fs_builder &bld,
                               nir_intrinsic_instr *instr);
    void nir_emit_fs_intrinsic(const brw::fs_builder &bld,
@@ -405,6 +409,7 @@ public:
    fs_reg userplane[MAX_CLIP_PLANES];
    fs_reg final_gs_vertex_count;
    fs_reg control_data_bits;
+   fs_reg invocation_id;
 
    unsigned grf_used;
    bool spilled_any_registers;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 360e2c97d74..4d14fda4a86 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -113,6 +113,9 @@ fs_visitor::nir_setup_single_output_varying(fs_reg *reg,
 void
 fs_visitor::nir_setup_outputs()
 {
+   if (stage == MESA_SHADER_TESS_CTRL)
+      return;
+
    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
 
    nir_outputs = bld.vgrf(BRW_REGISTER_TYPE_F, nir->num_outputs);
@@ -230,6 +233,8 @@ emit_system_values_block(nir_block *block, fs_visitor *v)
          break;
 
       case nir_intrinsic_load_invocation_id:
+         if (v->stage == MESA_SHADER_TESS_CTRL)
+            break;
          assert(v->stage == MESA_SHADER_GEOMETRY);
          reg = &v->nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
          if (reg->file == BAD_FILE) {
@@ -452,6 +457,9 @@ fs_visitor::nir_emit_instr(nir_instr *instr)
       case MESA_SHADER_VERTEX:
          nir_emit_vs_intrinsic(abld, nir_instr_as_intrinsic(instr));
          break;
+      case MESA_SHADER_TESS_CTRL:
+         nir_emit_tcs_intrinsic(abld, nir_instr_as_intrinsic(instr));
+         break;
       case MESA_SHADER_TESS_EVAL:
          nir_emit_tes_intrinsic(abld, nir_instr_as_intrinsic(instr));
          break;
@@ -1900,6 +1908,354 @@ fs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld,
    }
 }
 
+void
+fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
+                                   nir_intrinsic_instr *instr)
+{
+   assert(stage == MESA_SHADER_TESS_CTRL);
+   struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key;
+   struct brw_tcs_prog_data *tcs_prog_data =
+      (struct brw_tcs_prog_data *) prog_data;
+   /* Handle TCS-specific intrinsics; others go to nir_emit_intrinsic(). */
+   fs_reg dst;
+   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
+      dst = get_nir_dest(instr->dest);
+
+   switch (instr->intrinsic) {
+   case nir_intrinsic_load_primitive_id:
+      bld.MOV(dst, fs_reg(brw_vec1_grf(0, 1)));
+      break;
+   case nir_intrinsic_load_invocation_id:
+      bld.MOV(retype(dst, invocation_id.type), invocation_id);
+      break;
+   case nir_intrinsic_load_patch_vertices_in:
+      bld.MOV(retype(dst, BRW_REGISTER_TYPE_D),
+              brw_imm_d(tcs_key->input_vertices));
+      break;
+
+   case nir_intrinsic_barrier: {
+      if (tcs_prog_data->instances == 1)
+         break;
+
+      fs_reg m0 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+      fs_reg m0_2 = byte_offset(m0, 2 * sizeof(uint32_t));
+
+      const fs_builder fwa_bld = bld.exec_all();
+
+      /* Zero the message header */
+      fwa_bld.MOV(m0, brw_imm_ud(0u));
+
+      /* Copy "Barrier ID" from r0.2, bits 16:13 */
+      fwa_bld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
+                  brw_imm_ud(INTEL_MASK(16, 13)));
+
+      /* Shift it up to bits 27:24. */
+      fwa_bld.SHL(m0_2, m0_2, brw_imm_ud(11));
+
+      /* Set Barrier Count and enable bit.  NOTE(review): check shift vs PRM */
+      fwa_bld.OR(m0_2, m0_2,
+                 brw_imm_ud(tcs_prog_data->instances << 8 | (1 << 15)));
+
+      bld.emit(SHADER_OPCODE_BARRIER, bld.null_reg_ud(), m0);
+      break;
+   }
+
+   case nir_intrinsic_load_input:
+      unreachable("nir_lower_io should never give us these.");
+      break;
+
+   case nir_intrinsic_load_per_vertex_input: {
+      fs_reg indirect_offset = get_indirect_offset(instr);
+      unsigned imm_offset = instr->const_index[0];
+
+      const nir_src &vertex_src = instr->src[0];
+      nir_const_value *vertex_const = nir_src_as_const_value(vertex_src);
+
+      fs_inst *inst;
+
+      fs_reg icp_handle;
+
+      if (vertex_const) {
+         /* Emit a MOV to resolve <0,1,0> regioning. */
+         icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+         bld.MOV(icp_handle,
+                 retype(brw_vec1_grf(1 + (vertex_const->i32[0] >> 3),
+                                     vertex_const->i32[0] & 7),
+                        BRW_REGISTER_TYPE_UD));
+      } else if (tcs_prog_data->instances == 1 &&
+                 vertex_src.is_ssa &&
+                 vertex_src.ssa->parent_instr->type == nir_instr_type_intrinsic &&
+                 nir_instr_as_intrinsic(vertex_src.ssa->parent_instr)->intrinsic == nir_intrinsic_load_invocation_id) {
+         /* For the common case of only 1 instance, an array index of
+          * gl_InvocationID means reading g1.  Skip all the indirect work.
+          */
+         icp_handle = retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD);
+      } else {
+         /* The vertex index is non-constant.  We need to use indirect
+          * addressing to fetch the proper URB handle.
+          */
+         icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+
+         /* Each ICP handle is a single DWord (4 bytes) */
+         fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+         bld.SHL(vertex_offset_bytes,
+                 retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
+                 brw_imm_ud(2u));
+
+         /* Start at g1.  We might read up to 4 registers. */
+         bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
+                  fs_reg(brw_vec8_grf(1, 0)), vertex_offset_bytes,
+                  brw_imm_ud(4 * REG_SIZE));
+      }
+
+      if (indirect_offset.file == BAD_FILE) {
+         /* Constant indexing - use global offset. */
+         inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, icp_handle);
+         inst->offset = imm_offset;
+         inst->mlen = 1;
+         inst->base_mrf = -1;
+         inst->regs_written = instr->num_components;
+      } else {
+         /* Indirect indexing - use per-slot offsets as well. */
+         const fs_reg srcs[] = { icp_handle, indirect_offset };
+         fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+         bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
+
+         inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst, payload);
+         inst->offset = imm_offset;
+         inst->base_mrf = -1;
+         inst->mlen = 2;
+         inst->regs_written = instr->num_components;
+      }
+
+      /* Copy the temporary to the destination to deal with writemasking.
+       *
+       * Also attempt to deal with gl_PointSize being in the .w component.
+       */
+      if (inst->offset == 0 && indirect_offset.file == BAD_FILE) {
+         inst->dst = bld.vgrf(dst.type, 4);
+         inst->regs_written = 4;
+         bld.MOV(dst, offset(inst->dst, bld, 3));
+      }
+      break;
+   }
+
+   case nir_intrinsic_load_output:
+   case nir_intrinsic_load_per_vertex_output: {
+      fs_reg indirect_offset = get_indirect_offset(instr);
+      unsigned imm_offset = instr->const_index[0];
+
+      fs_inst *inst;
+      if (indirect_offset.file == BAD_FILE) {
+         /* Replicate the patch handle to all enabled channels */
+         fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+         bld.MOV(patch_handle,
+                 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD));
+
+         if (imm_offset == 0) {
+            /* This is a read of gl_TessLevelInner[], which lives in the
+             * Patch URB header.  The layout depends on the domain.
+             */
+            dst.type = BRW_REGISTER_TYPE_F;
+            switch (tcs_key->tes_primitive_mode) {
+            case GL_QUADS: {
+               /* DWords 3-2 (reversed) */
+               fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
+
+               inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, patch_handle);
+               inst->offset = 0;
+               inst->mlen = 1;
+               inst->base_mrf = -1;
+               inst->regs_written = 4;
+
+               /* dst.xy = tmp.wz */
+               bld.MOV(dst,                 offset(tmp, bld, 3));
+               bld.MOV(offset(dst, bld, 1), offset(tmp, bld, 2));
+               break;
+            }
+            case GL_TRIANGLES:
+               /* DWord 4; hardcode offset = 1 and regs_written = 1 */
+               inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, patch_handle);
+               inst->offset = 1;
+               inst->mlen = 1;
+               inst->base_mrf = -1;
+               inst->regs_written = 1;
+               break;
+            case GL_ISOLINES:
+               /* All channels are undefined. */
+               break;
+            default:
+               unreachable("Bogus tessellation domain");
+            }
+         } else if (imm_offset == 1) {
+            /* This is a read of gl_TessLevelOuter[], which lives in the
+             * Patch URB header.  The layout depends on the domain.
+             */
+            dst.type = BRW_REGISTER_TYPE_F;
+
+            fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
+            inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, patch_handle);
+            inst->offset = 1;
+            inst->mlen = 1;
+            inst->base_mrf = -1;
+            inst->regs_written = 4;
+
+            /* Reswizzle: WZYX */
+            fs_reg srcs[4] = {
+               offset(tmp, bld, 3),
+               offset(tmp, bld, 2),
+               offset(tmp, bld, 1),
+               offset(tmp, bld, 0),
+            };
+
+            unsigned num_components;
+            switch (tcs_key->tes_primitive_mode) {
+            case GL_QUADS:
+               num_components = 4;
+               break;
+            case GL_TRIANGLES:
+               num_components = 3;
+               break;
+            case GL_ISOLINES:
+               /* Isolines are not reversed; swizzle .zw -> .xy */
+               srcs[0] = offset(tmp, bld, 2);
+               srcs[1] = offset(tmp, bld, 3);
+               num_components = 2;
+               break;
+            default:
+               unreachable("Bogus tessellation domain");
+            }
+            bld.LOAD_PAYLOAD(dst, srcs, num_components, 0);
+         } else {
+            inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, patch_handle);
+            inst->offset = imm_offset;
+            inst->mlen = 1;
+            inst->base_mrf = -1;
+            inst->regs_written = instr->num_components;
+         }
+      } else {
+         /* Indirect indexing - use per-slot offsets as well. */
+         const fs_reg srcs[] = {
+            retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
+            indirect_offset
+         };
+         fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+         bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
+
+         inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst, payload);
+         inst->offset = imm_offset;
+         inst->mlen = 2;
+         inst->base_mrf = -1;
+         inst->regs_written = instr->num_components;
+      }
+      break;
+   }
+
+   case nir_intrinsic_store_output:
+   case nir_intrinsic_store_per_vertex_output: {
+      fs_reg value = get_nir_src(instr->src[0]);
+      fs_reg indirect_offset = get_indirect_offset(instr);
+      unsigned imm_offset = instr->const_index[0];
+      unsigned swiz = BRW_SWIZZLE_XYZW;
+      unsigned mask = instr->const_index[1];
+      unsigned header_regs = 0;
+      fs_reg srcs[7]; /* presumably <= 3 header + 4 component regs */
+      srcs[header_regs++] = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD);
+
+      if (indirect_offset.file != BAD_FILE) {
+         srcs[header_regs++] = indirect_offset;
+      } else if (tcs_key->program_string_id != 0) {
+         if (imm_offset == 0) {
+            value.type = BRW_REGISTER_TYPE_F;
+
+            mask &= (1 << tesslevel_inner_components(tcs_key->tes_primitive_mode)) - 1;
+
+            /* This is a write to gl_TessLevelInner[], which lives in the
+             * Patch URB header.  The layout depends on the domain.
+             */
+            switch (tcs_key->tes_primitive_mode) {
+            case GL_QUADS:
+               /* gl_TessLevelInner[].xy lives at DWords 3-2 (reversed).
+                * We use an XXYX swizzle to put .xy in the .wz channels
+                * (i.e. reversed), and use a .zw writemask.
+                */
+               mask = writemask_for_backwards_vector(mask);
+               swiz = BRW_SWIZZLE4(0, 0, 1, 0);
+               break;
+            case GL_TRIANGLES:
+               /* gl_TessLevelInner[].x lives at DWord 4, so we set the
+                * writemask to X and bump the URB offset by 1.
+                */
+               imm_offset = 1;
+               break;
+            case GL_ISOLINES:
+               /* Skip; gl_TessLevelInner[] doesn't exist for isolines. */
+               return;
+            default:
+               unreachable("Bogus tessellation domain");
+            }
+         } else if (imm_offset == 1) {
+            /* This is a write to gl_TessLevelOuter[] which lives in the
+             * Patch URB Header at DWords 4-7.  However, it's reversed, so
+             * instead of .xyzw we have .wzyx.
+             */
+            value.type = BRW_REGISTER_TYPE_F;
+
+            mask &= (1 << tesslevel_outer_components(tcs_key->tes_primitive_mode)) - 1;
+
+            if (tcs_key->tes_primitive_mode == GL_ISOLINES) {
+               /* Isolines .xy should be stored in .zw, in order. */
+               swiz = BRW_SWIZZLE4(0, 0, 0, 1);
+               mask <<= 2;
+            } else {
+               /* Other domains are reversed; store .wzyx instead of .xyzw */
+               swiz = BRW_SWIZZLE_WZYX;
+               mask = writemask_for_backwards_vector(mask);
+            }
+         }
+      }
+
+      if (mask == 0)
+         break;
+
+      unsigned num_components = _mesa_fls(mask);
+      enum opcode opcode;
+
+      if (mask != WRITEMASK_XYZW) {
+         srcs[header_regs++] = brw_imm_ud(mask << 16);
+         opcode = indirect_offset.file != BAD_FILE ?
+            SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT :
+            SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
+      } else {
+         opcode = indirect_offset.file != BAD_FILE ?
+            SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT :
+            SHADER_OPCODE_URB_WRITE_SIMD8;
+      }
+
+      for (unsigned i = 0; i < num_components; i++) {
+         if (mask & (1 << i))
+            srcs[header_regs + i] = offset(value, bld, BRW_GET_SWZ(swiz, i));
+      }
+
+      unsigned mlen = header_regs + num_components;
+
+      fs_reg payload =
+         bld.vgrf(BRW_REGISTER_TYPE_UD, mlen);
+      bld.LOAD_PAYLOAD(payload, srcs, mlen, header_regs);
+
+      fs_inst *inst = bld.emit(opcode, bld.null_reg_ud(), payload);
+      inst->offset = imm_offset;
+      inst->mlen = mlen;
+      inst->base_mrf = -1;
+      break;
+   }
+
+   default:
+      nir_emit_intrinsic(bld, instr);
+      break;
+   }
+}
+
 void
 fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld,
                                    nir_intrinsic_instr *instr)
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index da29f0be8b4..f0b99688dec 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -1015,6 +1015,9 @@ fs_visitor::init()
    case MESA_SHADER_VERTEX:
       key_tex = &((const brw_vs_prog_key *) key)->tex;
       break;
+   case MESA_SHADER_TESS_CTRL:
+      key_tex = &((const brw_tcs_prog_key *) key)->tex;
+      break;
    case MESA_SHADER_TESS_EVAL:
       key_tex = &((const brw_tes_prog_key *) key)->tex;
       break;
diff --git a/src/mesa/drivers/dri/i965/brw_tcs.c b/src/mesa/drivers/dri/i965/brw_tcs.c
index 0117ffe3589..98ed2b253a6 100644
--- a/src/mesa/drivers/dri/i965/brw_tcs.c
+++ b/src/mesa/drivers/dri/i965/brw_tcs.c
@@ -214,7 +214,8 @@ brw_codegen_tcs_prog(struct brw_context *brw,
       prog_data.base.base.nr_image_params = tcs->NumImages;
 
       brw_nir_setup_glsl_uniforms(nir, shader_prog, &tcp->program.Base,
-                                  &prog_data.base.base, false);
+                                  &prog_data.base.base,
+                                  compiler->scalar_stage[MESA_SHADER_TESS_CTRL]);
    } else {
       /* Upload the Patch URB Header as the first two uniforms.
        * Do the annoying scrambling so the shader doesn't have to.
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
index 5096f135124..4da30b9c47d 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
@@ -29,6 +29,7 @@
 
 #include "brw_nir.h"
 #include "brw_vec4_tcs.h"
+#include "brw_fs.h"
 
 namespace brw {
 
@@ -452,7 +453,10 @@ brw_compile_tcs(const struct brw_compiler *compiler,
    brw_nir_lower_tcs_outputs(nir, &vue_prog_data->vue_map);
    nir = brw_postprocess_nir(nir, compiler->devinfo, is_scalar);
 
-   prog_data->instances = DIV_ROUND_UP(nir->info.tcs.vertices_out, 2);
+   if (is_scalar)
+      prog_data->instances = DIV_ROUND_UP(nir->info.tcs.vertices_out, 8);
+   else
+      prog_data->instances = DIV_ROUND_UP(nir->info.tcs.vertices_out, 2);
 
    /* Compute URB entry size.  The maximum allowed URB entry size is 32k.
     * That divides up as follows:
@@ -493,20 +497,49 @@ brw_compile_tcs(const struct brw_compiler *compiler,
       brw_print_vue_map(stderr, &vue_prog_data->vue_map);
    }
 
-   vec4_tcs_visitor v(compiler, log_data, key, prog_data,
-                      nir, mem_ctx, shader_time_index, &input_vue_map);
-   if (!v.run()) {
-      if (error_str)
-         *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
-      return NULL;
-   }
+   if (is_scalar) {
+      fs_visitor v(compiler, log_data, mem_ctx, (void *) key,
+                   &prog_data->base.base, NULL, nir, 8,
+                   shader_time_index, &input_vue_map);
+      if (!v.run_tcs_single_patch()) {
+         if (error_str)
+            *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
+         return NULL;
+      }
 
-   if (unlikely(INTEL_DEBUG & DEBUG_TCS))
-      v.dump_instructions();
+      prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;
+
+      fs_generator g(compiler, log_data, mem_ctx, (void *) key,
+                     &prog_data->base.base, v.promoted_constants, false,
+                     MESA_SHADER_TESS_CTRL);
+      if (unlikely(INTEL_DEBUG & DEBUG_TCS)) {
+         g.enable_debug(ralloc_asprintf(mem_ctx,
+                                        "%s tessellation control shader %s",
+                                        nir->info.label ? nir->info.label
+                                                        : "unnamed",
+                                        nir->info.name));
+      }
+
+      g.generate_code(v.cfg, 8);
 
-   return brw_vec4_generate_assembly(compiler, log_data, mem_ctx, nir,
-                                     &prog_data->base, v.cfg,
-                                     final_assembly_size);
+      return g.get_assembly(final_assembly_size);
+   } else {
+      vec4_tcs_visitor v(compiler, log_data, key, prog_data,
+                         nir, mem_ctx, shader_time_index, &input_vue_map);
+      if (!v.run()) {
+         if (error_str)
+            *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
+         return NULL;
+      }
+
+      if (unlikely(INTEL_DEBUG & DEBUG_TCS))
+         v.dump_instructions();
+
+
+      return brw_vec4_generate_assembly(compiler, log_data, mem_ctx, nir,
+                                        &prog_data->base, v.cfg,
+                                        final_assembly_size);
+   }
 }