From 646924cfa1bb51c9091a8cf0079f00759cbea372 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Fri, 3 May 2019 14:57:54 -0700
Subject: [PATCH] intel/compiler: Implement TCS 8_PATCH mode and
 INTEL_DEBUG=tcs8

Our tessellation control shaders can be dispatched in several modes.

- SINGLE_PATCH (Gen7+) processes a single patch per thread, with each
  channel corresponding to a different patch vertex.  PATCHLIST_N will
  launch (N / 8) threads.  If N is less than 8, some channels will be
  disabled, leaving some untapped hardware capabilities.  Conditionals
  based on gl_InvocationID are non-uniform, which means that they'll
  often have to execute both paths.  However, if there are fewer than
  8 vertices, all invocations will happen within a single thread, so
  barriers can become no-ops, which is nice.  We also burn a maximum
  of 4 registers for ICP handles, so we can compile without regard for
  the value of N.  It also works in all cases.

- DUAL_PATCH mode processes up to two patches at a time, where the first
  four channels come from patch 1, and the second group of four come
  from patch 2.  This tries to provide better EU utilization for small
  patches (N <= 4).  It cannot be used in all cases.

- 8_PATCH mode processes 8 patches at a time, with a thread launched per
  vertex in the patch.  Each channel corresponds to the same vertex, but
  in each of the 8 patches.  This utilizes all channels even for small
  patches.  It also makes conditions on gl_InvocationID uniform, leading
  to proper jumps.  Barriers, unfortunately, become real.  Worse, for
  PATCHLIST_N, the thread payload burns N registers for ICP handles.
  This can burn up to 32 registers, or 1/4 of our register file, for
  URB handles.  For Vulkan (and DX), we know the number of vertices at
  compile time, so we can limit the amount of waste.  In GL, the patch
  dimension is dynamic state, so we either would have to waste all 32
  (not reasonable) or guess (badly) and recompile.  This is unfortunate.
  Because we can only spawn 16 thread instances, we can only use this
  mode for PATCHLIST_16 and smaller.  The rest must use SINGLE_PATCH.

This patch implements the new 8_PATCH TCS mode, but leaves us using
SINGLE_PATCH by default.  A new INTEL_DEBUG=tcs8 flag will switch to
using 8_PATCH mode for testing and benchmarking purposes.  We may
want to consider using 8_PATCH mode in Vulkan in some cases.

The data I've seen shows that 8_PATCH mode can be more efficient in
some cases, but SINGLE_PATCH mode (the one we use today) is faster
in other cases.  Ultimately, the TES matters much more than the TCS
for performance, so the decision may not matter much.

Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
---
 src/gallium/drivers/iris/iris_program.c       |  9 ++
 src/gallium/drivers/iris/iris_state.c         |  5 +
 src/intel/compiler/brw_compiler.c             |  3 +
 src/intel/compiler/brw_compiler.h             |  7 ++
 src/intel/compiler/brw_fs.cpp                 | 53 +++++++---
 src/intel/compiler/brw_fs.h                   |  7 +-
 src/intel/compiler/brw_fs_nir.cpp             | 96 +++++++++++++++++--
 src/intel/compiler/brw_vec4_tcs.cpp           | 27 ++++--
 src/intel/dev/gen_debug.c                     |  1 +
 src/intel/dev/gen_debug.h                     |  4 +-
 src/intel/vulkan/genX_pipeline.c              |  5 +
 src/mesa/drivers/dri/i965/brw_tcs.c           |  5 +-
 src/mesa/drivers/dri/i965/genX_state_upload.c |  5 +
 13 files changed, 193 insertions(+), 34 deletions(-)

diff --git a/src/gallium/drivers/iris/iris_program.c b/src/gallium/drivers/iris/iris_program.c
index 30ec3f1ff86..d5c5a32bbc4 100644
--- a/src/gallium/drivers/iris/iris_program.c
+++ b/src/gallium/drivers/iris/iris_program.c
@@ -1526,6 +1526,7 @@ iris_create_tcs_state(struct pipe_context *ctx,
 {
    struct iris_context *ice = (void *) ctx;
    struct iris_screen *screen = (void *) ctx->screen;
+   const struct brw_compiler *compiler = screen->compiler;
    struct iris_uncompiled_shader *ish = iris_create_shader_state(ctx, state);
    struct shader_info *info = &ish->nir->info;
 
@@ -1544,6 +1545,14 @@ iris_create_tcs_state(struct pipe_context *ctx,
          .patch_outputs_written = info->patch_outputs_written,
       };
 
+      /* 8_PATCH mode needs the key to contain the input patch dimensionality.
+       * We don't have that information, so we randomly guess that the input
+       * and output patches are the same size.  This is a bad guess, but we
+       * can't do much better.
+       */
+      if (compiler->use_tcs_8_patch)
+         key.input_vertices = info->tess.tcs_vertices_out;
+
       iris_compile_tcs(ice, ish, &key);
    }
 
diff --git a/src/gallium/drivers/iris/iris_state.c b/src/gallium/drivers/iris/iris_state.c
index a9af1cd0dc4..f48bacf77e1 100644
--- a/src/gallium/drivers/iris/iris_state.c
+++ b/src/gallium/drivers/iris/iris_state.c
@@ -3651,6 +3651,11 @@ iris_store_tcs_state(struct iris_context *ice,
       hs.InstanceCount = tcs_prog_data->instances - 1;
       hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1;
       hs.IncludeVertexHandles = true;
+
+#if GEN_GEN >= 9
+      hs.DispatchMode = vue_prog_data->dispatch_mode;
+      hs.IncludePrimitiveID = tcs_prog_data->include_primitive_id;
+#endif
    }
 }
 
diff --git a/src/intel/compiler/brw_compiler.c b/src/intel/compiler/brw_compiler.c
index 626ff4bb16a..6a41cd20270 100644
--- a/src/intel/compiler/brw_compiler.c
+++ b/src/intel/compiler/brw_compiler.c
@@ -99,6 +99,9 @@ brw_compiler_create(void *mem_ctx, const struct gen_device_info *devinfo)
 
    compiler->precise_trig = env_var_as_boolean("INTEL_PRECISE_TRIG", false);
 
+   compiler->use_tcs_8_patch =
+      devinfo->gen >= 9 && (INTEL_DEBUG & DEBUG_TCS_EIGHT_PATCH);
+
    if (devinfo->gen >= 10) {
       /* We don't support vec4 mode on Cannonlake. */
       for (int i = MESA_SHADER_VERTEX; i < MESA_SHADER_STAGES; i++)
diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h
index 332789d7809..8c6ae35636b 100644
--- a/src/intel/compiler/brw_compiler.h
+++ b/src/intel/compiler/brw_compiler.h
@@ -93,6 +93,7 @@ struct brw_compiler {
    void (*shader_perf_log)(void *, const char *str, ...) PRINTFLIKE(2, 3);
 
    bool scalar_stage[MESA_SHADER_STAGES];
+   bool use_tcs_8_patch;
    struct gl_shader_compiler_options glsl_compiler_options[MESA_SHADER_STAGES];
 
    /**
@@ -1002,6 +1003,9 @@ enum shader_dispatch_mode {
    DISPATCH_MODE_4X2_DUAL_INSTANCE = 1,
    DISPATCH_MODE_4X2_DUAL_OBJECT = 2,
    DISPATCH_MODE_SIMD8 = 3,
+
+   DISPATCH_MODE_TCS_SINGLE_PATCH = 0,
+   DISPATCH_MODE_TCS_8_PATCH = 2,
 };
 
 /**
@@ -1074,6 +1078,9 @@ struct brw_tcs_prog_data
 {
    struct brw_vue_prog_data base;
 
+   /** Should the non-SINGLE_PATCH payload provide primitive ID? */
+   bool include_primitive_id;
+
    /** Number vertices in output patch */
    int instances;
 };
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 9d05800d1fe..9b4e030b54f 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -1805,7 +1805,7 @@ fs_visitor::assign_vs_urb_setup()
 }
 
 void
-fs_visitor::assign_tcs_single_patch_urb_setup()
+fs_visitor::assign_tcs_urb_setup()
 {
    assert(stage == MESA_SHADER_TESS_CTRL);
 
@@ -7396,12 +7396,28 @@ void
 fs_visitor::set_tcs_invocation_id()
 {
    struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
+   struct brw_vue_prog_data *vue_prog_data = &tcs_prog_data->base;
 
    const unsigned instance_id_mask =
       devinfo->gen >= 11 ? INTEL_MASK(22, 16) : INTEL_MASK(23, 17);
    const unsigned instance_id_shift =
       devinfo->gen >= 11 ? 16 : 17;
 
+   /* Get instance number from g0.2 bits 22:16 or 23:17 */
+   fs_reg t = bld.vgrf(BRW_REGISTER_TYPE_UD);
+   bld.AND(t, fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD)),
+           brw_imm_ud(instance_id_mask));
+
+   invocation_id = bld.vgrf(BRW_REGISTER_TYPE_UD);
+
+   if (vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_8_PATCH) {
+      /* gl_InvocationID is just the thread number */
+      bld.SHR(invocation_id, t, brw_imm_ud(instance_id_shift));
+      return;
+   }
+
+   assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_SINGLE_PATCH);
+
    fs_reg channels_uw = bld.vgrf(BRW_REGISTER_TYPE_UW);
    fs_reg channels_ud = bld.vgrf(BRW_REGISTER_TYPE_UD);
    bld.MOV(channels_uw, fs_reg(brw_imm_uv(0x76543210)));
@@ -7410,26 +7426,36 @@ fs_visitor::set_tcs_invocation_id()
    if (tcs_prog_data->instances == 1) {
       invocation_id = channels_ud;
    } else {
-      invocation_id = bld.vgrf(BRW_REGISTER_TYPE_UD);
-
-      /* Get instance number from g0.2 bits 23:17, and multiply it by 8. */
-      fs_reg t = bld.vgrf(BRW_REGISTER_TYPE_UD);
       fs_reg instance_times_8 = bld.vgrf(BRW_REGISTER_TYPE_UD);
-      bld.AND(t, fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD)),
-              brw_imm_ud(instance_id_mask));
       bld.SHR(instance_times_8, t, brw_imm_ud(instance_id_shift - 3));
-
       bld.ADD(invocation_id, instance_times_8, channels_ud);
    }
 }
 
 bool
-fs_visitor::run_tcs_single_patch()
+fs_visitor::run_tcs()
 {
    assert(stage == MESA_SHADER_TESS_CTRL);
 
-   /* r1-r4 contain the ICP handles. */
-   payload.num_regs = 5;
+   struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);
+   struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
+   struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key;
+
+   assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_SINGLE_PATCH ||
+          vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_8_PATCH);
+
+   if (vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_SINGLE_PATCH) {
+      /* r1-r4 contain the ICP handles. */
+      payload.num_regs = 5;
+   } else {
+      assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_8_PATCH);
+      assert(tcs_key->input_vertices > 0);
+      /* r1 contains output handles, r2 may contain primitive ID, then the
+       * ICP handles occupy the next 1-32 registers.
+       */
+      payload.num_regs = 2 + tcs_prog_data->include_primitive_id +
+                         tcs_key->input_vertices;
+   }
 
    if (shader_time_index >= 0)
       emit_shader_time_begin();
@@ -7438,6 +7464,7 @@ fs_visitor::run_tcs_single_patch()
    set_tcs_invocation_id();
 
    const bool fix_dispatch_mask =
+      vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_SINGLE_PATCH &&
       (nir->info.tess.tcs_vertices_out % 8) != 0;
 
    /* Fix the disptach mask */
@@ -7455,7 +7482,7 @@ fs_visitor::run_tcs_single_patch()
 
    /* Emit EOT write; set TR DS Cache bit */
    fs_reg srcs[3] = {
-      fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
+      fs_reg(get_tcs_output_urb_handle()),
       fs_reg(brw_imm_ud(WRITEMASK_X << 16)),
       fs_reg(brw_imm_ud(0)),
    };
@@ -7478,7 +7505,7 @@ fs_visitor::run_tcs_single_patch()
    optimize();
 
    assign_curb_setup();
-   assign_tcs_single_patch_urb_setup();
+   assign_tcs_urb_setup();
 
    fixup_3src_null_dest();
    allocate_registers(8, true);
diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index 8ae73401cdf..7db486688af 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -92,7 +92,7 @@ public:
 
    bool run_fs(bool allow_spilling, bool do_rep_send);
    bool run_vs();
-   bool run_tcs_single_patch();
+   bool run_tcs();
    bool run_tes();
    bool run_gs();
    bool run_cs(unsigned min_dispatch_width);
@@ -110,7 +110,7 @@ public:
    void assign_urb_setup();
    void convert_attr_sources_to_hw_regs(fs_inst *inst);
    void assign_vs_urb_setup();
-   void assign_tcs_single_patch_urb_setup();
+   void assign_tcs_urb_setup();
    void assign_tes_urb_setup();
    void assign_gs_urb_setup();
    bool assign_regs(bool allow_spilling, bool spill_all);
@@ -251,6 +251,9 @@ public:
    fs_reg get_indirect_offset(nir_intrinsic_instr *instr);
    fs_reg get_tcs_single_patch_icp_handle(const brw::fs_builder &bld,
                                           nir_intrinsic_instr *instr);
+   fs_reg get_tcs_eight_patch_icp_handle(const brw::fs_builder &bld,
+                                         nir_intrinsic_instr *instr);
+   struct brw_reg get_tcs_output_urb_handle();
 
    void emit_percomp(const brw::fs_builder &bld, const fs_inst &inst,
                      unsigned wr_mask);
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 73e2f38145e..a2c8f3f557f 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -2605,6 +2605,73 @@ fs_visitor::get_tcs_single_patch_icp_handle(const fs_builder &bld,
    return icp_handle;
 }
 
+fs_reg
+fs_visitor::get_tcs_eight_patch_icp_handle(const fs_builder &bld,
+                                           nir_intrinsic_instr *instr)
+{
+   struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key;
+   struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
+   const nir_src &vertex_src = instr->src[0];
+
+   unsigned first_icp_handle = tcs_prog_data->include_primitive_id ? 3 : 2;
+
+   if (nir_src_is_const(vertex_src)) {
+      return fs_reg(retype(brw_vec8_grf(first_icp_handle +
+                                        nir_src_as_uint(vertex_src), 0),
+                           BRW_REGISTER_TYPE_UD));
+   }
+
+   /* The vertex index is non-constant.  We need to use indirect
+    * addressing to fetch the proper URB handle.
+    *
+    * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0>
+    * indicating that channel <n> should read the handle from
+    * DWord <n>.  We convert that to bytes by multiplying by 4.
+    *
+    * Next, we convert the vertex index to bytes by multiplying
+    * by 32 (shifting by 5), and add the two together.  This is
+    * the final indirect byte offset.
+    */
+   fs_reg icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+   fs_reg sequence = bld.vgrf(BRW_REGISTER_TYPE_UW, 1);
+   fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+   fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+   fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+
+   /* sequence = <7, 6, 5, 4, 3, 2, 1, 0> */
+   bld.MOV(sequence, fs_reg(brw_imm_v(0x76543210)));
+   /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */
+   bld.SHL(channel_offsets, sequence, brw_imm_ud(2u));
+   /* Convert vertex_index to bytes (multiply by 32) */
+   bld.SHL(vertex_offset_bytes,
+           retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
+           brw_imm_ud(5u));
+   bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets);
+
+   /* Use first_icp_handle as the base offset.  There is one register
+    * of URB handles per vertex, so inform the register allocator that
+    * we might read up to nir->info.gs.vertices_in registers.
+    */
+   bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
+            retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type),
+            icp_offset_bytes, brw_imm_ud(tcs_key->input_vertices * REG_SIZE));
+
+   return icp_handle;
+}
+
+struct brw_reg
+fs_visitor::get_tcs_output_urb_handle()
+{
+   struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);
+
+   if (vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_SINGLE_PATCH) {
+      return retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD);
+   } else {
+      assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_8_PATCH);
+      return retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD);
+   }
+}
+
 void
 fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
                                    nir_intrinsic_instr *instr)
@@ -2612,6 +2679,10 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
    assert(stage == MESA_SHADER_TESS_CTRL);
    struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key;
    struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
+   struct brw_vue_prog_data *vue_prog_data = &tcs_prog_data->base;
+
+   bool eight_patch =
+      vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_8_PATCH;
 
    fs_reg dst;
    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
@@ -2619,7 +2690,8 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
 
    switch (instr->intrinsic) {
    case nir_intrinsic_load_primitive_id:
-      bld.MOV(dst, fs_reg(brw_vec1_grf(0, 1)));
+      bld.MOV(dst, fs_reg(eight_patch ? brw_vec8_grf(2, 0)
+                                      : brw_vec1_grf(0, 1)));
       break;
    case nir_intrinsic_load_invocation_id:
       bld.MOV(retype(dst, invocation_id.type), invocation_id);
@@ -2675,7 +2747,9 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
       unsigned imm_offset = instr->const_index[0];
       fs_inst *inst;
 
-      fs_reg icp_handle = get_tcs_single_patch_icp_handle(bld, instr);
+      fs_reg icp_handle =
+         eight_patch ? get_tcs_eight_patch_icp_handle(bld, instr)
+                     : get_tcs_single_patch_icp_handle(bld, instr);
 
       /* We can only read two double components with each URB read, so
        * we send two read messages in that case, each one loading up to
@@ -2776,12 +2850,15 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
       unsigned imm_offset = instr->const_index[0];
       unsigned first_component = nir_intrinsic_component(instr);
 
+      struct brw_reg output_handles = get_tcs_output_urb_handle();
+
       fs_inst *inst;
       if (indirect_offset.file == BAD_FILE) {
-         /* Replicate the patch handle to all enabled channels */
+         /* This MOV replicates the output handle to all enabled channels
+          * is SINGLE_PATCH mode.
+          */
          fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
-         bld.MOV(patch_handle,
-                 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD));
+         bld.MOV(patch_handle, output_handles);
 
          {
             if (first_component != 0) {
@@ -2805,10 +2882,7 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
          }
       } else {
          /* Indirect indexing - use per-slot offsets as well. */
-         const fs_reg srcs[] = {
-            retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
-            indirect_offset
-         };
+         const fs_reg srcs[] = { output_handles, indirect_offset };
          fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
          bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
          if (first_component != 0) {
@@ -2842,8 +2916,10 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
       unsigned imm_offset = instr->const_index[0];
       unsigned mask = instr->const_index[1];
       unsigned header_regs = 0;
+      struct brw_reg output_handles = get_tcs_output_urb_handle();
+
       fs_reg srcs[7];
-      srcs[header_regs++] = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD);
+      srcs[header_regs++] = output_handles;
 
       if (indirect_offset.file != BAD_FILE) {
          srcs[header_regs++] = indirect_offset;
diff --git a/src/intel/compiler/brw_vec4_tcs.cpp b/src/intel/compiler/brw_vec4_tcs.cpp
index f0ef8c0dd96..c37f34cbe81 100644
--- a/src/intel/compiler/brw_vec4_tcs.cpp
+++ b/src/intel/compiler/brw_vec4_tcs.cpp
@@ -406,10 +406,26 @@ brw_compile_tcs(const struct brw_compiler *compiler,
 
    nir = brw_postprocess_nir(nir, compiler, is_scalar);
 
-   if (is_scalar)
-      prog_data->instances = DIV_ROUND_UP(nir->info.tess.tcs_vertices_out, 8);
-   else
-      prog_data->instances = DIV_ROUND_UP(nir->info.tess.tcs_vertices_out, 2);
+   bool has_primitive_id =
+      nir->info.system_values_read & (1 << SYSTEM_VALUE_PRIMITIVE_ID);
+
+   if (compiler->use_tcs_8_patch &&
+       nir->info.tess.tcs_vertices_out <= 16 &&
+       2 + has_primitive_id + key->input_vertices <= 31) {
+      /* 3DSTATE_HS imposes two constraints on using 8_PATCH mode.  First,
+       * the "Instance" field limits the number of output vertices to [1, 16].
+       * Secondly, the "Dispatch GRF Start Register for URB Data" field is
+       * limited to [0, 31] - which imposes a limit on the input vertices.
+       */
+      vue_prog_data->dispatch_mode = DISPATCH_MODE_TCS_8_PATCH;
+      prog_data->instances = nir->info.tess.tcs_vertices_out;
+      prog_data->include_primitive_id = has_primitive_id;
+   } else {
+      unsigned verts_per_thread = is_scalar ? 8 : 2;
+      vue_prog_data->dispatch_mode = DISPATCH_MODE_TCS_SINGLE_PATCH;
+      prog_data->instances =
+         DIV_ROUND_UP(nir->info.tess.tcs_vertices_out, verts_per_thread);
+   }
 
    /* Compute URB entry size.  The maximum allowed URB entry size is 32k.
     * That divides up as follows:
@@ -462,14 +478,13 @@ brw_compile_tcs(const struct brw_compiler *compiler,
       fs_visitor v(compiler, log_data, mem_ctx, (void *) key,
                    &prog_data->base.base, NULL, nir, 8,
                    shader_time_index, &input_vue_map);
-      if (!v.run_tcs_single_patch()) {
+      if (!v.run_tcs()) {
          if (error_str)
             *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
          return NULL;
       }
 
       prog_data->base.base.dispatch_grf_start_reg = v.payload.num_regs;
-      prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;
 
       fs_generator g(compiler, log_data, mem_ctx,
                      &prog_data->base.base, v.promoted_constants, false,
diff --git a/src/intel/dev/gen_debug.c b/src/intel/dev/gen_debug.c
index b0d0d1a574a..5fa3d29fa51 100644
--- a/src/intel/dev/gen_debug.c
+++ b/src/intel/dev/gen_debug.c
@@ -86,6 +86,7 @@ static const struct debug_control debug_control[] = {
    { "color",       DEBUG_COLOR },
    { "reemit",      DEBUG_REEMIT },
    { "soft64",      DEBUG_SOFT64 },
+   { "tcs8",        DEBUG_TCS_EIGHT_PATCH },
    { NULL,    0 }
 };
 
diff --git a/src/intel/dev/gen_debug.h b/src/intel/dev/gen_debug.h
index e4dabc67f8d..a6592354a64 100644
--- a/src/intel/dev/gen_debug.h
+++ b/src/intel/dev/gen_debug.h
@@ -84,6 +84,7 @@ extern uint64_t INTEL_DEBUG;
 #define DEBUG_COLOR               (1ull << 40)
 #define DEBUG_REEMIT              (1ull << 41)
 #define DEBUG_SOFT64              (1ull << 42)
+#define DEBUG_TCS_EIGHT_PATCH     (1ull << 43)
 
 /* These flags are not compatible with the disk shader cache */
 #define DEBUG_DISK_CACHE_DISABLE_MASK DEBUG_SHADER_TIME
@@ -91,7 +92,8 @@ extern uint64_t INTEL_DEBUG;
 /* These flags may affect program generation */
 #define DEBUG_DISK_CACHE_MASK \
    (DEBUG_NO16 | DEBUG_NO_DUAL_OBJECT_GS | DEBUG_NO8 |  DEBUG_SPILL_FS | \
-   DEBUG_SPILL_VEC4 | DEBUG_NO_COMPACTION | DEBUG_DO32 | DEBUG_SOFT64)
+   DEBUG_SPILL_VEC4 | DEBUG_NO_COMPACTION | DEBUG_DO32 | DEBUG_SOFT64 | \
+   DEBUG_TCS_EIGHT_PATCH)
 
 #ifdef HAVE_ANDROID_PLATFORM
 #define LOG_TAG "INTEL-MESA"
diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeline.c
index 0b58dce05b0..6b64f7ea8c7 100644
--- a/src/intel/vulkan/genX_pipeline.c
+++ b/src/intel/vulkan/genX_pipeline.c
@@ -1434,6 +1434,11 @@ emit_3dstate_hs_te_ds(struct anv_pipeline *pipeline,
       hs.PerThreadScratchSpace = get_scratch_space(tcs_bin);
       hs.ScratchSpaceBasePointer =
          get_scratch_address(pipeline, MESA_SHADER_TESS_CTRL, tcs_bin);
+
+#if GEN_GEN >= 9
+      hs.DispatchMode = tcs_prog_data->base.dispatch_mode;
+      hs.IncludePrimitiveID = tcs_prog_data->include_primitive_id;
+#endif
    }
 
    const VkPipelineTessellationDomainOriginStateCreateInfo *domain_origin_state =
diff --git a/src/mesa/drivers/dri/i965/brw_tcs.c b/src/mesa/drivers/dri/i965/brw_tcs.c
index 1c4d7988638..1050850bb1c 100644
--- a/src/mesa/drivers/dri/i965/brw_tcs.c
+++ b/src/mesa/drivers/dri/i965/brw_tcs.c
@@ -160,6 +160,7 @@ brw_tcs_populate_key(struct brw_context *brw,
                      struct brw_tcs_prog_key *key)
 {
    const struct gen_device_info *devinfo = &brw->screen->devinfo;
+   const struct brw_compiler *compiler = brw->screen->compiler;
    struct brw_program *tcp =
       (struct brw_program *) brw->programs[MESA_SHADER_TESS_CTRL];
    struct brw_program *tep =
@@ -177,7 +178,7 @@ brw_tcs_populate_key(struct brw_context *brw,
       per_patch_slots |= prog->info.patch_outputs_written;
    }
 
-   if (devinfo->gen < 8 || !tcp)
+   if (devinfo->gen < 8 || !tcp || compiler->use_tcs_8_patch)
       key->input_vertices = brw->ctx.TessCtrlProgram.patch_vertices;
    key->outputs_written = per_vertex_slots;
    key->patch_outputs_written = per_patch_slots;
@@ -251,7 +252,7 @@ brw_tcs_populate_default_key(const struct brw_compiler *compiler,
    brw_setup_tex_for_precompile(devinfo, &key->tex, prog);
 
    /* Guess that the input and output patches have the same dimensionality. */
-   if (devinfo->gen < 8)
+   if (devinfo->gen < 8 || compiler->use_tcs_8_patch)
       key->input_vertices = prog->info.tess.tcs_vertices_out;
 
    if (tes) {
diff --git a/src/mesa/drivers/dri/i965/genX_state_upload.c b/src/mesa/drivers/dri/i965/genX_state_upload.c
index ecffa2e8e86..961306b04fd 100644
--- a/src/mesa/drivers/dri/i965/genX_state_upload.c
+++ b/src/mesa/drivers/dri/i965/genX_state_upload.c
@@ -4111,6 +4111,11 @@ genX(upload_hs_state)(struct brw_context *brw)
          hs.IncludeVertexHandles = true;
 
          hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1;
+
+#if GEN_GEN >= 9
+         hs.DispatchMode = vue_prog_data->dispatch_mode;
+         hs.IncludePrimitiveID = tcs_prog_data->include_primitive_id;
+#endif
       }
    }
 }
-- 
2.30.2