From 4a1c8a3037cd29938b2a6e2c680c341e9903cfbe Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Sun, 27 Dec 2015 17:26:30 -0800
Subject: [PATCH] i965: Push most TES inputs in SIMD8 mode.

Using the push model for inputs is much more efficient than pulling
inputs - the hardware can simply copy a large chunk into URB registers
at thread creation time, rather than having the thread send messages to
request data from the L3 cache.  Unfortunately, it's possible to have
more TES inputs than fit in registers, so we have to fall back to the
pull model in some cases.

However, it turns out that most tessellation evaluation shaders are
fairly simple, and don't use many inputs.  An arbitrary cut-off of
32 vec4 slots (16 registers) is more than sufficient to ensure that
100% of TES inputs are pushed for Shadow of Mordor, Unigine Heaven,
GPUTest/TessMark, and SynMark.

Note that unlike most SIMD8 stages, this actually reads packed vec4
data, since that is what our vec4 TCS programs write.

Improves performance in GPUTest's tessmark_x64 microbenchmark
by 93.4426% +/- 5.35541% (n = 25) on my Lenovo X250 at 1024x768.

Improves performance in Synmark's Gl40TerrainFlyTess microbenchmark
by 22.74% +/- 0.309394% (n = 5).

Improves performance in Shadow of Mordor at low settings with
tessellation enabled at 1280x720 by 2.12197% +/- 0.478553% (n = 4).

shader-db statistics for files containing tessellation shaders:

total instructions in shared programs: 184358 -> 181181 (-1.72%)
instructions in affected programs: 27971 -> 24794 (-11.36%)
helped: 226

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 42 +++++++++++++++++-------
 1 file changed, 30 insertions(+), 12 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 788315f6c52..ad347fcdbaf 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -1849,15 +1849,33 @@ fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld,
 
       fs_inst *inst;
       if (indirect_offset.file == BAD_FILE) {
-         /* Replicate the patch handle to all enabled channels */
-         const fs_reg srcs[] = {
-            retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)
-         };
-         fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
-         bld.LOAD_PAYLOAD(patch_handle, srcs, ARRAY_SIZE(srcs), 0);
-
-         inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dest, patch_handle);
-         inst->mlen = 1;
+         /* Arbitrarily only push up to 32 vec4 slots worth of data,
+          * which is 16 registers (since each holds 2 vec4 slots).
+          */
+         const unsigned max_push_slots = 32;
+         if (imm_offset < max_push_slots) {
+            fs_reg src = fs_reg(ATTR, imm_offset / 2, dest.type);
+            for (int i = 0; i < instr->num_components; i++) {
+               bld.MOV(offset(dest, bld, i),
+                       component(src, 4 * (imm_offset % 2) + i));
+            }
+            tes_prog_data->base.urb_read_length =
+               MAX2(tes_prog_data->base.urb_read_length,
+                    DIV_ROUND_UP(imm_offset + 1, 2));
+         } else {
+            /* Replicate the patch handle to all enabled channels */
+            const fs_reg srcs[] = {
+               retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)
+            };
+            fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+            bld.LOAD_PAYLOAD(patch_handle, srcs, ARRAY_SIZE(srcs), 0);
+
+            inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dest, patch_handle);
+            inst->mlen = 1;
+            inst->offset = imm_offset;
+            inst->base_mrf = -1;
+            inst->regs_written = instr->num_components;
+         }
       } else {
          /* Indirect indexing - use per-slot offsets as well. */
          const fs_reg srcs[] = {
@@ -1869,10 +1887,10 @@ fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld,
 
          inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dest, payload);
          inst->mlen = 2;
+         inst->offset = imm_offset;
+         inst->base_mrf = -1;
+         inst->regs_written = instr->num_components;
       }
-      inst->offset = imm_offset;
-      inst->base_mrf = -1;
-      inst->regs_written = instr->num_components;
       break;
    }
    default:
-- 
2.30.2