i965: Add tessellation control shaders.
author Kenneth Graunke <kenneth@whitecape.org>
Tue, 17 Nov 2015 09:07:39 +0000 (01:07 -0800)
committer Kenneth Graunke <kenneth@whitecape.org>
Tue, 22 Dec 2015 10:12:05 +0000 (02:12 -0800)
The TCS is the first tessellation shader stage, and the most
complicated.  It has access to each of the control points in the input
patch, and computes a new output patch.  There is one logical invocation
per output control point; all invocations run in parallel, and can
communicate by reading and writing output variables.

One of the main responsibilities of the TCS is to write the special
gl_TessLevelOuter[] and gl_TessLevelInner[] output variables which
control how much new geometry the hardware tessellation engine will
produce.  Otherwise, it simply writes outputs that are passed along
to the TES.

We run in SIMD4x2 mode, handling two logical invocations per EU thread.
The hardware doesn't properly manage the dispatch mask for us; it always
initializes it to 0xFF.  We wrap the whole program in an IF..ENDIF block
to handle an odd number of invocations, essentially falling back to
SIMD4x1 on the last thread.

v2: Update comments (requested by Jordan Justen).

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
19 files changed:
src/mesa/drivers/dri/i965/Makefile.sources
src/mesa/drivers/dri/i965/brw_compiler.h
src/mesa/drivers/dri/i965/brw_context.h
src/mesa/drivers/dri/i965/brw_defines.h
src/mesa/drivers/dri/i965/brw_link.cpp
src/mesa/drivers/dri/i965/brw_program.h
src/mesa/drivers/dri/i965/brw_reg.h
src/mesa/drivers/dri/i965/brw_shader.cpp
src/mesa/drivers/dri/i965/brw_shader.h
src/mesa/drivers/dri/i965/brw_state_upload.c
src/mesa/drivers/dri/i965/brw_tcs.c [new file with mode: 0644]
src/mesa/drivers/dri/i965/brw_vec4.cpp
src/mesa/drivers/dri/i965/brw_vec4.h
src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp
src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp [new file with mode: 0644]
src/mesa/drivers/dri/i965/brw_vec4_tcs.h [new file with mode: 0644]

index 7354aafbd396b859a0c0879121058ea6adf989e4..0b706de69a08bc96d017270a7f974061fd17f195 100644 (file)
@@ -75,6 +75,7 @@ i965_compiler_FILES = \
        brw_vec4_reg_allocate.cpp \
        brw_vec4_surface_builder.cpp \
        brw_vec4_surface_builder.h \
+       brw_vec4_tcs.cpp \
        brw_vec4_visitor.cpp \
        brw_vec4_vs_visitor.cpp \
        brw_vue_map.c \
@@ -150,6 +151,7 @@ i965_FILES = \
        brw_state.h \
        brw_state_upload.c \
        brw_structs.h \
+       brw_tcs.c \
        brw_tcs_surface_state.c \
        brw_tes.c \
        brw_tes_surface_state.c \
index 64d831d4e91f580b0fa9e6aeaea49fad1814c406..e6bae8e902ff40b16cb3b747690f9dbdcd88fe20 100644 (file)
@@ -191,6 +191,16 @@ struct brw_vs_prog_key {
    struct brw_sampler_prog_key_data tex;
 };
 
+/** The program key for Tessellation Control Shaders. */
+struct brw_tcs_prog_key
+{
+   unsigned program_string_id;
+
+   GLenum tes_primitive_mode;
+
+   struct brw_sampler_prog_key_data tex;
+};
+
 /** The program key for Tessellation Evaluation Shaders. */
 struct brw_tes_prog_key
 {
@@ -676,6 +686,22 @@ brw_compile_vs(const struct brw_compiler *compiler, void *log_data,
                unsigned *final_assembly_size,
                char **error_str);
 
+/**
+ * Compile a tessellation control shader.
+ *
+ * Returns the final assembly and the program's size.
+ */
+const unsigned *
+brw_compile_tcs(const struct brw_compiler *compiler,
+                void *log_data,
+                void *mem_ctx,
+                const struct brw_tcs_prog_key *key,
+                struct brw_tcs_prog_data *prog_data,
+                const struct nir_shader *nir,
+                int shader_time_index,
+                unsigned *final_assembly_size,
+                char **error_str);
+
 /**
  * Compile a tessellation evaluation shader.
  *
index 5e840d1892015c30d333c177b8279f15a065beba..1d989f351bfa25d4b5d9f52548f3699287e4ece0 100644 (file)
@@ -1704,6 +1704,12 @@ brw_vertex_program_const(const struct gl_vertex_program *p)
    return (const struct brw_vertex_program *) p;
 }
 
+static inline struct brw_tess_ctrl_program *
+brw_tess_ctrl_program(struct gl_tess_ctrl_program *p)
+{
+   return (struct brw_tess_ctrl_program *) p;
+}
+
 static inline struct brw_tess_eval_program *
 brw_tess_eval_program(struct gl_tess_eval_program *p)
 {
index 4a184cf72f3c3971df8112044614c869c57edabd..cc19c06f1625ab8cb7de29a37f5e4b87a3210374 100644 (file)
@@ -1305,6 +1305,14 @@ enum opcode {
     *           UD immediate).
     */
    SHADER_OPCODE_MOV_INDIRECT,
+
+   VEC4_OPCODE_URB_READ,
+   TCS_OPCODE_GET_INSTANCE_ID,
+   TCS_OPCODE_URB_WRITE,
+   TCS_OPCODE_SET_INPUT_URB_OFFSETS,
+   TCS_OPCODE_SET_OUTPUT_URB_OFFSETS,
+   TCS_OPCODE_GET_PRIMITIVE_ID,
+   TCS_OPCODE_CREATE_BARRIER_HEADER,
 };
 
 enum brw_urb_write_flags {
index f5a7d204b3d7a8714a1e109a4c9e727f95c0de85..7cdc830f6b89618564e3140b06145dde11c93fbb 100644 (file)
@@ -42,6 +42,7 @@ brw_shader_precompile(struct gl_context *ctx,
                       struct gl_shader_program *sh_prog)
 {
    struct gl_shader *vs = sh_prog->_LinkedShaders[MESA_SHADER_VERTEX];
+   struct gl_shader *tcs = sh_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL];
    struct gl_shader *tes = sh_prog->_LinkedShaders[MESA_SHADER_TESS_EVAL];
    struct gl_shader *gs = sh_prog->_LinkedShaders[MESA_SHADER_GEOMETRY];
    struct gl_shader *fs = sh_prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
@@ -56,6 +57,9 @@ brw_shader_precompile(struct gl_context *ctx,
    if (tes && !brw_tes_precompile(ctx, sh_prog, tes->Program))
       return false;
 
+   if (tcs && !brw_tcs_precompile(ctx, sh_prog, tcs->Program))
+      return false;
+
    if (vs && !brw_vs_precompile(ctx, sh_prog, vs->Program))
       return false;
 
index 1cdab97a82af3ac14ec22ada523529704a6471bb..3d9e1b983c80defec4c7afa47f7cf0283ea045d3 100644 (file)
@@ -56,6 +56,7 @@ void
 brw_dump_ir(const char *stage, struct gl_shader_program *shader_prog,
             struct gl_shader *shader, struct gl_program *prog);
 
+void brw_upload_tcs_prog(struct brw_context *brw);
 void brw_upload_tes_prog(struct brw_context *brw);
 
 #ifdef __cplusplus
index fa912c96c36f18b23b4489ff3b1d7ffd4443fa2a..9f2ff9ae5ad3d225e731a6dd69ea0d0437b9c23a 100644 (file)
@@ -84,6 +84,7 @@ struct brw_device_info;
 #define BRW_SWIZZLE_YZXW      BRW_SWIZZLE4(1,2,0,3)
 #define BRW_SWIZZLE_ZXYW      BRW_SWIZZLE4(2,0,1,3)
 #define BRW_SWIZZLE_ZWZW      BRW_SWIZZLE4(2,3,2,3)
+#define BRW_SWIZZLE_WZYX      BRW_SWIZZLE4(3,2,1,0)
 
 static inline bool
 brw_is_single_value_swizzle(unsigned swiz)
index d9545685b1b242760ca28a8a44f40d098ead5689..9b64ae475bbf028addbb05c37308fbe10ad8feab 100644 (file)
@@ -85,6 +85,7 @@ brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo)
 
    compiler->scalar_stage[MESA_SHADER_VERTEX] =
       devinfo->gen >= 8 && !(INTEL_DEBUG & DEBUG_VEC4VS);
+   compiler->scalar_stage[MESA_SHADER_TESS_CTRL] = false;
    compiler->scalar_stage[MESA_SHADER_TESS_EVAL] = true;
    compiler->scalar_stage[MESA_SHADER_GEOMETRY] =
       devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_GS", false);
@@ -137,6 +138,7 @@ brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo)
       compiler->glsl_compiler_options[i].LowerBufferInterfaceBlocks = true;
    }
 
+   compiler->glsl_compiler_options[MESA_SHADER_TESS_CTRL].EmitNoIndirectInput = false;
    compiler->glsl_compiler_options[MESA_SHADER_TESS_EVAL].EmitNoIndirectInput = false;
 
    if (compiler->scalar_stage[MESA_SHADER_GEOMETRY])
@@ -549,6 +551,21 @@ brw_instruction_name(enum opcode op)
       return "mulh";
    case SHADER_OPCODE_MOV_INDIRECT:
       return "mov_indirect";
+
+   case VEC4_OPCODE_URB_READ:
+      return "urb_read";
+   case TCS_OPCODE_GET_INSTANCE_ID:
+      return "tcs_get_instance_id";
+   case TCS_OPCODE_URB_WRITE:
+      return "tcs_urb_write";
+   case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
+      return "tcs_set_input_urb_offsets";
+   case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
+      return "tcs_set_output_urb_offsets";
+   case TCS_OPCODE_GET_PRIMITIVE_ID:
+      return "tcs_get_primitive_id";
+   case TCS_OPCODE_CREATE_BARRIER_HEADER:
+      return "tcs_create_barrier_header";
    }
 
    unreachable("not reached");
index 2e73f123082fb5541e66a8e613007a33277a48f1..593361348fdfa2ca481ec185212ba21d27328964 100644 (file)
@@ -273,6 +273,9 @@ brw_assign_common_binding_table_offsets(gl_shader_stage stage,
 bool brw_vs_precompile(struct gl_context *ctx,
                        struct gl_shader_program *shader_prog,
                        struct gl_program *prog);
+bool brw_tcs_precompile(struct gl_context *ctx,
+                        struct gl_shader_program *shader_prog,
+                        struct gl_program *prog);
 bool brw_tes_precompile(struct gl_context *ctx,
                         struct gl_shader_program *shader_prog,
                         struct gl_program *prog);
index c657b254f04edab5c8ea95a7649e0bfa2256259f..56962d59c499dad1dadd59cde7bac7d77661f4be 100644 (file)
@@ -678,6 +678,7 @@ brw_upload_programs(struct brw_context *brw,
 {
    if (pipeline == BRW_RENDER_PIPELINE) {
       brw_upload_vs_prog(brw);
+      brw_upload_tcs_prog(brw);
       brw_upload_tes_prog(brw);
 
       if (brw->gen < 6)
diff --git a/src/mesa/drivers/dri/i965/brw_tcs.c b/src/mesa/drivers/dri/i965/brw_tcs.c
new file mode 100644 (file)
index 0000000..b33a16d
--- /dev/null
@@ -0,0 +1,262 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file brw_tcs.c
+ *
+ * Tessellation control shader state upload code.
+ */
+
+#include "brw_context.h"
+#include "brw_nir.h"
+#include "brw_program.h"
+#include "brw_shader.h"
+#include "brw_state.h"
+#include "program/prog_parameter.h"
+
+static void
+brw_tcs_debug_recompile(struct brw_context *brw,
+                       struct gl_shader_program *shader_prog,
+                       const struct brw_tcs_prog_key *key)
+{
+   struct brw_cache_item *c = NULL;
+   const struct brw_tcs_prog_key *old_key = NULL;
+   bool found = false;
+
+   perf_debug("Recompiling tessellation control shader for program %d\n",
+              shader_prog->Name);
+
+   for (unsigned int i = 0; i < brw->cache.size; i++) {
+      for (c = brw->cache.items[i]; c; c = c->next) {
+         if (c->cache_id == BRW_CACHE_TCS_PROG) {
+            old_key = c->key;
+
+            if (old_key->program_string_id == key->program_string_id)
+               break;
+         }
+      }
+      if (c)
+         break;
+   }
+
+   if (!c) {
+      perf_debug("  Didn't find previous compile in the shader cache for "
+                 "debug\n");
+      return;
+   }
+
+   found |= key_debug(brw, "TES primitive mode", old_key->tes_primitive_mode,
+                      key->tes_primitive_mode);
+   found |= brw_debug_recompile_sampler_key(brw, &old_key->tex, &key->tex);
+
+   if (!found) {
+      perf_debug("  Something else\n");
+   }
+}
+
+static bool
+brw_codegen_tcs_prog(struct brw_context *brw,
+                     struct gl_shader_program *shader_prog,
+                     struct brw_tess_ctrl_program *tcp,
+                     struct brw_tcs_prog_key *key)
+{
+   const struct brw_compiler *compiler = brw->intelScreen->compiler;
+   struct brw_stage_state *stage_state = &brw->tcs.base;
+   nir_shader *nir = tcp->program.Base.nir;
+   struct brw_tcs_prog_data prog_data;
+   bool start_busy = false;
+   double start_time = 0;
+
+   memset(&prog_data, 0, sizeof(prog_data));
+
+   /* Allocate the references to the uniforms that will end up in the
+    * prog_data associated with the compiled program, and which will be freed
+    * by the state cache.
+    *
+    * Note: param_count needs to be num_uniform_components * 4, since we add
+    * padding around uniform values below vec4 size, so the worst case is that
+    * every uniform is a float which gets padded to the size of a vec4.
+    */
+   struct gl_shader *tcs = shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL];
+   int param_count = nir->num_uniforms;
+   if (!compiler->scalar_stage[MESA_SHADER_TESS_CTRL])
+      param_count *= 4;
+
+   prog_data.base.base.param =
+      rzalloc_array(NULL, const gl_constant_value *, param_count);
+   prog_data.base.base.pull_param =
+      rzalloc_array(NULL, const gl_constant_value *, param_count);
+   prog_data.base.base.image_param =
+      rzalloc_array(NULL, struct brw_image_param, tcs->NumImages);
+   prog_data.base.base.nr_params = param_count;
+   prog_data.base.base.nr_image_params = tcs->NumImages;
+
+   brw_nir_setup_glsl_uniforms(nir, shader_prog, &tcp->program.Base,
+                               &prog_data.base.base, false);
+
+   if (unlikely(INTEL_DEBUG & DEBUG_TCS))
+      brw_dump_ir("tessellation control", shader_prog, tcs, NULL);
+
+   int st_index = -1;
+   if (unlikely(INTEL_DEBUG & DEBUG_SHADER_TIME))
+      st_index = brw_get_shader_time_index(brw, shader_prog, NULL, ST_TCS);
+
+   if (unlikely(brw->perf_debug)) {
+      start_busy = brw->batch.last_bo && drm_intel_bo_busy(brw->batch.last_bo);
+      start_time = get_time();
+   }
+
+   void *mem_ctx = ralloc_context(NULL);
+   unsigned program_size;
+   char *error_str;
+   const unsigned *program =
+      brw_compile_tcs(compiler, brw, mem_ctx, key, &prog_data, nir, st_index,
+                      &program_size, &error_str);
+   if (program == NULL) {
+      if (shader_prog) {
+         shader_prog->LinkStatus = false;
+         ralloc_strcat(&shader_prog->InfoLog, error_str);
+      }
+
+      _mesa_problem(NULL, "Failed to compile tessellation control shader: "
+                    "%s\n", error_str);
+
+      ralloc_free(mem_ctx);
+      return false;
+   }
+
+   if (unlikely(brw->perf_debug)) {
+      struct brw_shader *btcs = (struct brw_shader *) tcs;
+      if (btcs->compiled_once) {
+         brw_tcs_debug_recompile(brw, shader_prog, key);
+      }
+      if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
+         perf_debug("TCS compile took %.03f ms and stalled the GPU\n",
+                    (get_time() - start_time) * 1000);
+      }
+      btcs->compiled_once = true;
+   }
+
+   /* Scratch space is used for register spilling */
+   if (prog_data.base.base.total_scratch) {
+      brw_get_scratch_bo(brw, &stage_state->scratch_bo,
+                        prog_data.base.base.total_scratch *
+                         brw->max_hs_threads);
+   }
+
+   brw_upload_cache(&brw->cache, BRW_CACHE_TCS_PROG,
+                    key, sizeof(*key),
+                    program, program_size,
+                    &prog_data, sizeof(prog_data),
+                    &stage_state->prog_offset, &brw->tcs.prog_data);
+   ralloc_free(mem_ctx);
+
+   return true;
+}
+
+
+void
+brw_upload_tcs_prog(struct brw_context *brw)
+{
+   struct gl_context *ctx = &brw->ctx;
+   struct gl_shader_program **current = ctx->_Shader->CurrentProgram;
+   struct brw_stage_state *stage_state = &brw->tcs.base;
+   struct brw_tcs_prog_key key;
+   /* BRW_NEW_TESS_CTRL_PROGRAM */
+   struct brw_tess_ctrl_program *tcp =
+      (struct brw_tess_ctrl_program *) brw->tess_ctrl_program;
+
+   if (!brw_state_dirty(brw,
+                        _NEW_TEXTURE,
+                        BRW_NEW_TESS_CTRL_PROGRAM |
+                        BRW_NEW_TESS_EVAL_PROGRAM))
+      return;
+
+   if (tcp == NULL) {
+      /* Other state atoms had better not try to access prog_data, since
+       * there's no HS program.
+       */
+      brw->tcs.prog_data = NULL;
+      brw->tcs.base.prog_data = NULL;
+      return;
+   }
+
+   struct gl_program *prog = &tcp->program.Base;
+
+   memset(&key, 0, sizeof(key));
+
+   key.program_string_id = tcp->id;
+
+   /* _NEW_TEXTURE */
+   brw_populate_sampler_prog_key_data(ctx, prog, stage_state->sampler_count,
+                                      &key.tex);
+
+   /* BRW_NEW_TESS_EVAL_PROGRAM */
+   /* We need to specialize our code generation for tessellation levels
+    * based on the domain the DS is expecting to tessellate.
+    */
+   struct brw_tess_eval_program *tep =
+      (struct brw_tess_eval_program *) brw->tess_eval_program;
+   assert(tep);
+   key.tes_primitive_mode = tep->program.PrimitiveMode;
+
+   if (!brw_search_cache(&brw->cache, BRW_CACHE_TCS_PROG,
+                         &key, sizeof(key),
+                         &stage_state->prog_offset, &brw->tcs.prog_data)) {
+      bool success = brw_codegen_tcs_prog(brw, current[MESA_SHADER_TESS_CTRL],
+                                          tcp, &key);
+      assert(success);
+      (void)success;
+   }
+   brw->tcs.base.prog_data = &brw->tcs.prog_data->base.base;
+}
+
+
+bool
+brw_tcs_precompile(struct gl_context *ctx,
+                   struct gl_shader_program *shader_prog,
+                   struct gl_program *prog)
+{
+   struct brw_context *brw = brw_context(ctx);
+   struct brw_tcs_prog_key key;
+   uint32_t old_prog_offset = brw->tcs.base.prog_offset;
+   struct brw_tcs_prog_data *old_prog_data = brw->tcs.prog_data;
+   bool success;
+
+   struct gl_tess_ctrl_program *tcp = (struct gl_tess_ctrl_program *)prog;
+   struct brw_tess_ctrl_program *btcp = brw_tess_ctrl_program(tcp);
+
+   memset(&key, 0, sizeof(key));
+
+   key.program_string_id = btcp->id;
+   brw_setup_tex_for_precompile(brw, &key.tex, prog);
+
+   key.tes_primitive_mode = GL_TRIANGLES;
+
+   success = brw_codegen_tcs_prog(brw, shader_prog, btcp, &key);
+
+   brw->tcs.base.prog_offset = old_prog_offset;
+   brw->tcs.prog_data = old_prog_data;
+
+   return success;
+}
index a697bdf84a082533ec4065291ee5c399521c89b2..0cded0c87c62b83ad860dd88eddecf3ceeb896ab 100644 (file)
@@ -155,6 +155,9 @@ vec4_instruction::is_send_from_grf()
    case SHADER_OPCODE_TYPED_ATOMIC:
    case SHADER_OPCODE_TYPED_SURFACE_READ:
    case SHADER_OPCODE_TYPED_SURFACE_WRITE:
+   case VEC4_OPCODE_URB_READ:
+   case TCS_OPCODE_URB_WRITE:
+   case SHADER_OPCODE_BARRIER:
       return true;
    default:
       return false;
@@ -184,7 +187,9 @@ bool
 vec4_instruction::has_source_and_destination_hazard() const
 {
    switch (opcode) {
-   /* Most opcodes in the vec4 world use MRFs. */
+   case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
+   case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
+      return true;
    default:
       return false;
    }
@@ -204,6 +209,7 @@ vec4_instruction::regs_read(unsigned arg) const
    case SHADER_OPCODE_TYPED_ATOMIC:
    case SHADER_OPCODE_TYPED_SURFACE_READ:
    case SHADER_OPCODE_TYPED_SURFACE_WRITE:
+   case TCS_OPCODE_URB_WRITE:
       return arg == 0 ? mlen : 1;
 
    case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
@@ -281,6 +287,8 @@ vec4_visitor::implied_mrf_writes(vec4_instruction *inst)
       return 0;
    case GS_OPCODE_FF_SYNC:
       return 1;
+   case TCS_OPCODE_URB_WRITE:
+      return 0;
    case SHADER_OPCODE_SHADER_TIME_ADD:
       return 0;
    case SHADER_OPCODE_TEX:
index 61e57380c42e836d57e479e581907645f767fe71..ddfd87d31c9540dd9e211de7bffa958b2d55d807 100644 (file)
@@ -342,6 +342,7 @@ public:
                        unsigned num_components = 4);
    src_reg get_nir_src(nir_src src,
                        unsigned num_components = 4);
+   src_reg get_indirect_offset(nir_intrinsic_instr *instr);
 
    virtual dst_reg *make_reg_for_system_value(int location,
                                               const glsl_type *type) = 0;
index 85cbf24092e23b4e2d94a15a78137f60cc769069..0c1f0c31b0dd42a9c74ffdeb0cd5c842524c09fd 100644 (file)
@@ -75,6 +75,8 @@ is_expression(const vec4_instruction *const inst)
    case VEC4_OPCODE_UNPACK_UNIFORM:
    case SHADER_OPCODE_FIND_LIVE_CHANNEL:
    case SHADER_OPCODE_BROADCAST:
+   case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
+   case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
       return true;
    case SHADER_OPCODE_RCP:
    case SHADER_OPCODE_RSQ:
index 2d0722aa1ebd08ee8fb0665a1de740f406ed96ce..c31e72def67d5173223548c1113a525728de2c59 100644 (file)
@@ -45,6 +45,9 @@ can_do_writemask(const struct brw_device_info *devinfo,
    case VS_OPCODE_PULL_CONSTANT_LOAD:
    case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
    case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9:
+   case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
+   case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
+   case VEC4_OPCODE_URB_READ:
       return false;
    default:
       /* The MATH instruction on Gen6 only executes in align1 mode, which does
index c3426ddd1c874b58aa0c8b8b71b8fd6fcc3e1966..6325569956fda245195ae8b7a8858bcb5a74369d 100644 (file)
@@ -713,6 +713,220 @@ generate_gs_set_primitive_id(struct brw_codegen *p, struct brw_reg dst)
    brw_pop_insn_state(p);
 }
 
+static void
+generate_tcs_get_instance_id(struct brw_codegen *p, struct brw_reg dst)
+{
+   /* "Instance Count" comes as part of the payload in r0.2 bits 23:17.
+    *
+    * Since we operate in SIMD4x2 mode, we need to run half as many threads
+    * as necessary.  So we assign (2i + 1, 2i) as the thread counts.  We
+    * shift right by one less to accomplish the multiplication by two.
+    */
+   dst = retype(dst, BRW_REGISTER_TYPE_UD);
+   struct brw_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
+
+   brw_push_insn_state(p);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+
+   const int mask = INTEL_MASK(23, 17);
+   const int shift = 17;
+
+   brw_AND(p, get_element_ud(dst, 0), get_element_ud(r0, 2), brw_imm_ud(mask));
+   brw_SHR(p, get_element_ud(dst, 0), get_element_ud(dst, 0),
+           brw_imm_ud(shift - 1));
+   brw_ADD(p, get_element_ud(dst, 4), get_element_ud(dst, 0), brw_imm_ud(1));
+
+   brw_pop_insn_state(p);
+}
+
+static void
+generate_tcs_urb_write(struct brw_codegen *p,
+                       vec4_instruction *inst,
+                       struct brw_reg urb_header)
+{
+   const struct brw_device_info *devinfo = p->devinfo;
+
+   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
+   brw_set_dest(p, send, brw_null_reg());
+   brw_set_src0(p, send, urb_header);
+
+   brw_set_message_descriptor(p, send, BRW_SFID_URB,
+                              inst->mlen /* mlen */, 0 /* rlen */,
+                              true /* header */, false /* eot */);
+   brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_WRITE_OWORD);
+   brw_inst_set_urb_global_offset(devinfo, send, inst->offset);
+   brw_inst_set_urb_per_slot_offset(devinfo, send, 1);
+   brw_inst_set_urb_swizzle_control(devinfo, send, BRW_URB_SWIZZLE_INTERLEAVE);
+
+   /* what happens to swizzles? */
+}
+
+
+static void
+generate_tcs_input_urb_offsets(struct brw_codegen *p,
+                               struct brw_reg dst,
+                               struct brw_reg vertex,
+                               struct brw_reg offset)
+{
+   /* Generates an URB read/write message header for HS/DS operation.
+    * Inputs are a vertex index, and a 128-bit-granular offset from the
+    * beginning of the vertex. */
+
+   /* If `vertex` is not an immediate, we clobber a0.0 */
+
+   assert(vertex.file == BRW_IMMEDIATE_VALUE || vertex.file == BRW_GENERAL_REGISTER_FILE);
+   assert(vertex.type == BRW_REGISTER_TYPE_UD || vertex.type == BRW_REGISTER_TYPE_D);
+
+   assert(dst.file == BRW_GENERAL_REGISTER_FILE);
+
+   brw_push_insn_state(p);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+   brw_MOV(p, dst, brw_imm_ud(0));
+
+   /* m0.5 bits 8-15 are channel enables */
+   brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud(0xff00));
+
+   /* m0.0-0.1: URB handles */
+   if (vertex.file == BRW_IMMEDIATE_VALUE) {
+      uint32_t vertex_index = vertex.ud;
+      struct brw_reg index_reg = brw_vec1_grf(
+            1 + (vertex_index >> 3), vertex_index & 7);
+
+      brw_MOV(p, vec2(get_element_ud(dst, 0)),
+              retype(index_reg, BRW_REGISTER_TYPE_UD));
+   } else {
+      /* Use indirect addressing.  ICP Handles are DWords (single channels
+       * of a register) and start at g1.0.
+       *
+       * In order to start our region at g1.0, we add 8 to the vertex index,
+       * effectively skipping over the 8 channels in g0.0.  This gives us a
+       * DWord offset to the ICP Handle.
+       *
+       * Indirect addressing works in terms of bytes, so we then multiply
+       * the DWord offset by 4 (by shifting left by 2).
+       */
+      struct brw_reg addr = brw_address_reg(0);
+
+      /* bottom half: m0.0 = g[1.0 + vertex.0]UD */
+      brw_ADD(p, addr, get_element_ud(vertex, 0), brw_imm_uw(0x8));
+      brw_SHL(p, addr, addr, brw_imm_ud(2));
+      brw_MOV(p, get_element_ud(dst, 0), deref_1ud(brw_indirect(0, 0), 0));
+
+      /* top half: m0.1 = g[1.0 + vertex.4]UD */
+      brw_ADD(p, addr, get_element_ud(vertex, 4), brw_imm_uw(0x8));
+      brw_SHL(p, addr, addr, brw_imm_ud(2));
+      brw_MOV(p, get_element_ud(dst, 1), deref_1ud(brw_indirect(0, 0), 0));
+   }
+
+   /* m0.3-0.4: 128bit-granular offsets into the URB from the handles */
+   if (offset.file != ARF)
+      brw_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0));
+
+   brw_pop_insn_state(p);
+}
+
+
+static void
+generate_tcs_output_urb_offsets(struct brw_codegen *p,
+                                struct brw_reg dst,
+                                struct brw_reg write_mask,
+                                struct brw_reg offset)
+{
+   /* Generates an URB read/write message header for HS/DS operation, for the patch URB entry. */
+   assert(dst.file == BRW_GENERAL_REGISTER_FILE || dst.file == BRW_MESSAGE_REGISTER_FILE);
+
+   assert(write_mask.file == BRW_IMMEDIATE_VALUE);
+   assert(write_mask.type == BRW_REGISTER_TYPE_UD);
+
+   brw_push_insn_state(p);
+
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+   brw_MOV(p, dst, brw_imm_ud(0));
+
+   unsigned mask = write_mask.ud;
+
+   /* m0.5 bits 15:12 and 11:8 are channel enables */
+   brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud((mask << 8) | (mask << 12)));
+
+   /* HS patch URB handle is delivered in r0.0 */
+   struct brw_reg urb_handle = brw_vec1_grf(0, 0);
+
+   /* m0.0-0.1: URB handles */
+   brw_MOV(p, vec2(get_element_ud(dst, 0)),
+           retype(urb_handle, BRW_REGISTER_TYPE_UD));
+
+   /* m0.3-0.4: 128bit-granular offsets into the URB from the handles */
+   if (offset.file != ARF)
+      brw_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0));
+
+   brw_pop_insn_state(p);
+}
+
+static void
+generate_vec4_urb_read(struct brw_codegen *p,
+                       vec4_instruction *inst,
+                       struct brw_reg dst,
+                       struct brw_reg header)
+{
+   const struct brw_device_info *devinfo = p->devinfo;
+
+   assert(header.file == BRW_GENERAL_REGISTER_FILE);
+   assert(header.type == BRW_REGISTER_TYPE_UD);
+
+   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
+   brw_set_dest(p, send, dst);
+   brw_set_src0(p, send, header);
+
+   brw_set_message_descriptor(p, send, BRW_SFID_URB,
+                              1 /* mlen */, 1 /* rlen */,
+                              true /* header */, false /* eot */);
+   brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_READ_OWORD);
+   brw_inst_set_urb_swizzle_control(devinfo, send, BRW_URB_SWIZZLE_INTERLEAVE);
+   brw_inst_set_urb_per_slot_offset(devinfo, send, 1);
+
+   brw_inst_set_urb_global_offset(devinfo, send, inst->offset);
+}
+
+static void
+generate_tcs_get_primitive_id(struct brw_codegen *p, struct brw_reg dst)
+{
+   brw_push_insn_state(p);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_MOV(p, dst, retype(brw_vec1_grf(0, 1), BRW_REGISTER_TYPE_UD));
+   brw_pop_insn_state(p);
+}
+
+static void
+generate_tcs_create_barrier_header(struct brw_codegen *p,
+                                   struct brw_vue_prog_data *prog_data,
+                                   struct brw_reg dst)
+{
+   struct brw_reg m0_2 = get_element_ud(dst, 2);
+   unsigned instances = ((struct brw_tcs_prog_data *) prog_data)->instances;
+
+   brw_push_insn_state(p);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+
+   /* Zero the message header */
+   brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u));
+
+   /* Copy "Barrier ID" from r0.2 bits 16:13 */
+   brw_AND(p, m0_2,
+           retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
+           brw_imm_ud(0x1e000));
+
+   /* Shift it into place */
+   brw_SHL(p, m0_2, get_element_ud(dst, 2), brw_imm_ud(11));
+
+   /* Set the Barrier Count and the enable bit */
+   brw_OR(p, m0_2, m0_2, brw_imm_ud(instances << 9 | (1 << 15)));
+
+   brw_pop_insn_state(p);
+}
+
 static void
 generate_oword_dual_block_offsets(struct brw_codegen *p,
                                   struct brw_reg m1,
@@ -1538,6 +1752,39 @@ generate_code(struct brw_codegen *p,
          break;
       }
 
+      case TCS_OPCODE_URB_WRITE:
+         generate_tcs_urb_write(p, inst, src[0]);
+         break;
+
+      case VEC4_OPCODE_URB_READ:
+         generate_vec4_urb_read(p, inst, dst, src[0]);
+         break;
+
+      case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
+         generate_tcs_input_urb_offsets(p, dst, src[0], src[1]);
+         break;
+
+      case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
+         generate_tcs_output_urb_offsets(p, dst, src[0], src[1]);
+         break;
+
+      case TCS_OPCODE_GET_INSTANCE_ID:
+         generate_tcs_get_instance_id(p, dst);
+         break;
+
+      case TCS_OPCODE_GET_PRIMITIVE_ID:
+         generate_tcs_get_primitive_id(p, dst);
+         break;
+
+      case TCS_OPCODE_CREATE_BARRIER_HEADER:
+         generate_tcs_create_barrier_header(p, prog_data, dst);
+         break;
+
+      case SHADER_OPCODE_BARRIER:
+         brw_barrier(p, src[0]);
+         brw_WAIT(p);
+         break;
+
       default:
          unreachable("Unsupported opcode");
       }
index 0ded465ebff74268ff3a994ec2307ced985a4031..25996281131b128467342533f126a8a2ccaa5646 100644 (file)
@@ -327,6 +327,24 @@ vec4_visitor::get_nir_src(nir_src src, unsigned num_components)
    return get_nir_src(src, nir_type_int, num_components);
 }
 
+src_reg
+vec4_visitor::get_indirect_offset(nir_intrinsic_instr *instr)
+{
+   nir_src *offset_src = nir_get_io_offset_src(instr);
+   nir_const_value *const_value = nir_src_as_const_value(*offset_src);
+
+   if (const_value) {
+      /* The only constant offset we should find is 0.  brw_nir.c's
+       * add_const_offset_to_base() will fold other constant offsets
+       * into instr->const_index[0].
+       */
+      assert(const_value->u[0] == 0);
+      return src_reg();
+   }
+
+   return get_nir_src(*offset_src, BRW_REGISTER_TYPE_UD, 1);
+}
+
 void
 vec4_visitor::nir_emit_load_const(nir_load_const_instr *instr)
 {
@@ -650,7 +668,10 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
 
    case nir_intrinsic_load_vertex_id_zero_base:
    case nir_intrinsic_load_base_vertex:
-   case nir_intrinsic_load_instance_id: {
+   case nir_intrinsic_load_instance_id:
+   case nir_intrinsic_load_invocation_id:
+   case nir_intrinsic_load_tess_level_inner:
+   case nir_intrinsic_load_tess_level_outer: {
       gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
       src_reg val = src_reg(nir_system_values[sv]);
       assert(val.file != BAD_FILE);
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
new file mode 100644 (file)
index 0000000..22224d1
--- /dev/null
@@ -0,0 +1,496 @@
+/*
+ * Copyright © 2013 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file brw_vec4_tcs.cpp
+ *
+ * Tessellation control shader specific code derived from the vec4_visitor class.
+ */
+
+#include "brw_nir.h"
+#include "brw_vec4_tcs.h"
+
+namespace brw {
+
+/**
+ * Construct a TCS visitor.
+ *
+ * Forwards compiler, log_data, &key->tex, &prog_data->base, nir, mem_ctx and
+ * shader_time_index to the vec4_visitor constructor (with a literal `false`
+ * for the argument preceding shader_time_index) and stores \p key in
+ * this->key.
+ */
+vec4_tcs_visitor::vec4_tcs_visitor(const struct brw_compiler *compiler,
+                                   void *log_data,
+                                   const struct brw_tcs_prog_key *key,
+                                   struct brw_tcs_prog_data *prog_data,
+                                   const nir_shader *nir,
+                                   void *mem_ctx,
+                                   int shader_time_index)
+   : vec4_visitor(compiler, log_data, &key->tex, &prog_data->base,
+                  nir, mem_ctx, false, shader_time_index),
+     key(key)
+{
+}
+
+
+/**
+ * Intentionally empty override: this visitor performs no per-intrinsic
+ * system value setup.
+ */
+void
+vec4_tcs_visitor::nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr)
+{
+}
+
+/**
+ * Override that never allocates a register: it returns NULL for every
+ * location and type.
+ */
+dst_reg *
+vec4_tcs_visitor::make_reg_for_system_value(int location, const glsl_type *type)
+{
+   return NULL;
+}
+
+
+/**
+ * Lay out the thread payload and record the first GRF that follows it.
+ */
+void
+vec4_tcs_visitor::setup_payload()
+{
+   /* r0 always contains important data: the URB handles that are passed
+    * on to the URB write at the end of the thread.
+    */
+   const int header_regs = 1;
+
+   /* r1.0 - r4.7 may contain the input control point URB handles, which
+    * we use to pull vertex data.
+    */
+   const int input_handle_regs = 4;
+
+   /* Push constants may start at r5.0.  Whatever setup_uniforms() returns
+    * becomes the first non-payload GRF.
+    */
+   this->first_non_payload_grf =
+      setup_uniforms(header_regs + input_handle_regs);
+}
+
+
+/**
+ * Emit the shader prologue: fetch the invocation ID and, when the number of
+ * output vertices is odd, open an IF that emit_thread_end() closes.
+ */
+void
+vec4_tcs_visitor::emit_prolog()
+{
+   invocation_id = src_reg(this, glsl_type::uint_type);
+   emit(TCS_OPCODE_GET_INSTANCE_ID, dst_reg(invocation_id));
+
+   /* HS threads are dispatched with the dispatch mask set to 0xFF.  With an
+    * even number of output vertices every channel does real work, so there
+    * is nothing to mask off.
+    */
+   const bool odd_vertex_count = (nir->info.tcs.vertices_out % 2) != 0;
+   if (!odd_vertex_count)
+      return;
+
+   /* With an odd count, the final HS instance dispatched only has its
+    * bottom half doing real work, so the upper half must be disabled:
+    * predicate everything on invocation_id < vertices_out.
+    */
+   emit(CMP(dst_null_d(), invocation_id,
+            brw_imm_ud(nir->info.tcs.vertices_out), BRW_CONDITIONAL_L));
+
+   /* Matching ENDIF is in emit_thread_end() */
+   emit(IF(BRW_PREDICATE_NORMAL));
+}
+
+
+/**
+ * Emit the end of the thread: close the IF opened by emit_prolog() (odd
+ * vertex counts only), optionally end shader-time accounting, and emit the
+ * final header-only URB write flagged EOT/complete.
+ */
+void
+vec4_tcs_visitor::emit_thread_end()
+{
+   current_annotation = "thread end";
+
+   const bool odd_vertex_count = (nir->info.tcs.vertices_out % 2) != 0;
+   if (odd_vertex_count)
+      emit(BRW_OPCODE_ENDIF);
+
+   if (unlikely(INTEL_DEBUG & DEBUG_SHADER_TIME))
+      emit_shader_time_end();
+
+   vec4_instruction *eot_write = emit(VS_OPCODE_URB_WRITE);
+   eot_write->urb_write_flags = BRW_URB_WRITE_EOT_COMPLETE;
+   eot_write->mlen = 1;   /* just the header, no data. */
+}
+
+
+void
+vec4_tcs_visitor::emit_input_urb_read(const dst_reg &dst,
+                                      const src_reg &vertex_index,
+                                      unsigned base_offset,
+                                      const src_reg &indirect_offset)
+{
+   vec4_instruction *inst;
+   dst_reg temp(this, glsl_type::ivec4_type);
+   temp.type = dst.type;
+
+   /* Set up the message header to reference the proper parts of the URB */
+   dst_reg header = dst_reg(this, glsl_type::uvec4_type);
+   inst = emit(TCS_OPCODE_SET_INPUT_URB_OFFSETS, header, vertex_index,
+               indirect_offset);
+   inst->force_writemask_all = true;
+
+   /* Read into a temporary, ignoring writemasking. */
+   inst = emit(VEC4_OPCODE_URB_READ, temp, src_reg(header));
+   inst->offset = base_offset;
+   inst->mlen = 1;
+   inst->base_mrf = -1;
+
+   /* Copy the temporary to the destination to deal with writemasking.
+    *
+    * Also attempt to deal with gl_PointSize being in the .w component.
+    */
+   if (inst->offset == 0 && indirect_offset.file == BAD_FILE) {
+      emit(MOV(dst, swizzle(src_reg(temp), BRW_SWIZZLE_WWWW)));
+   } else {
+      emit(MOV(dst, src_reg(temp)));
+   }
+}
+
+void
+vec4_tcs_visitor::emit_output_urb_read(const dst_reg &dst,
+                                       unsigned base_offset,
+                                       const src_reg &indirect_offset)
+{
+   vec4_instruction *inst;
+
+   /* Set up the message header to reference the proper parts of the URB */
+   dst_reg header = dst_reg(this, glsl_type::uvec4_type);
+   inst = emit(TCS_OPCODE_SET_OUTPUT_URB_OFFSETS, header,
+               brw_imm_ud(dst.writemask), indirect_offset);
+   inst->force_writemask_all = true;
+
+   /* Read into a temporary, ignoring writemasking. */
+   vec4_instruction *read = emit(VEC4_OPCODE_URB_READ, dst, src_reg(header));
+   read->offset = base_offset;
+   read->mlen = 1;
+   read->base_mrf = -1;
+}
+
+void
+vec4_tcs_visitor::emit_urb_write(const src_reg &value,
+                                 unsigned writemask,
+                                 unsigned base_offset,
+                                 const src_reg &indirect_offset)
+{
+   if (writemask == 0)
+      return;
+
+   src_reg message(this, glsl_type::uvec4_type, 2);
+   vec4_instruction *inst;
+
+   inst = emit(TCS_OPCODE_SET_OUTPUT_URB_OFFSETS, dst_reg(message),
+               brw_imm_ud(writemask), indirect_offset);
+   inst->force_writemask_all = true;
+   inst = emit(MOV(offset(dst_reg(retype(message, value.type)), 1), value));
+   inst->force_writemask_all = true;
+
+   inst = emit(TCS_OPCODE_URB_WRITE, dst_null_f(), message);
+   inst->offset = base_offset;
+   inst->mlen = 2;
+   inst->base_mrf = -1;
+}
+
+/**
+ * Number of gl_TessLevelOuter[] components used by a tessellation domain:
+ * 4 for quads, 3 for triangles, 2 for isolines.
+ */
+static unsigned
+tesslevel_outer_components(GLenum tes_primitive_mode)
+{
+   if (tes_primitive_mode == GL_QUADS)
+      return 4;
+   if (tes_primitive_mode == GL_TRIANGLES)
+      return 3;
+   if (tes_primitive_mode == GL_ISOLINES)
+      return 2;
+
+   unreachable("Bogus tessellation domain");
+   return 0;
+}
+
+/**
+ * Number of gl_TessLevelInner[] components used by a tessellation domain:
+ * 2 for quads, 1 for triangles, 0 for isolines.
+ */
+static unsigned
+tesslevel_inner_components(GLenum tes_primitive_mode)
+{
+   if (tes_primitive_mode == GL_QUADS)
+      return 2;
+   if (tes_primitive_mode == GL_TRIANGLES)
+      return 1;
+   if (tes_primitive_mode == GL_ISOLINES)
+      return 0;
+
+   unreachable("Bogus tessellation domain");
+   return 0;
+}
+
+/**
+ * Mirror the low four bits of a writemask.
+ *
+ * A normal .xyzw writemask is converted into the writemask for a vector
+ * that is stored backwards (.wzyx): X <-> W and Y <-> Z trade places.
+ * Bits above the low four are ignored.
+ */
+static unsigned
+writemask_for_backwards_vector(unsigned mask)
+{
+   unsigned reversed = 0;
+
+   if (mask & (1u << 0))
+      reversed |= 1u << 3;
+   if (mask & (1u << 1))
+      reversed |= 1u << 2;
+   if (mask & (1u << 2))
+      reversed |= 1u << 1;
+   if (mask & (1u << 3))
+      reversed |= 1u << 0;
+
+   return reversed;
+}
+
+void
+vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
+{
+   switch (instr->intrinsic) {
+   case nir_intrinsic_load_invocation_id:
+      emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_UD),
+               invocation_id));
+      break;
+   case nir_intrinsic_load_primitive_id:
+      emit(TCS_OPCODE_GET_PRIMITIVE_ID,
+           get_nir_dest(instr->dest, BRW_REGISTER_TYPE_UD));
+      break;
+   case nir_intrinsic_load_patch_vertices_in:
+      unreachable("XXX: gl_PatchVerticesIn not implemented yet.");
+      break;
+   case nir_intrinsic_load_per_vertex_input: {
+      src_reg indirect_offset = get_indirect_offset(instr);
+      unsigned imm_offset = instr->const_index[0];
+
+      nir_const_value *vertex_const = nir_src_as_const_value(instr->src[0]);
+      src_reg vertex_index =
+         vertex_const ? src_reg(brw_imm_ud(vertex_const->u[0]))
+                      : get_nir_src(instr->src[0], BRW_REGISTER_TYPE_UD, 1);
+
+      dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
+      dst.writemask = brw_writemask_for_size(instr->num_components);
+
+      emit_input_urb_read(dst, vertex_index, imm_offset, indirect_offset);
+      break;
+   }
+   case nir_intrinsic_load_input:
+      unreachable("nir_lower_io should use load_per_vertex_input intrinsics");
+      break;
+   case nir_intrinsic_load_output:
+   case nir_intrinsic_load_per_vertex_output: {
+      src_reg indirect_offset = get_indirect_offset(instr);
+      unsigned imm_offset = instr->const_index[0];;
+
+      dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
+      dst.writemask = brw_writemask_for_size(instr->num_components);
+
+      if (imm_offset == 0 && indirect_offset.file == BAD_FILE) {
+         dst.type = BRW_REGISTER_TYPE_F;
+
+         /* This is a read of gl_TessLevelInner[], which lives in the
+          * Patch URB header.  The layout depends on the domain.
+          */
+         switch (key->tes_primitive_mode) {
+         case GL_QUADS: {
+            /* DWords 3-2 (reversed); use offset 0 and WZYX swizzle. */
+            dst_reg tmp(this, glsl_type::vec4_type);
+            emit_output_urb_read(tmp, 0, src_reg());
+            emit(MOV(writemask(dst, WRITEMASK_XY),
+                     swizzle(src_reg(tmp), BRW_SWIZZLE_WZYX)));
+            break;
+         }
+         case GL_TRIANGLES:
+            /* DWord 4; use offset 1 but normal swizzle/writemask. */
+            emit_output_urb_read(writemask(dst, WRITEMASK_X), 1, src_reg());
+            break;
+         case GL_ISOLINES:
+            /* All channels are undefined. */
+            return;
+         default:
+            unreachable("Bogus tessellation domain");
+         }
+      } else if (imm_offset == 1 && indirect_offset.file == BAD_FILE) {
+         dst.type = BRW_REGISTER_TYPE_F;
+
+         /* This is a read of gl_TessLevelOuter[], which lives in the
+          * high 4 DWords of the Patch URB header, in reverse order.
+          */
+         switch (key->tes_primitive_mode) {
+         case GL_QUADS:
+            dst.writemask = WRITEMASK_XYZW;
+            break;
+         case GL_TRIANGLES:
+            dst.writemask = WRITEMASK_XYZ;
+            break;
+         case GL_ISOLINES:
+            dst.writemask = WRITEMASK_XY;
+            return;
+         default:
+            unreachable("Bogus tessellation domain");
+         }
+
+         dst_reg tmp(this, glsl_type::vec4_type);
+         emit_output_urb_read(tmp, 1, src_reg());
+         emit(MOV(dst, swizzle(src_reg(tmp), BRW_SWIZZLE_WZYX)));
+      } else {
+         emit_output_urb_read(dst, imm_offset, indirect_offset);
+      }
+      break;
+   }
+   case nir_intrinsic_store_output:
+   case nir_intrinsic_store_per_vertex_output: {
+      src_reg value = get_nir_src(instr->src[0]);
+      unsigned mask = instr->const_index[1];
+      unsigned swiz = BRW_SWIZZLE_XYZW;
+
+      src_reg indirect_offset = get_indirect_offset(instr);
+      unsigned imm_offset = instr->const_index[0];
+
+      if (imm_offset == 0 && indirect_offset.file == BAD_FILE) {
+         value.type = BRW_REGISTER_TYPE_F;
+
+         mask &= (1 << tesslevel_inner_components(key->tes_primitive_mode)) - 1;
+
+         /* This is a write to gl_TessLevelInner[], which lives in the
+          * Patch URB header.  The layout depends on the domain.
+          */
+         switch (key->tes_primitive_mode) {
+         case GL_QUADS:
+            /* gl_TessLevelInner[].xy lives at DWords 3-2 (reversed).
+             * We use an XXYX swizzle to reverse put .xy in the .wz
+             * channels, and use a .zw writemask.
+             */
+            swiz = BRW_SWIZZLE4(0, 0, 1, 0);
+            mask = writemask_for_backwards_vector(mask);
+            break;
+         case GL_TRIANGLES:
+            /* gl_TessLevelInner[].x lives at DWord 4, so we set the
+             * writemask to X and bump the URB offset by 1.
+             */
+            imm_offset = 1;
+            break;
+         case GL_ISOLINES:
+            /* Skip; gl_TessLevelInner[] doesn't exist for isolines. */
+            return;
+         default:
+            unreachable("Bogus tessellation domain");
+         }
+      } else if (imm_offset == 1 && indirect_offset.file == BAD_FILE) {
+         value.type = BRW_REGISTER_TYPE_F;
+
+         mask &= (1 << tesslevel_outer_components(key->tes_primitive_mode)) - 1;
+
+         /* This is a write to gl_TessLevelOuter[] which lives in the
+          * Patch URB Header at DWords 4-7.  However, it's reversed, so
+          * instead of .xyzw we have .wzyx.
+          */
+         swiz = BRW_SWIZZLE_WZYX;
+         mask = writemask_for_backwards_vector(mask);
+      }
+
+      emit_urb_write(swizzle(value, swiz), mask,
+                     imm_offset, indirect_offset);
+      break;
+   }
+
+   case nir_intrinsic_barrier: {
+      dst_reg header = dst_reg(this, glsl_type::uvec4_type);
+      emit(TCS_OPCODE_CREATE_BARRIER_HEADER, header);
+      emit(SHADER_OPCODE_BARRIER, dst_null_ud(), src_reg(header));
+      break;
+   }
+
+   default:
+      vec4_visitor::nir_emit_intrinsic(instr);
+   }
+}
+
+
+extern "C" const unsigned *
+brw_compile_tcs(const struct brw_compiler *compiler,
+                void *log_data,
+                void *mem_ctx,
+                const struct brw_tcs_prog_key *key,
+                struct brw_tcs_prog_data *prog_data,
+                const nir_shader *src_shader,
+                int shader_time_index,
+                unsigned *final_assembly_size,
+                char **error_str)
+{
+   const struct brw_device_info *devinfo = compiler->devinfo;
+   struct brw_vue_prog_data *vue_prog_data = &prog_data->base;
+   const bool is_scalar = compiler->scalar_stage[MESA_SHADER_TESS_CTRL];
+
+   nir_shader *nir = nir_shader_clone(mem_ctx, src_shader);
+   nir = brw_nir_apply_sampler_key(nir, devinfo, &key->tex, is_scalar);
+   nir = brw_postprocess_nir(nir, compiler->devinfo, is_scalar);
+
+   prog_data->instances = DIV_ROUND_UP(nir->info.tcs.vertices_out, 2);
+
+   brw_compute_tess_vue_map(&vue_prog_data->vue_map,
+                            nir->info.outputs_written,
+                            nir->info.patch_outputs_written);
+
+   /* Compute URB entry size.  The maximum allowed URB entry size is 32k.
+    * That divides up as follows:
+    *
+    *     32 bytes for the patch header (tessellation factors)
+    *    480 bytes for per-patch varyings (a varying component is 4 bytes and
+    *              gl_MaxTessPatchComponents = 120)
+    *  16384 bytes for per-vertex varyings (a varying component is 4 bytes,
+    *              gl_MaxPatchVertices = 32 and
+    *              gl_MaxTessControlOutputComponents = 128)
+    *
+    *  15808 bytes left for varying packing overhead
+    */
+   const int num_per_patch_slots = vue_prog_data->vue_map.num_per_patch_slots;
+   const int num_per_vertex_slots = vue_prog_data->vue_map.num_per_vertex_slots;
+   unsigned output_size_bytes = 0;
+   /* Note that the patch header is counted in num_per_patch_slots. */
+   output_size_bytes += num_per_patch_slots * 16;
+   output_size_bytes += nir->info.tcs.vertices_out * num_per_vertex_slots * 16;
+
+   assert(output_size_bytes >= 1);
+   if (output_size_bytes > GEN7_MAX_HS_URB_ENTRY_SIZE_BYTES)
+      return false;
+
+   /* URB entry sizes are stored as a multiple of 64 bytes. */
+   vue_prog_data->urb_entry_size = ALIGN(output_size_bytes, 64) / 64;
+
+   struct brw_vue_map input_vue_map;
+   brw_compute_vue_map(devinfo, &input_vue_map,
+                       nir->info.inputs_read & ~VARYING_BIT_PRIMITIVE_ID,
+                       true);
+
+   /* HS does not use the usual payload pushing from URB to GRFs,
+    * because we don't have enough registers for a full-size payload, and
+    * the hardware is broken on Haswell anyway.
+    */
+   vue_prog_data->urb_read_length = 0;
+
+   if (unlikely(INTEL_DEBUG & DEBUG_TCS)) {
+      fprintf(stderr, "TCS Input ");
+      brw_print_vue_map(stderr, &input_vue_map);
+      fprintf(stderr, "TCS Output ");
+      brw_print_vue_map(stderr, &vue_prog_data->vue_map);
+   }
+
+   vec4_tcs_visitor v(compiler, log_data, key, prog_data,
+                      nir, mem_ctx, shader_time_index);
+   if (!v.run()) {
+      if (error_str)
+         *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
+      return NULL;
+   }
+
+   if (unlikely(INTEL_DEBUG & DEBUG_TCS))
+      v.dump_instructions();
+
+   return brw_vec4_generate_assembly(compiler, log_data, mem_ctx, nir,
+                                     &prog_data->base, v.cfg,
+                                     final_assembly_size);
+}
+
+
+} /* namespace brw */
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_tcs.h b/src/mesa/drivers/dri/i965/brw_vec4_tcs.h
new file mode 100644 (file)
index 0000000..2bf4885
--- /dev/null
@@ -0,0 +1,84 @@
+/*
+ * Copyright © 2013 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file brw_vec4_tcs.h
+ *
+ * The vec4-mode tessellation control shader compiler backend.
+ */
+
+#ifndef BRW_VEC4_TCS_H
+#define BRW_VEC4_TCS_H
+
+#include "brw_compiler.h"
+#include "brw_vec4.h"
+
+#ifdef __cplusplus
+namespace brw {
+
+/**
+ * vec4_visitor specialization for tessellation control shaders.
+ */
+class vec4_tcs_visitor : public vec4_visitor
+{
+public:
+   vec4_tcs_visitor(const struct brw_compiler *compiler,
+                    void *log_data,
+                    const struct brw_tcs_prog_key *key,
+                    struct brw_tcs_prog_data *prog_data,
+                    const nir_shader *nir,
+                    void *mem_ctx,
+                    int shader_time_index);
+
+protected:
+   /* Overrides of vec4_visitor virtuals. */
+   virtual dst_reg *make_reg_for_system_value(int location,
+                                              const glsl_type *type);
+   virtual void nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr);
+   virtual void setup_payload();
+   virtual void emit_prolog();
+   virtual void emit_thread_end();
+
+   virtual void nir_emit_intrinsic(nir_intrinsic_instr *instr);
+
+   /* URB read helpers for inputs and for this shader's own outputs. */
+   void emit_input_urb_read(const dst_reg &dst,
+                            const src_reg &vertex_index,
+                            unsigned base_offset,
+                            const src_reg &indirect_offset);
+   void emit_output_urb_read(const dst_reg &dst,
+                             unsigned base_offset,
+                             const src_reg &indirect_offset);
+
+   /* URB write helper for outputs. */
+   void emit_urb_write(const src_reg &value, unsigned writemask,
+                       unsigned base_offset, const src_reg &indirect_offset);
+
+   /* We do not use the normal end-of-shader URB write mechanism, but every
+    * vec4 stage must provide implementations of these, so they are no-ops:
+    */
+   virtual void emit_urb_write_header(int mrf) {}
+   virtual vec4_instruction *emit_urb_write_opcode(bool complete) { return NULL; }
+
+   /* Program key supplied to the constructor. */
+   const struct brw_tcs_prog_key *key;
+   /* Register holding the invocation ID; assigned in emit_prolog(). */
+   src_reg invocation_id;
+};
+
+} /* namespace brw */
+#endif /* __cplusplus */
+
+#endif /* BRW_VEC4_TCS_H */