From 8b6a797d743be38396fcaf4a2f7fb01d3bcd9ba3 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Kristian=20H=C3=B8gsberg?=
Date: Mon, 27 Oct 2014 22:42:50 -0700
Subject: [PATCH] i965: Add fs_visitor::run_vs() to generate scalar vertex shader code
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

This patch uses the previous refactoring to add a new run_vs() method
that generates vertex shader code using the scalar visitor and
optimizer.

Signed-off-by: Kristian Høgsberg
Reviewed-by: Kenneth Graunke
---
 src/mesa/drivers/dri/i965/brw_fs.cpp         | 111 ++++++-
 src/mesa/drivers/dri/i965/brw_fs.h           |  21 +-
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 317 ++++++++++++++++++-
 3 files changed, 436 insertions(+), 13 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index b1afe46b6b2..16f8b32639c 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -1808,6 +1808,61 @@ fs_visitor::assign_urb_setup()
       urb_start + prog_data->num_varying_inputs * 2;
 }
 
+void
+fs_visitor::assign_vs_urb_setup()
+{
+   brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
+   int grf, count, slot, channel, attr;
+
+   assert(stage == MESA_SHADER_VERTEX);
+   count = _mesa_bitcount_64(vs_prog_data->inputs_read);
+   if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
+      count++;
+
+   /* Each attribute is 4 regs. */
+   this->first_non_payload_grf =
+      payload.num_regs + prog_data->curb_read_length + count * 4;
+
+   unsigned vue_entries =
+      MAX2(count, vs_prog_data->base.vue_map.num_slots);
+
+   vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
+   vs_prog_data->base.urb_read_length = (count + 1) / 2;
+
+   assert(vs_prog_data->base.urb_read_length <= 15);
+
+   /* Rewrite all ATTR file references to the hw grf that they land in. */
+   foreach_block_and_inst(block, fs_inst, inst, cfg) {
+      for (int i = 0; i < inst->sources; i++) {
+         if (inst->src[i].file == ATTR) {
+
+            if (inst->src[i].reg == VERT_ATTRIB_MAX) {
+               slot = count - 1;
+            } else {
+               /* Attributes come in a contiguous block, ordered by their
+                * gl_vert_attrib value.  That means we can compute the slot
+                * number for an attribute by masking out the enabled
+                * attributes before it and counting the bits.
+                */
+               attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
+               slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
+                                        BITFIELD64_MASK(attr));
+            }
+
+            channel = inst->src[i].reg_offset & 3;
+
+            grf = payload.num_regs +
+               prog_data->curb_read_length +
+               slot * 4 + channel;
+
+            inst->src[i].file = HW_REG;
+            inst->src[i].fixed_hw_reg =
+               retype(brw_vec8_grf(grf, 0), inst->src[i].type);
+         }
+      }
+   }
+}
+
 /**
  * Split large virtual GRFs into separate components if we can.
  *
@@ -3395,6 +3450,13 @@ fs_visitor::setup_payload_gen6()
    }
 }
 
+void
+fs_visitor::setup_vs_payload()
+{
+   /* R0: thread header, R1: urb handles */
+   payload.num_regs = 2;
+}
+
 void
 fs_visitor::assign_binding_table_offsets()
 {
@@ -3433,6 +3495,8 @@ fs_visitor::calculate_register_pressure()
 void
 fs_visitor::optimize()
 {
+   const char *stage_name = stage == MESA_SHADER_VERTEX ? "vs" : "fs";
+
    calculate_cfg();
 
    split_virtual_grfs();
@@ -3447,8 +3511,8 @@ fs_visitor::optimize()
                                                                        \
       if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) {  \
          char filename[64];                                            \
-         snprintf(filename, 64, "fs%d-%04d-%02d-%02d-" #pass,          \
-                  dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
+         snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass,          \
+                  stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
                                                                        \
          backend_visitor::dump_instructions(filename);                 \
       }                                                                \
@@ -3458,8 +3522,8 @@ fs_visitor::optimize()
 
    if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
       char filename[64];
-      snprintf(filename, 64, "fs%d-%04d-00-start",
-               dispatch_width, shader_prog ? shader_prog->Name : 0);
+      snprintf(filename, 64, "%s%d-%04d-00-start",
+               stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0);
 
       backend_visitor::dump_instructions(filename);
    }
@@ -3527,6 +3591,9 @@ fs_visitor::allocate_registers()
    }
 
    if (!allocated_without_spills) {
+      const char *stage_name = stage == MESA_SHADER_VERTEX ?
+         "Vertex" : "Fragment";
+
      /* We assume that any spilling is worse than just dropping back to
       * SIMD8.  There's probably actually some intermediate point where
       * SIMD16 with a couple of spills is still better.
@@ -3535,9 +3602,9 @@ fs_visitor::allocate_registers()
         fail("Failure to register allocate.  Reduce number of "
              "live scalar values to avoid this.");
      } else {
-        perf_debug("Fragment shader triggered register spilling.  "
+        perf_debug("%s shader triggered register spilling.  "
                    "Try reducing the number of live scalar values to "
-                   "improve performance.\n");
+                   "improve performance.\n", stage_name);
 
      /* Since we're out of heuristics, just go spill registers until we
       * get an allocation.
@@ -3565,6 +3632,38 @@ fs_visitor::allocate_registers()
       prog_data->total_scratch = brw_get_scratch_size(last_scratch);
 }
 
+bool
+fs_visitor::run_vs()
+{
+   assert(stage == MESA_SHADER_VERTEX);
+
+   assign_common_binding_table_offsets(0);
+   setup_vs_payload();
+
+   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+      emit_shader_time_begin();
+
+   foreach_in_list(ir_instruction, ir, shader->base.ir) {
+      base_ir = ir;
+      this->result = reg_undef;
+      ir->accept(this);
+   }
+   base_ir = NULL;
+   if (failed)
+      return false;
+
+   emit_urb_writes();
+
+   optimize();
+
+   assign_curb_setup();
+   assign_vs_urb_setup();
+
+   allocate_registers();
+
+   return !failed;
+}
+
 bool
 fs_visitor::run()
 {
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index a674a0256a4..84a0b101e5b 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -308,12 +308,23 @@ public:
               struct gl_shader_program *shader_prog,
               struct gl_fragment_program *fp,
               unsigned dispatch_width);
+
+   fs_visitor(struct brw_context *brw,
+              void *mem_ctx,
+              const struct brw_vs_prog_key *key,
+              struct brw_vs_prog_data *prog_data,
+              struct gl_shader_program *shader_prog,
+              struct gl_vertex_program *cp,
+              unsigned dispatch_width);
+
    ~fs_visitor();
    void init();
 
    fs_reg *variable_storage(ir_variable *var);
    int virtual_grf_alloc(int size);
    void import_uniforms(fs_visitor *v);
+   void setup_uniform_clipplane_values();
+   void compute_clip_distance();
 
    void visit(ir_variable *ir);
    void visit(ir_assignment *ir);
@@ -404,14 +415,17 @@ public:
                     uint32_t const_offset);
 
    bool run();
+   bool run_vs();
    void optimize();
    void allocate_registers();
    void assign_binding_table_offsets();
    void setup_payload_gen4();
    void setup_payload_gen6();
+   void setup_vs_payload();
    void assign_curb_setup();
    void calculate_urb_setup();
    void assign_urb_setup();
+   void assign_vs_urb_setup();
    bool assign_regs(bool allow_spilling);
    void assign_regs_trivial();
    void get_used_mrfs(bool *mrf_used);
@@ -465,6 +479,7 @@ public:
    fs_reg *emit_samplepos_setup();
    fs_reg *emit_sampleid_setup();
    fs_reg *emit_general_interpolation(ir_variable *ir);
+   fs_reg *emit_vs_system_value(enum brw_reg_type type, int location);
    void emit_interpolation_setup_gen4();
    void emit_interpolation_setup_gen6();
    void compute_sample_position(fs_reg dst, fs_reg int_sample_pos);
@@ -552,6 +567,7 @@ public:
    fs_inst *emit_single_fb_write(fs_reg color1, fs_reg color2,
                                  fs_reg src0_alpha, unsigned components);
    void emit_fb_writes();
+   void emit_urb_writes();
 
    void emit_shader_time_begin();
    void emit_shader_time_end();
@@ -627,8 +643,8 @@ public:
    struct hash_table *variable_ht;
    fs_reg frag_depth;
    fs_reg sample_mask;
-   fs_reg outputs[BRW_MAX_DRAW_BUFFERS];
-   unsigned output_components[BRW_MAX_DRAW_BUFFERS];
+   fs_reg outputs[VARYING_SLOT_MAX];
+   unsigned output_components[VARYING_SLOT_MAX];
    fs_reg dual_src_output;
    bool do_dual_src;
    int first_non_payload_grf;
@@ -675,6 +691,7 @@ public:
    fs_reg delta_x[BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT];
    fs_reg delta_y[BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT];
    fs_reg shader_start_time;
+   fs_reg userplane[MAX_CLIP_PLANES];
 
    int grf_used;
    bool spilled_any_registers;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index a9f5474cea3..399e772e3c5 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -43,11 +43,40 @@ extern "C" {
 #include "brw_eu.h"
 #include "brw_wm.h"
 }
+#include "brw_vec4.h"
 #include "brw_fs.h"
 #include "main/uniforms.h"
 #include "glsl/glsl_types.h"
 #include "glsl/ir_optimization.h"
 
+fs_reg *
+fs_visitor::emit_vs_system_value(enum brw_reg_type type, int location)
+{
+   fs_reg *reg = new(this->mem_ctx)
+      fs_reg(ATTR, VERT_ATTRIB_MAX, type);
+   brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
+
+   switch (location) {
+   case SYSTEM_VALUE_BASE_VERTEX:
+      reg->reg_offset = 0;
+      vs_prog_data->uses_vertexid = true;
+      break;
+   case SYSTEM_VALUE_VERTEX_ID:
+   case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE:
+      reg->reg_offset = 2;
+      vs_prog_data->uses_vertexid = true;
+      break;
+   case SYSTEM_VALUE_INSTANCE_ID:
+      reg->reg_offset = 3;
+      vs_prog_data->uses_instanceid = true;
+      break;
+   default:
+      unreachable("not reached");
+   }
+
+   return reg;
+}
+
 void
 fs_visitor::visit(ir_variable *ir)
 {
@@ -58,7 +87,11 @@ fs_visitor::visit(ir_variable *ir)
    if (ir->data.mode == ir_var_shader_in) {
       assert(ir->data.location != -1);
 
-      if (!strcmp(ir->name, "gl_FragCoord")) {
+      if (stage == MESA_SHADER_VERTEX) {
+         reg = new(this->mem_ctx)
+            fs_reg(ATTR, ir->data.location,
+                   brw_type_for_base_type(ir->type->get_scalar_type()));
+      } else if (!strcmp(ir->name, "gl_FragCoord")) {
          reg = emit_fragcoord_interpolation(ir);
       } else if (!strcmp(ir->name, "gl_FrontFacing")) {
          reg = emit_frontfacing_interpolation();
@@ -71,7 +104,19 @@
    } else if (ir->data.mode == ir_var_shader_out) {
       reg = new(this->mem_ctx) fs_reg(this, ir->type);
 
-      if (ir->data.index > 0) {
+      if (stage == MESA_SHADER_VERTEX) {
+         int vector_elements =
+            ir->type->is_array() ? ir->type->fields.array->vector_elements
+                                 : ir->type->vector_elements;
+
+         for (int i = 0; i < (type_size(ir->type) + 3) / 4; i++) {
+            int output = ir->data.location + i;
+            this->outputs[output] = *reg;
+            this->outputs[output].reg_offset = i * 4;
+            this->output_components[output] = vector_elements;
+         }
+
+      } else if (ir->data.index > 0) {
          assert(ir->data.location == FRAG_RESULT_DATA0);
          assert(ir->data.index == 1);
          this->dual_src_output = *reg;
@@ -135,15 +180,26 @@
       reg->type = brw_type_for_base_type(ir->type);
 
    } else if (ir->data.mode == ir_var_system_value) {
-      if (ir->data.location == SYSTEM_VALUE_SAMPLE_POS) {
+      switch (ir->data.location) {
+      case SYSTEM_VALUE_BASE_VERTEX:
+      case SYSTEM_VALUE_VERTEX_ID:
+      case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE:
+      case SYSTEM_VALUE_INSTANCE_ID:
+         reg = emit_vs_system_value(brw_type_for_base_type(ir->type),
+                                    ir->data.location);
+         break;
+      case SYSTEM_VALUE_SAMPLE_POS:
          reg = emit_samplepos_setup();
-      } else if (ir->data.location == SYSTEM_VALUE_SAMPLE_ID) {
+         break;
+      case SYSTEM_VALUE_SAMPLE_ID:
          reg = emit_sampleid_setup();
-      } else if (ir->data.location == SYSTEM_VALUE_SAMPLE_MASK_IN) {
+         break;
+      case SYSTEM_VALUE_SAMPLE_MASK_IN:
          assert(brw->gen >= 7);
          reg = new(mem_ctx)
            fs_reg(retype(brw_vec8_grf(payload.sample_mask_in_reg, 0),
                          BRW_REGISTER_TYPE_D));
+         break;
       }
    }
 
@@ -1770,6 +1826,8 @@ get_tex(gl_shader_stage stage, const void *key)
    switch (stage) {
    case MESA_SHADER_FRAGMENT:
       return &((brw_wm_prog_key*) key)->tex;
+   case MESA_SHADER_VERTEX:
+      return &((brw_vue_prog_key*) key)->tex;
    default:
      unreachable("unhandled shader stage");
    }
@@ -3448,6 +3506,236 @@ fs_visitor::emit_fb_writes()
    this->current_annotation = NULL;
 }
 
+void
+fs_visitor::setup_uniform_clipplane_values()
+{
+   gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
+   const struct brw_vue_prog_key *key =
+      (const struct brw_vue_prog_key *) this->key;
+
+   for (int i = 0; i < key->nr_userclip_plane_consts; i++) {
+      this->userplane[i] = fs_reg(UNIFORM, uniforms);
+      for (int j = 0; j < 4; ++j) {
+         stage_prog_data->param[uniforms + j] =
+            (gl_constant_value *) &clip_planes[i][j];
+      }
+      uniforms += 4;
+   }
+}
+
+void fs_visitor::compute_clip_distance()
+{
+   struct brw_vue_prog_data *vue_prog_data =
+      (struct brw_vue_prog_data *) prog_data;
+   const struct brw_vue_prog_key *key =
+      (const struct brw_vue_prog_key *) this->key;
+
+   /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
+    *
+    *     "If a linked set of shaders forming the vertex stage contains no
+    *     static write to gl_ClipVertex or gl_ClipDistance, but the
+    *     application has requested clipping against user clip planes through
+    *     the API, then the coordinate written to gl_Position is used for
+    *     comparison against the user clip planes."
+    *
+    * This function is only called if the shader didn't write to
+    * gl_ClipDistance.  Accordingly, we use gl_ClipVertex to perform clipping
+    * if the user wrote to it; otherwise we use gl_Position.
+    */
+
+   gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
+   if (!(vue_prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX))
+      clip_vertex = VARYING_SLOT_POS;
+
+   /* If the clip vertex isn't written, skip this.  Typically this means
+    * the GS will set up clipping.
+    */
+   if (outputs[clip_vertex].file == BAD_FILE)
+      return;
+
+   setup_uniform_clipplane_values();
+
+   current_annotation = "user clip distances";
+
+   this->outputs[VARYING_SLOT_CLIP_DIST0] = fs_reg(this, glsl_type::vec4_type);
+   this->outputs[VARYING_SLOT_CLIP_DIST1] = fs_reg(this, glsl_type::vec4_type);
+
+   for (int i = 0; i < key->nr_userclip_plane_consts; i++) {
+      fs_reg u = userplane[i];
+      fs_reg output = outputs[VARYING_SLOT_CLIP_DIST0 + i / 4];
+      output.reg_offset = i & 3;
+
+      emit(MUL(output, outputs[clip_vertex], u));
+      for (int j = 1; j < 4; j++) {
+         u.reg = userplane[i].reg + j;
+         emit(MAD(output, output, offset(outputs[clip_vertex], j), u));
+      }
+   }
+}
+
+void
+fs_visitor::emit_urb_writes()
+{
+   int slot, urb_offset, length;
+   struct brw_vs_prog_data *vs_prog_data =
+      (struct brw_vs_prog_data *) prog_data;
+   const struct brw_vs_prog_key *key =
+      (const struct brw_vs_prog_key *) this->key;
+   const GLbitfield64 psiz_mask =
+      VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT | VARYING_BIT_PSIZ;
+   const struct brw_vue_map *vue_map = &vs_prog_data->base.vue_map;
+   bool flush;
+   fs_reg sources[8];
+
+   /* Lower legacy ff and ClipVertex clipping to clip distances */
+   if (key->base.userclip_active && !prog->UsesClipDistanceOut)
+      compute_clip_distance();
+
+   /* If we don't have any valid slots to write, just do a minimal urb write
+    * send to terminate the shader. */
+   if (vue_map->slots_valid == 0) {
+
+      fs_reg payload = fs_reg(GRF, virtual_grf_alloc(1), BRW_REGISTER_TYPE_UD);
+      fs_inst *inst = emit(MOV(payload, fs_reg(retype(brw_vec8_grf(1, 0),
+                                                      BRW_REGISTER_TYPE_UD))));
+      inst->force_writemask_all = true;
+
+      inst = emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
+      inst->eot = true;
+      inst->mlen = 1;
+      inst->offset = 1;
+      return;
+   }
+
+   length = 0;
+   urb_offset = 0;
+   flush = false;
+   for (slot = 0; slot < vue_map->num_slots; slot++) {
+      fs_reg reg, src, zero;
+
+      int varying = vue_map->slot_to_varying[slot];
+      switch (varying) {
+      case VARYING_SLOT_PSIZ:
+
+         /* The point size varying slot is the vue header and is always in the
+          * vue map.  But often none of the special varyings that live there
+          * are written and in that case we can skip writing to the vue
+          * header, provided the corresponding state properly clamps the
+          * values further down the pipeline. */
+         if ((vue_map->slots_valid & psiz_mask) == 0) {
+            assert(length == 0);
+            urb_offset++;
+            break;
+         }
+
+         zero = fs_reg(GRF, virtual_grf_alloc(1), BRW_REGISTER_TYPE_UD);
+         emit(MOV(zero, fs_reg(0u)));
+
+         sources[length++] = zero;
+         if (vue_map->slots_valid & VARYING_BIT_LAYER)
+            sources[length++] = this->outputs[VARYING_SLOT_LAYER];
+         else
+            sources[length++] = zero;
+
+         if (vue_map->slots_valid & VARYING_BIT_VIEWPORT)
+            sources[length++] = this->outputs[VARYING_SLOT_VIEWPORT];
+         else
+            sources[length++] = zero;
+
+         if (vue_map->slots_valid & VARYING_BIT_PSIZ)
+            sources[length++] = this->outputs[VARYING_SLOT_PSIZ];
+         else
+            sources[length++] = zero;
+         break;
+
+      case BRW_VARYING_SLOT_NDC:
+      case VARYING_SLOT_EDGE:
+         unreachable("unexpected scalar vs output");
+         break;
+
+      case BRW_VARYING_SLOT_PAD:
+         break;
+
+      default:
+         /* gl_Position is always in the vue map, but isn't always written by
+          * the shader.  Other varyings (clip distances) get added to the vue
+          * map but don't always get written.  In those cases, the
+          * corresponding this->output[] slot will be invalid and we can skip
+          * the urb write for the varying.  If we've already queued up a vue
+          * slot for writing we flush a mlen 5 urb write, otherwise we just
+          * advance the urb_offset.
+          */
+         if (this->outputs[varying].file == BAD_FILE) {
+            if (length > 0)
+               flush = true;
+            else
+               urb_offset++;
+            break;
+         }
+
+         if ((varying == VARYING_SLOT_COL0 ||
+              varying == VARYING_SLOT_COL1 ||
+              varying == VARYING_SLOT_BFC0 ||
+              varying == VARYING_SLOT_BFC1) &&
+             key->clamp_vertex_color) {
+            /* We need to clamp these guys, so do a saturating MOV into a
+             * temp register and use that for the payload.
+             */
+            for (int i = 0; i < 4; i++) {
+               reg = fs_reg(GRF, virtual_grf_alloc(1), outputs[varying].type);
+               src = offset(this->outputs[varying], i);
+               fs_inst *inst = emit(MOV(reg, src));
+               inst->saturate = true;
+               sources[length++] = reg;
+            }
+         } else {
+            for (int i = 0; i < 4; i++)
+               sources[length++] = offset(this->outputs[varying], i);
+         }
+         break;
+      }
+
+      current_annotation = "URB write";
+
+      /* If we've queued up 8 registers of payload (2 VUE slots), if this is
+       * the last slot or if we need to flush (see BAD_FILE varying case
+       * above), emit a URB write send now to flush out the data.
+       */
+      int last = slot == vue_map->num_slots - 1;
+      if (length == 8 || last)
+         flush = true;
+      if (flush) {
+         if (last && (INTEL_DEBUG & DEBUG_SHADER_TIME))
+            emit_shader_time_end();
+
+         fs_reg *payload_sources = ralloc_array(mem_ctx, fs_reg, length + 1);
+         fs_reg payload = fs_reg(GRF, virtual_grf_alloc(length + 1),
+                                 BRW_REGISTER_TYPE_F);
+
+         /* We need WE_all on the MOV for the message header (the URB handles)
+          * so do a MOV to a dummy register and set force_writemask_all on the
+          * MOV.  LOAD_PAYLOAD will preserve that.
+          */
+         fs_reg dummy = fs_reg(GRF, virtual_grf_alloc(1),
+                               BRW_REGISTER_TYPE_UD);
+         fs_inst *inst = emit(MOV(dummy, fs_reg(retype(brw_vec8_grf(1, 0),
+                                                       BRW_REGISTER_TYPE_UD))));
+         inst->force_writemask_all = true;
+         payload_sources[0] = dummy;
+
+         memcpy(&payload_sources[1], sources, length * sizeof sources[0]);
+         emit(LOAD_PAYLOAD(payload, payload_sources, length + 1));
+
+         inst = emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
+         inst->eot = last;
+         inst->mlen = length + 1;
+         inst->offset = urb_offset;
+         urb_offset = slot + 1;
+         length = 0;
+         flush = false;
+      }
+   }
+}
+
 void
 fs_visitor::resolve_ud_negate(fs_reg *reg)
 {
@@ -3500,6 +3788,25 @@ fs_visitor::fs_visitor(struct brw_context *brw,
    init();
 }
 
+fs_visitor::fs_visitor(struct brw_context *brw,
+                       void *mem_ctx,
+                       const struct brw_vs_prog_key *key,
+                       struct brw_vs_prog_data *prog_data,
+                       struct gl_shader_program *shader_prog,
+                       struct gl_vertex_program *cp,
+                       unsigned dispatch_width)
+   : backend_visitor(brw, shader_prog, &cp->Base, &prog_data->base.base,
+                     MESA_SHADER_VERTEX),
+     reg_null_f(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_F)),
+     reg_null_d(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_D)),
+     reg_null_ud(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_UD)),
+     key(key), prog_data(&prog_data->base.base),
+     dispatch_width(dispatch_width)
+{
+   this->mem_ctx = mem_ctx;
+   init();
+}
+
 void
 fs_visitor::init()
 {
-- 
2.30.2
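
Reviewer note: this patch only adds the scalar-VS entry points (the new
fs_visitor constructor, run_vs(), emit_urb_writes() and friends); nothing in
the driver calls them yet. For context, a VS compile path would presumably
drive the new code roughly as in the sketch below. This is a hypothetical
illustration, not part of the patch: the variables brw, mem_ctx, key,
prog_data, shader_prog and vp stand for whatever the caller already has on
hand, and the failure handling is modeled on the existing SIMD8 FS path.

   /* Hypothetical caller sketch: try the scalar (SIMD8) vertex shader
    * backend added by this patch, reporting failures the same way the FS
    * path does and letting the caller fall back to the vec4 backend.
    */
   fs_visitor v(brw, mem_ctx, key, prog_data, shader_prog, vp, 8);
   if (!v.run_vs()) {
      if (shader_prog) {
         shader_prog->LinkStatus = GL_FALSE;
         ralloc_strcat(&shader_prog->InfoLog, v.fail_msg);
      }
      return NULL;   /* caller falls back to the vec4 backend */
   }
   /* ...SIMD8 code generation from v.cfg would follow here... */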