i965: Don't re-layout varyings for separate shader programs.

author Kenneth Graunke <kenneth@whitecape.org>

Wed, 9 Sep 2015 23:21:56 +0000 (16:21 -0700)

committer Kenneth Graunke <kenneth@whitecape.org>

Sat, 26 Sep 2015 18:59:56 +0000 (11:59 -0700)
author Kenneth Graunke <kenneth@whitecape.org>
Wed, 9 Sep 2015 23:21:56 +0000 (16:21 -0700)
committer Kenneth Graunke <kenneth@whitecape.org>
Sat, 26 Sep 2015 18:59:56 +0000 (11:59 -0700)
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h

index 144d3e327d43b134d72ac181f66185589d22f420..a7b612ad545d58f0840a7d363aef4d8ecb3dae59 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -540,6 +540,17 @@ struct brw_vue_map {
      */
     GLbitfield64 slots_valid;
  
+   /**
+    * Is this VUE map for a separate shader pipeline?
+    *
+    * Separable programs (GL_ARB_separate_shader_objects) can be mixed and matched
+    * without the linker having a chance to dead code eliminate unused varyings.
+    *
+    * This means that we have to use a fixed slot layout, based on the output's
+    * location field, rather than assigning slots in a compact contiguous block.
+    */
+   bool separate;
+
     /**
      * Map from gl_varying_slot value to VUE slot.  For gl_varying_slots that are
      * not stored in a slot (because they are not written, or because
@@ -585,7 +596,8 @@ static inline GLuint brw_varying_to_offset(struct brw_vue_map *vue_map,
  
  void brw_compute_vue_map(const struct brw_device_info *devinfo,
                           struct brw_vue_map *vue_map,
-                         GLbitfield64 slots_valid);
+                         GLbitfield64 slots_valid,
+                         bool separate_shader);
  
  
  /**
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp

index a8f5520fb942bd14e985ee0925e4f4c48641309e..49dc7f65b48e53472fe462e828334a81be956f89 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -1440,7 +1440,8 @@ fs_visitor::calculate_urb_setup()
            */
           struct brw_vue_map prev_stage_vue_map;
           brw_compute_vue_map(devinfo, &prev_stage_vue_map,
-                             key->input_slots_valid);
+                             key->input_slots_valid,
+                             shader_prog->SeparateShader);
           int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
           assert(prev_stage_vue_map.num_slots <= first_slot + 32);
           for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c

index 16ea68462858968e072ebdce0828351b4af3a937..38b3e3a5cd98da115415b26f52612acc794059ce 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_gs.c
+++ b/src/mesa/drivers/dri/i965/brw_gs.c
@@ -120,7 +120,8 @@ brw_codegen_gs_prog(struct brw_context *brw,
     GLbitfield64 outputs_written = gp->program.Base.OutputsWritten;
  
     brw_compute_vue_map(brw->intelScreen->devinfo,
-                       &c.prog_data.base.vue_map, outputs_written);
+                       &c.prog_data.base.vue_map, outputs_written,
+                       prog ? prog->SeparateShader : false);
  
     /* Compute the output vertex size.
      *
@@ -243,7 +244,8 @@ brw_codegen_gs_prog(struct brw_context *brw,
        get_hw_prim_for_gl_prim(gp->program.OutputType);
  
     brw_compute_vue_map(brw->intelScreen->devinfo,
-                       &c.input_vue_map, c.key.input_varyings);
+                       &c.input_vue_map, c.key.input_varyings,
+                       prog->SeparateShader);
  
     /* GS inputs are read from the VUE 256 bits (2 vec4's) at a time, so we
      * need to program a URB read length of ceiling(num_slots / 2).
@@ -357,7 +359,9 @@ brw_upload_gs_prog(struct brw_context *brw)
     brw->gs.base.prog_data = &brw->gs.prog_data->base.base;
  
     if (brw->gs.prog_data->base.vue_map.slots_valid !=
-       brw->vue_map_geom_out.slots_valid) {
+       brw->vue_map_geom_out.slots_valid ||
+       brw->gs.prog_data->base.vue_map.separate !=
+       brw->vue_map_geom_out.separate) {
        brw->vue_map_geom_out = brw->gs.prog_data->base.vue_map;
        brw->ctx.NewDriverState |= BRW_NEW_VUE_MAP_GEOM_OUT;
     }
diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c

index 465e78f4c747524db14b710a37ad24e2bdebd421..b1ec9637c323887c4f09edf99dd8316047381502 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_vs.c
+++ b/src/mesa/drivers/dri/i965/brw_vs.c
@@ -180,7 +180,8 @@ brw_codegen_vs_prog(struct brw_context *brw,
     }
  
     brw_compute_vue_map(brw->intelScreen->devinfo,
-                       &prog_data.base.vue_map, outputs_written);
+                       &prog_data.base.vue_map, outputs_written,
+                       prog ? prog->SeparateShader : false);
  
     if (0) {
        _mesa_fprint_program_opt(stderr, &vp->program.Base, PROG_PRINT_DEBUG,
@@ -388,7 +389,9 @@ brw_upload_vs_prog(struct brw_context *brw)
     brw->vs.base.prog_data = &brw->vs.prog_data->base.base;
  
     if (brw->vs.prog_data->base.vue_map.slots_valid !=
-       brw->vue_map_geom_out.slots_valid) {
+       brw->vue_map_geom_out.slots_valid ||
+       brw->vs.prog_data->base.vue_map.separate !=
+       brw->vue_map_geom_out.separate) {
        brw->vue_map_vs = brw->vs.prog_data->base.vue_map;
        brw->ctx.NewDriverState |= BRW_NEW_VUE_MAP_VS;
        if (brw->gen < 6) {
diff --git a/src/mesa/drivers/dri/i965/brw_vue_map.c b/src/mesa/drivers/dri/i965/brw_vue_map.c

index 1ef52143cc540f9d79595d44a5f6ed2e74106392..45662bd5afcdeeeffc9bfa8a3cb43a48e6ddb6fe 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_vue_map.c
+++ b/src/mesa/drivers/dri/i965/brw_vue_map.c
@@ -59,10 +59,18 @@ assign_vue_slot(struct brw_vue_map *vue_map, int varying, int slot)
  void
  brw_compute_vue_map(const struct brw_device_info *devinfo,
                      struct brw_vue_map *vue_map,
-                    GLbitfield64 slots_valid)
+                    GLbitfield64 slots_valid,
+                    bool separate)
  {
+   /* Keep using the packed/contiguous layout on old hardware - we only need
+    * the SSO layout when using geometry/tessellation shaders or 32 FS input
+    * varyings, which only exist on Gen >= 6.  It's also a bit more efficient.
+    */
+   if (devinfo->gen < 6)
+      separate = false;
+
     vue_map->slots_valid = slots_valid;
-   int i;
+   vue_map->separate = separate;
  
     /* gl_Layer and gl_ViewportIndex don't get their own varying slots -- they
      * are stored in the first VUE slot (VARYING_SLOT_PSIZ).
@@ -77,7 +85,7 @@ brw_compute_vue_map(const struct brw_device_info *devinfo,
      */
     STATIC_ASSERT(BRW_VARYING_SLOT_COUNT <= 127);
  
-   for (i = 0; i < BRW_VARYING_SLOT_COUNT; ++i) {
+   for (int i = 0; i < BRW_VARYING_SLOT_COUNT; ++i) {
        vue_map->varying_to_slot[i] = -1;
        vue_map->slot_to_varying[i] = BRW_VARYING_SLOT_PAD;
     }
@@ -131,21 +139,42 @@ brw_compute_vue_map(const struct brw_device_info *devinfo,
           assign_vue_slot(vue_map, VARYING_SLOT_BFC1, slot++);
     }
  
-   /* The hardware doesn't care about the rest of the vertex outputs, so just
-    * assign them contiguously.  Don't reassign outputs that already have a
-    * slot.
+   /* The hardware doesn't care about the rest of the vertex outputs, so we
+    * can assign them however we like.  For normal programs, we simply assign
+    * them contiguously.
+    *
+    * For separate shader pipelines, we first assign built-in varyings
+    * contiguous slots.  This works because ARB_separate_shader_objects
+    * requires that all shaders have matching built-in varying interface
+    * blocks.  Next, we assign generic varyings based on their location
+    * (either explicit or linker assigned).  This guarantees a fixed layout.
      *
      * We generally don't need to assign a slot for VARYING_SLOT_CLIP_VERTEX,
      * since it's encoded as the clip distances by emit_clip_distances().
      * However, it may be output by transform feedback, and we'd rather not
      * recompute state when TF changes, so we just always include it.
      */
-   for (int i = 0; i < VARYING_SLOT_MAX; ++i) {
-      if ((slots_valid & BITFIELD64_BIT(i)) &&
-          vue_map->varying_to_slot[i] == -1) {
-         assign_vue_slot(vue_map, i, slot++);
+   GLbitfield64 builtins = slots_valid & BITFIELD64_MASK(VARYING_SLOT_VAR0);
+   while (builtins != 0) {
+      const int varying = ffsll(builtins) - 1;
+      if (vue_map->varying_to_slot[varying] == -1) {
+         assign_vue_slot(vue_map, varying, slot++);
+      }
+      builtins &= ~BITFIELD64_BIT(varying);
+   }
+
+   const int first_generic_slot = slot;
+   GLbitfield64 generics = slots_valid & ~BITFIELD64_MASK(VARYING_SLOT_VAR0);
+   while (generics != 0) {
+      const int varying = ffsll(generics) - 1;
+      if (separate) {
+         slot = first_generic_slot + varying - VARYING_SLOT_VAR0;
+         assign_vue_slot(vue_map, varying, slot);
+      } else {
+         assign_vue_slot(vue_map, varying, slot++);
        }
+      generics &= ~BITFIELD64_BIT(varying);
     }
  
-   vue_map->num_slots = slot;
+   vue_map->num_slots = separate ? slot + 1 : slot;
  }
author	Kenneth Graunke <kenneth@whitecape.org>
	Wed, 9 Sep 2015 23:21:56 +0000 (16:21 -0700)
committer	Kenneth Graunke <kenneth@whitecape.org>
	Sat, 26 Sep 2015 18:59:56 +0000 (11:59 -0700)
src/mesa/drivers/dri/i965/brw_context.h		patch \| blob \| history
src/mesa/drivers/dri/i965/brw_fs.cpp		patch \| blob \| history
src/mesa/drivers/dri/i965/brw_gs.c		patch \| blob \| history
src/mesa/drivers/dri/i965/brw_vs.c		patch \| blob \| history
src/mesa/drivers/dri/i965/brw_vue_map.c		patch \| blob \| history