i965: Allocate URB space for HS and DS stages when required.

author Chris Forbes <chrisf@ijw.co.nz>

Tue, 9 Sep 2014 09:30:48 +0000 (21:30 +1200)

committer Kenneth Graunke <kenneth@whitecape.org>

Tue, 15 Dec 2015 10:16:14 +0000 (02:16 -0800)
author Chris Forbes <chrisf@ijw.co.nz>
Tue, 9 Sep 2014 09:30:48 +0000 (21:30 +1200)
committer Kenneth Graunke <kenneth@whitecape.org>
Tue, 15 Dec 2015 10:16:14 +0000 (02:16 -0800)
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h

index 1cc4c7b1282423cc506fe223dd2e90df7f4826a1..69bc04ceb0800fe5d77e6b3612b04ea40b853a02 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -1008,6 +1008,8 @@ struct brw_context
     struct {
        GLuint vsize;            /* vertex size plus header in urb registers */
        GLuint gsize;            /* GS output size in urb registers */
+      GLuint hsize;             /* Tessellation control output size in urb registers */
+      GLuint dsize;             /* Tessellation evaluation output size in urb registers */
        GLuint csize;            /* constant buffer size in urb registers */
        GLuint sfsize;           /* setup data size in urb registers */
  
@@ -1020,12 +1022,16 @@ struct brw_context
        GLuint max_gs_entries;   /* Maximum number of GS entries */
  
        GLuint nr_vs_entries;
+      GLuint nr_hs_entries;
+      GLuint nr_ds_entries;
        GLuint nr_gs_entries;
        GLuint nr_clip_entries;
        GLuint nr_sf_entries;
        GLuint nr_cs_entries;
  
        GLuint vs_start;
+      GLuint hs_start;
+      GLuint ds_start;
        GLuint gs_start;
        GLuint clip_start;
        GLuint sf_start;
@@ -1042,6 +1048,11 @@ struct brw_context
         * URB space for the GS.
         */
        bool gs_present;
+
+      /* True if the most recently sent _3DSTATE_URB message allocated
+       * URB space for the HS and DS.
+       */
+      bool tess_present;
     } urb;
  
  
@@ -1648,12 +1659,18 @@ void gen8_emit_3dstate_sample_pattern(struct brw_context *brw);
  /* gen7_urb.c */
  void
  gen7_emit_push_constant_state(struct brw_context *brw, unsigned vs_size,
+                              unsigned hs_size, unsigned ds_size,
                                unsigned gs_size, unsigned fs_size);
  
  void
  gen7_emit_urb_state(struct brw_context *brw,
-                    unsigned nr_vs_entries, unsigned vs_size,
-                    unsigned vs_start, unsigned nr_gs_entries,
+                    unsigned nr_vs_entries,
+                    unsigned vs_size, unsigned vs_start,
+                    unsigned nr_hs_entries,
+                    unsigned hs_size, unsigned hs_start,
+                    unsigned nr_ds_entries,
+                    unsigned ds_size, unsigned ds_start,
+                    unsigned nr_gs_entries,
                      unsigned gs_size, unsigned gs_start);
  
  
diff --git a/src/mesa/drivers/dri/i965/gen7_blorp.cpp b/src/mesa/drivers/dri/i965/gen7_blorp.cpp

index e87b9d1657fe03bb49628b4e28fb4029a97c5512..89b73ca7519d7661b551165929b16f943ca2c742 100644 (file)
--- a/src/mesa/drivers/dri/i965/gen7_blorp.cpp
+++ b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
@@ -50,6 +50,8 @@ gen7_blorp_emit_urb_config(struct brw_context *brw)
     unsigned urb_size = (brw->is_haswell && brw->gt == 3) ? 32 : 16;
     gen7_emit_push_constant_state(brw,
                                   urb_size / 2 /* vs_size */,
+                                 0 /* hs_size */,
+                                 0 /* ds_size */,
                                   0 /* gs_size */,
                                   urb_size / 2 /* fs_size */);
  
@@ -60,6 +62,12 @@ gen7_blorp_emit_urb_config(struct brw_context *brw)
                         32 /* num_vs_entries */,
                         2 /* vs_size */,
                         2 /* vs_start */,
+                       0 /* num_hs_entries */,
+                       1 /* hs_size */,
+                       2 /* hs_start */,
+                       0 /* num_ds_entries */,
+                       1 /* ds_size */,
+                       2 /* ds_start */,
                         0 /* num_gs_entries */,
                         1 /* gs_size */,
                         2 /* gs_start */);
diff --git a/src/mesa/drivers/dri/i965/gen7_urb.c b/src/mesa/drivers/dri/i965/gen7_urb.c

index 421512b2fc8d9f44595a1128faf9bc3a6f60f97f..f2b6ec3de058ca4adb5a3e560bc9a3f81f99e513 100644 (file)
--- a/src/mesa/drivers/dri/i965/gen7_urb.c
+++ b/src/mesa/drivers/dri/i965/gen7_urb.c
@@ -34,7 +34,7 @@
   *   __________-__________   _________________-_________________
   *  /                     \ /                                   \
   * +-------------------------------------------------------------+
- * |     VS/FS/GS Push     |              VS/GS URB              |
+ * |  VS/HS/DS/GS/FS Push  |           VS/HS/DS/GS URB           |
   * |       Constants       |               Entries               |
   * +-------------------------------------------------------------+
   *
@@ -60,27 +60,32 @@
  static void
  gen7_allocate_push_constants(struct brw_context *brw)
  {
+   /* BRW_NEW_GEOMETRY_PROGRAM */
+   bool gs_present = brw->geometry_program;
+
+   /* BRW_NEW_TESS_CTRL_PROGRAM, BRW_NEW_TESS_EVAL_PROGRAM */
+   bool tess_present = brw->tess_eval_program;
+
     unsigned avail_size = 16;
     unsigned multiplier =
        (brw->gen >= 8 || (brw->is_haswell && brw->gt == 3)) ? 2 : 1;
  
-   /* BRW_NEW_GEOMETRY_PROGRAM */
-   bool gs_present = brw->geometry_program;
+   int stages = 2 + gs_present + 2 * tess_present;
  
-   unsigned vs_size, gs_size;
-   if (gs_present) {
-      vs_size = avail_size / 3;
-      avail_size -= vs_size;
-      gs_size = avail_size / 2;
-      avail_size -= gs_size;
-   } else {
-      vs_size = avail_size / 2;
-      avail_size -= vs_size;
-      gs_size = 0;
-   }
-   unsigned fs_size = avail_size;
+   /* Divide up the available space equally between stages.  Because we
+    * round down (using floor division), there may be some left over
+    * space.  We allocate that to the pixel shader stage.
+    */
+   unsigned size_per_stage = avail_size / stages;
+
+   unsigned vs_size = size_per_stage;
+   unsigned hs_size = tess_present ? size_per_stage : 0;
+   unsigned ds_size = tess_present ? size_per_stage : 0;
+   unsigned gs_size = gs_present ? size_per_stage : 0;
+   unsigned fs_size = avail_size - size_per_stage * (stages - 1);
  
     gen7_emit_push_constant_state(brw, multiplier * vs_size,
+                                 multiplier * hs_size, multiplier * ds_size,
                                   multiplier * gs_size, multiplier * fs_size);
  
     /* From p115 of the Ivy Bridge PRM (3.2.1.4 3DSTATE_PUSH_CONSTANT_ALLOC_VS):
@@ -99,15 +104,24 @@ gen7_allocate_push_constants(struct brw_context *brw)
  
  void
  gen7_emit_push_constant_state(struct brw_context *brw, unsigned vs_size,
+                              unsigned hs_size, unsigned ds_size,
                                unsigned gs_size, unsigned fs_size)
  {
     unsigned offset = 0;
  
-   BEGIN_BATCH(6);
+   BEGIN_BATCH(10);
     OUT_BATCH(_3DSTATE_PUSH_CONSTANT_ALLOC_VS << 16 | (2 - 2));
     OUT_BATCH(vs_size | offset << GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT);
     offset += vs_size;
  
+   OUT_BATCH(_3DSTATE_PUSH_CONSTANT_ALLOC_HS << 16 | (2 - 2));
+   OUT_BATCH(hs_size | offset << GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT);
+   offset += hs_size;
+
+   OUT_BATCH(_3DSTATE_PUSH_CONSTANT_ALLOC_DS << 16 | (2 - 2));
+   OUT_BATCH(ds_size | offset << GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT);
+   offset += ds_size;
+
     OUT_BATCH(_3DSTATE_PUSH_CONSTANT_ALLOC_GS << 16 | (2 - 2));
     OUT_BATCH(gs_size | offset << GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT);
     offset += gs_size;
@@ -130,7 +144,10 @@ gen7_emit_push_constant_state(struct brw_context *brw, unsigned vs_size,
  const struct brw_tracked_state gen7_push_constant_space = {
     .dirty = {
        .mesa = 0,
-      .brw = BRW_NEW_CONTEXT | BRW_NEW_GEOMETRY_PROGRAM,
+      .brw = BRW_NEW_CONTEXT |
+             BRW_NEW_GEOMETRY_PROGRAM |
+             BRW_NEW_TESS_CTRL_PROGRAM |
+             BRW_NEW_TESS_EVAL_PROGRAM,
     },
     .emit = gen7_allocate_push_constants,
  };
@@ -138,6 +155,7 @@ const struct brw_tracked_state gen7_push_constant_space = {
  static void
  gen7_upload_urb(struct brw_context *brw)
  {
+   const struct brw_device_info *devinfo = brw->intelScreen->devinfo;
     const int push_size_kB =
        (brw->gen >= 8 || (brw->is_haswell && brw->gt == 3)) ? 32 : 16;
  
@@ -149,6 +167,16 @@ gen7_upload_urb(struct brw_context *brw)
     unsigned gs_size = gs_present ? brw->gs.prog_data->base.urb_entry_size : 1;
     unsigned gs_entry_size_bytes = gs_size * 64;
  
+   /* BRW_NEW_TESS_CTRL_PROGRAM, BRW_NEW_TESS_EVAL_PROGRAM */
+   const bool tess_present = brw->tess_eval_program;
+   assert(!tess_present || brw->tess_ctrl_program);
+   /* BRW_NEW_TCS_PROG_DATA */
+   unsigned hs_size = tess_present ? brw->tcs.prog_data->base.urb_entry_size : 1;
+   unsigned hs_entry_size_bytes = hs_size * 64;
+   /* BRW_NEW_TES_PROG_DATA */
+   unsigned ds_size = tess_present ? brw->tes.prog_data->base.urb_entry_size : 1;
+   unsigned ds_entry_size_bytes = ds_size * 64;
+
     /* If we're just switching between programs with the same URB requirements,
      * skip the rest of the logic.
      */
@@ -156,21 +184,29 @@ gen7_upload_urb(struct brw_context *brw)
         !(brw->ctx.NewDriverState & BRW_NEW_URB_SIZE) &&
         brw->urb.vsize == vs_size &&
         brw->urb.gs_present == gs_present &&
-       brw->urb.gsize == gs_size) {
+       brw->urb.gsize == gs_size &&
+       brw->urb.tess_present == tess_present &&
+       brw->urb.hsize == hs_size &&
+       brw->urb.dsize == ds_size) {
        return;
     }
     brw->urb.vsize = vs_size;
     brw->urb.gs_present = gs_present;
     brw->urb.gsize = gs_size;
+   brw->urb.tess_present = tess_present;
+   brw->urb.hsize = hs_size;
+   brw->urb.dsize = ds_size;
  
     /* From p35 of the Ivy Bridge PRM (section 1.7.1: 3DSTATE_URB_GS):
      *
      *     VS Number of URB Entries must be divisible by 8 if the VS URB Entry
      *     Allocation Size is less than 9 512-bit URB entries.
      *
-    * Similar text exists for GS.
+    * Similar text exists for HS, DS and GS.
      */
     unsigned vs_granularity = (vs_size < 9) ? 8 : 1;
+   unsigned hs_granularity = (hs_size < 9) ? 8 : 1;
+   unsigned ds_granularity = (ds_size < 9) ? 8 : 1;
     unsigned gs_granularity = (gs_size < 9) ? 8 : 1;
  
     /* URB allocations must be done in 8k chunks. */
@@ -191,10 +227,17 @@ gen7_upload_urb(struct brw_context *brw)
      * additional space it could actually make use of).
      */
  
-   /* VS has a lower limit on the number of URB entries */
+   /* VS has a lower limit on the number of URB entries.
+    *
+    * From the Broadwell PRM, 3DSTATE_URB_VS instruction:
+    * "When tessellation is enabled, the VS Number of URB Entries must be
+    *  greater than or equal to 192."
+    */
+   unsigned vs_min_entries =
+      tess_present && brw->gen == 8 ? 192 : brw->urb.min_vs_entries;
+
     unsigned vs_chunks =
-      DIV_ROUND_UP(brw->urb.min_vs_entries * vs_entry_size_bytes,
-                   chunk_size_bytes);
+      DIV_ROUND_UP(vs_min_entries * vs_entry_size_bytes, chunk_size_bytes);
     unsigned vs_wants =
        DIV_ROUND_UP(brw->urb.max_vs_entries * vs_entry_size_bytes,
                     chunk_size_bytes) - vs_chunks;
@@ -216,14 +259,36 @@ gen7_upload_urb(struct brw_context *brw)
                                chunk_size_bytes) - gs_chunks;
     }
  
+   unsigned hs_chunks = 0;
+   unsigned hs_wants = 0;
+   unsigned ds_chunks = 0;
+   unsigned ds_wants = 0;
+
+   if (tess_present) {
+      hs_chunks =
+         DIV_ROUND_UP(hs_granularity * hs_entry_size_bytes,
+                      chunk_size_bytes);
+      hs_wants =
+         DIV_ROUND_UP(devinfo->urb.max_hs_entries * hs_entry_size_bytes,
+                      chunk_size_bytes) - hs_chunks;
+
+      ds_chunks =
+         DIV_ROUND_UP(devinfo->urb.min_ds_entries * ds_entry_size_bytes,
+                      chunk_size_bytes);
+      ds_wants =
+         DIV_ROUND_UP(brw->urb.max_ds_entries * ds_entry_size_bytes,
+                      chunk_size_bytes) - ds_chunks;
+   }
+
     /* There should always be enough URB space to satisfy the minimum
      * requirements of each stage.
      */
-   unsigned total_needs = push_constant_chunks + vs_chunks + gs_chunks;
+   unsigned total_needs = push_constant_chunks +
+                          vs_chunks + hs_chunks + ds_chunks + gs_chunks;
     assert(total_needs <= urb_chunks);
  
     /* Mete out remaining space (if any) in proportion to "wants". */
-   unsigned total_wants = vs_wants + gs_wants;
+   unsigned total_wants = vs_wants + hs_wants + ds_wants + gs_wants;
     unsigned remaining_space = urb_chunks - total_needs;
     if (remaining_space > total_wants)
        remaining_space = total_wants;
@@ -232,61 +297,100 @@ gen7_upload_urb(struct brw_context *brw)
           roundf(vs_wants * (((float) remaining_space) / total_wants));
        vs_chunks += vs_additional;
        remaining_space -= vs_additional;
+      total_wants -= vs_wants;
+      /* total_wants hits 0 when no later stage wants space; avoid 0/0. */
+      unsigned hs_additional = total_wants == 0 ? 0 : (unsigned)
+         round(hs_wants * (((double) remaining_space) / total_wants));
+      hs_chunks += hs_additional;
+      remaining_space -= hs_additional;
+      total_wants -= hs_wants;
+
+      unsigned ds_additional = total_wants == 0 ? 0 : (unsigned)
+         round(ds_wants * (((double) remaining_space) / total_wants));
+      ds_chunks += ds_additional;
+      remaining_space -= ds_additional;
+      total_wants -= ds_wants;
+
        gs_chunks += remaining_space;
     }
  
     /* Sanity check that we haven't over-allocated. */
-   assert(push_constant_chunks + vs_chunks + gs_chunks <= urb_chunks);
+   assert(push_constant_chunks +
+          vs_chunks + hs_chunks + ds_chunks + gs_chunks <= urb_chunks);
  
     /* Finally, compute the number of entries that can fit in the space
      * allocated to each stage.
      */
     unsigned nr_vs_entries = vs_chunks * chunk_size_bytes / vs_entry_size_bytes;
+   unsigned nr_hs_entries = hs_chunks * chunk_size_bytes / hs_entry_size_bytes;
+   unsigned nr_ds_entries = ds_chunks * chunk_size_bytes / ds_entry_size_bytes;
     unsigned nr_gs_entries = gs_chunks * chunk_size_bytes / gs_entry_size_bytes;
  
     /* Since we rounded up when computing *_wants, this may be slightly more
      * than the maximum allowed amount, so correct for that.
      */
     nr_vs_entries = MIN2(nr_vs_entries, brw->urb.max_vs_entries);
+   nr_hs_entries = MIN2(nr_hs_entries, brw->urb.max_hs_entries);
+   nr_ds_entries = MIN2(nr_ds_entries, brw->urb.max_ds_entries);
     nr_gs_entries = MIN2(nr_gs_entries, brw->urb.max_gs_entries);
  
     /* Ensure that we program a multiple of the granularity. */
     nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, vs_granularity);
+   nr_hs_entries = ROUND_DOWN_TO(nr_hs_entries, hs_granularity);
+   nr_ds_entries = ROUND_DOWN_TO(nr_ds_entries, ds_granularity);
     nr_gs_entries = ROUND_DOWN_TO(nr_gs_entries, gs_granularity);
  
     /* Finally, sanity check to make sure we have at least the minimum number
      * of entries needed for each stage.
      */
-   assert(nr_vs_entries >= brw->urb.min_vs_entries);
+   assert(nr_vs_entries >= vs_min_entries);
     if (gs_present)
        assert(nr_gs_entries >= 2);
+   if (tess_present) {
+      assert(nr_hs_entries >= 1);
+      assert(nr_ds_entries >= devinfo->urb.min_ds_entries);
+   }
  
     /* Gen7 doesn't actually use brw->urb.nr_{vs,gs}_entries, but it seems
      * better to put reasonable data in there rather than leave them
      * uninitialized.
      */
     brw->urb.nr_vs_entries = nr_vs_entries;
+   brw->urb.nr_hs_entries = nr_hs_entries;
+   brw->urb.nr_ds_entries = nr_ds_entries;
     brw->urb.nr_gs_entries = nr_gs_entries;
  
     /* Lay out the URB in the following order:
      * - push constants
      * - VS
+    * - HS
+    * - DS
      * - GS
      */
     brw->urb.vs_start = push_constant_chunks;
-   brw->urb.gs_start = push_constant_chunks + vs_chunks;
+   brw->urb.hs_start = push_constant_chunks + vs_chunks;
+   brw->urb.ds_start = push_constant_chunks + vs_chunks + hs_chunks;
+   brw->urb.gs_start = push_constant_chunks + vs_chunks + hs_chunks +
+                       ds_chunks;
  
     if (brw->gen == 7 && !brw->is_haswell && !brw->is_baytrail)
        gen7_emit_vs_workaround_flush(brw);
     gen7_emit_urb_state(brw,
                         brw->urb.nr_vs_entries, vs_size, brw->urb.vs_start,
+                       brw->urb.nr_hs_entries, hs_size, brw->urb.hs_start,
+                       brw->urb.nr_ds_entries, ds_size, brw->urb.ds_start,
                         brw->urb.nr_gs_entries, gs_size, brw->urb.gs_start);
  }
  
  void
  gen7_emit_urb_state(struct brw_context *brw,
-                    unsigned nr_vs_entries, unsigned vs_size,
-                    unsigned vs_start, unsigned nr_gs_entries,
+                    unsigned nr_vs_entries,
+                    unsigned vs_size, unsigned vs_start,
+                    unsigned nr_hs_entries,
+                    unsigned hs_size, unsigned hs_start,
+                    unsigned nr_ds_entries,
+                    unsigned ds_size, unsigned ds_start,
+                    unsigned nr_gs_entries,
                      unsigned gs_size, unsigned gs_start)
  {
     BEGIN_BATCH(8);
@@ -300,14 +404,15 @@ gen7_emit_urb_state(struct brw_context *brw,
               ((gs_size - 1) << GEN7_URB_ENTRY_SIZE_SHIFT) |
               (gs_start << GEN7_URB_STARTING_ADDRESS_SHIFT));
  
-   /* Allocate the HS and DS zero space - we don't use them. */
     OUT_BATCH(_3DSTATE_URB_HS << 16 | (2 - 2));
-   OUT_BATCH((0 << GEN7_URB_ENTRY_SIZE_SHIFT) |
-             (vs_start << GEN7_URB_STARTING_ADDRESS_SHIFT));
+   OUT_BATCH(nr_hs_entries |
+             ((hs_size - 1) << GEN7_URB_ENTRY_SIZE_SHIFT) |
+             (hs_start << GEN7_URB_STARTING_ADDRESS_SHIFT));
  
     OUT_BATCH(_3DSTATE_URB_DS << 16 | (2 - 2));
-   OUT_BATCH((0 << GEN7_URB_ENTRY_SIZE_SHIFT) |
-             (vs_start << GEN7_URB_STARTING_ADDRESS_SHIFT));
+   OUT_BATCH(nr_ds_entries |
+             ((ds_size - 1) << GEN7_URB_ENTRY_SIZE_SHIFT) |
+             (ds_start << GEN7_URB_STARTING_ADDRESS_SHIFT));
     ADVANCE_BATCH();
  }
  
@@ -317,7 +422,11 @@ const struct brw_tracked_state gen7_urb = {
        .brw = BRW_NEW_CONTEXT |
               BRW_NEW_URB_SIZE |
               BRW_NEW_GEOMETRY_PROGRAM |
+             BRW_NEW_TESS_CTRL_PROGRAM |
+             BRW_NEW_TESS_EVAL_PROGRAM |
               BRW_NEW_GS_PROG_DATA |
+             BRW_NEW_TCS_PROG_DATA |
+             BRW_NEW_TES_PROG_DATA |
               BRW_NEW_VS_PROG_DATA,
     },
     .emit = gen7_upload_urb,
author	Chris Forbes <chrisf@ijw.co.nz>
	Tue, 9 Sep 2014 09:30:48 +0000 (21:30 +1200)
committer	Kenneth Graunke <kenneth@whitecape.org>
	Tue, 15 Dec 2015 10:16:14 +0000 (02:16 -0800)
src/mesa/drivers/dri/i965/brw_context.h		patch | blob | history
src/mesa/drivers/dri/i965/gen7_blorp.cpp		patch | blob | history
src/mesa/drivers/dri/i965/gen7_urb.c		patch | blob | history