radeonsi: move PSIZE and CLIPDIST unique IO indices after GENERIC
author: Marek Olšák <marek.olsak@amd.com>
Sun, 28 May 2017 22:40:39 +0000 (00:40 +0200)
committer: Marek Olšák <marek.olsak@amd.com>
Wed, 7 Jun 2017 18:14:15 +0000 (20:14 +0200)
Heaven LDS usage for LS+HS is below. The masks are "outputs_written"
for LS and HS. Note that 32K is the maximum size.

Before:
  heaven_x64: ls=1f1 tcs=1f1, lds=32K
  heaven_x64: ls=31 tcs=31, lds=24K
  heaven_x64: ls=71 tcs=71, lds=28K

After:
  heaven_x64: ls=3f tcs=3f, lds=24K
  heaven_x64: ls=7 tcs=7, lds=13K
  heaven_x64: ls=f tcs=f, lds=17K

All other apps have a similar decrease in LDS usage, because
the "outputs_written" masks are similar. Also, most apps don't write
POSITION in these shader stages, so there is room for improvement.
(tight per-component input/output packing might help even more)

It's unknown whether this improves performance.

Tested-by: Edmondo Tommasina <edmondo.tommasina@gmail.com>
Tested-by: Dieter Nützel <Dieter@nuetzel-hh.de>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
src/gallium/drivers/radeonsi/si_shader.c
src/gallium/drivers/radeonsi/si_state_shaders.c

index 0ca07ab4ee53009f79fcfc0cdb85a9072ee65fd7..5c7deeb250ec49429fbb2abf43a5ca70dbcae98a 100644 (file)
@@ -136,18 +136,22 @@ unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
        switch (semantic_name) {
        case TGSI_SEMANTIC_POSITION:
                return 0;
-       case TGSI_SEMANTIC_PSIZE:
-               return 1;
-       case TGSI_SEMANTIC_CLIPDIST:
-               assert(index <= 1);
-               return 2 + index;
        case TGSI_SEMANTIC_GENERIC:
+               /* Some shader stages use the highest used IO index
+                * to determine the size to allocate for inputs/outputs
+                * (in LDS, tess and GS rings), so GENERIC should be placed right
+                * after POSITION to make that size as small as possible.
+                */
                if (index < SI_MAX_IO_GENERIC)
-                       return 4 + index;
+                       return 1 + index;
 
                assert(!"invalid generic index");
                return 0;
-
+       case TGSI_SEMANTIC_PSIZE:
+               return SI_MAX_IO_GENERIC + 1;
+       case TGSI_SEMANTIC_CLIPDIST:
+               assert(index <= 1);
+               return SI_MAX_IO_GENERIC + 2 + index;
        case TGSI_SEMANTIC_FOG:
                return SI_MAX_IO_GENERIC + 4;
        case TGSI_SEMANTIC_LAYER:
index 08d647b5a54e9eb1a6aa18aaad139019101a540d..41f8bdf9a544e1b10444c50c42ae246bc07d885b 100644 (file)
@@ -1233,7 +1233,9 @@ static void si_shader_selector_key_hw_vs(struct si_context *sctx,
        uint64_t outputs_written = vs->outputs_written;
        uint64_t inputs_read = 0;
 
-       outputs_written &= ~0x3; /* ignore POSITION, PSIZE */
+       /* ignore POSITION, PSIZE */
+       outputs_written &= ~((1ull << si_shader_io_get_unique_index(TGSI_SEMANTIC_POSITION, 0) |
+                            (1ull << si_shader_io_get_unique_index(TGSI_SEMANTIC_PSIZE, 0))));
 
        if (!ps_disabled) {
                inputs_read = ps->inputs_read;