i965: Add support for instruction compaction.
authorEric Anholt <eric@anholt.net>
Wed, 1 Feb 2012 00:55:20 +0000 (16:55 -0800)
committerEric Anholt <eric@anholt.net>
Mon, 17 Sep 2012 19:32:52 +0000 (12:32 -0700)
This reduces program size by using some smaller encodings for common bit
patterns in the Gen ISA, with the hope of making programs fit in the
instruction cache better.

v2: Use larger bitshifts for the uncompressed field setups, in line with the
    way it's described in the spec.  Consistently name a brw_compile "p" like
    all other code.  Add a couple more tests.  Consistently call things
    "compacted" not "compressed" (which is a different feature).  Drop the
    explicit check for not compacting SENDs, which is unjustified and already
    implied by our lack of support for immediate values.

Reviewed-by: Paul Berry <stereotype441@gmail.com>
src/mesa/drivers/dri/i965/.gitignore
src/mesa/drivers/dri/i965/Makefile.am
src/mesa/drivers/dri/i965/Makefile.sources
src/mesa/drivers/dri/i965/brw_eu.c
src/mesa/drivers/dri/i965/brw_eu.h
src/mesa/drivers/dri/i965/brw_eu_compact.c [new file with mode: 0644]
src/mesa/drivers/dri/i965/brw_eu_debug.c
src/mesa/drivers/dri/i965/brw_fs.cpp
src/mesa/drivers/dri/i965/brw_structs.h
src/mesa/drivers/dri/i965/test_eu_compact.c [new file with mode: 0644]

index fe4578e39d40800862f735f2be5355c88f901249..c6ea403f5db342f017b8c09eea681216b65e52e9 100644 (file)
@@ -1,3 +1,4 @@
 Makefile
 i965_symbols_test
 libi965_dri.la
+test_eu_compact
index 0ac3de75ca676c9f5084c60b802cbb7e78d94797..5bb62c49efde837a9499dcc5a056d659cb49e8e2 100644 (file)
@@ -51,16 +51,30 @@ libi965_dri_la_SOURCES = \
        $(i965_C_FILES) \
        $(i965_CXX_FILES)
 
+# list of libs to be linked against by i965_dri.so and i965 test programs.
 COMMON_LIBS = \
        libi965_dri.la \
        $(DRI_LIB_DEPS) \
        $(INTEL_LIBS) \
        ../common/libdricommon.la
 
+TEST_LIBS = \
+       $(COMMON_LIBS) \
+        -lrt \
+       ../common/libdri_test_stubs.la
+
 i965_dri_la_SOURCES =
 i965_dri_la_LIBADD = $(COMMON_LIBS)
 i965_dri_la_LDFLAGS = -module -avoid-version -shared
 
+TESTS = test_eu_compact
+check_PROGRAMS = test_eu_compact
+
+test_eu_compact_SOURCES = \
+       test_eu_compact.c
+nodist_EXTRA_test_eu_compact_SOURCES = dummy.cpp
+test_eu_compact_LDADD = $(TEST_LIBS)
+
 # Provide compatibility with scripts for the old Mesa build system for
 # a while by putting a link to the driver into /lib of the build tree.
 all-local: i965_dri.la
index d6d189a184178c0d4a29f71d36ceac16a9c2eb90..3715b0f300f7d38cfa4a259f19f09c1f64514397 100644 (file)
@@ -44,6 +44,7 @@ i965_C_FILES = \
        brw_draw.c \
        brw_draw_upload.c \
        brw_eu.c \
+       brw_eu_compact.c \
        brw_eu_debug.c \
        brw_eu_emit.c \
        brw_eu_util.c \
index 20a8ec4229e324256d9924268ea2000ba2af711f..02a07ec17b4ede4f04d1f677b0cd6ca652f8d5ca 100644 (file)
@@ -214,6 +214,11 @@ const GLuint *brw_get_program( struct brw_compile *p,
 {
    GLuint i;
 
+   brw_compact_instructions(p);
+
+   /* We emit a cacheline (8 instructions) of NOPs at the end of the program to
+    * make sure that instruction prefetch doesn't wander off into some other BO.
+    */
    for (i = 0; i < 8; i++)
       brw_NOP(p);
 
@@ -224,19 +229,36 @@ const GLuint *brw_get_program( struct brw_compile *p,
 void
 brw_dump_compile(struct brw_compile *p, FILE *out, int start, int end)
 {
+   struct brw_context *brw = p->brw;
+   struct intel_context *intel = &brw->intel;
    void *store = p->store;
+   bool dump_hex = false;
 
-   for (int offset = start; offset < end; offset += 16) {
+   for (int offset = start; offset < end;) {
       struct brw_instruction *insn = store + offset;
-
+      struct brw_instruction uncompacted;
       printf("0x%08x: ", offset);
 
-      if (0) {
-        printf("0x%08x 0x%08x 0x%08x 0x%08x ",
-               ((uint32_t *)insn)[3],
-               ((uint32_t *)insn)[2],
-               ((uint32_t *)insn)[1],
-               ((uint32_t *)insn)[0]);
+      if (insn->header.cmpt_control) {
+        struct brw_compact_instruction *compacted = (void *)insn;
+        if (dump_hex) {
+           printf("0x%08x 0x%08x                       ",
+                  ((uint32_t *)insn)[1],
+                  ((uint32_t *)insn)[0]);
+        }
+
+        brw_uncompact_instruction(intel, &uncompacted, compacted);
+        insn = &uncompacted;
+        offset += 8;
+      } else {
+        if (dump_hex) {
+           printf("0x%08x 0x%08x 0x%08x 0x%08x ",
+                  ((uint32_t *)insn)[3],
+                  ((uint32_t *)insn)[2],
+                  ((uint32_t *)insn)[1],
+                  ((uint32_t *)insn)[0]);
+        }
+        offset += 16;
       }
 
       brw_disasm(stdout, insn, p->brw->intel.gen);
index 2fa84dff490ed40ea953625887a99dc528d5870b..21967bd0cd7cfd08d4c5f23b18fd47197763614f 100644 (file)
@@ -1107,6 +1107,19 @@ void brw_set_uip_jip(struct brw_compile *p);
 
 uint32_t brw_swap_cmod(uint32_t cmod);
 
+/* brw_eu_compact.c */
+void brw_compact_instructions(struct brw_compile *p);
+void brw_uncompact_instruction(struct intel_context *intel,
+                              struct brw_instruction *dst,
+                              struct brw_compact_instruction *src);
+bool brw_try_compact_instruction(struct brw_compile *p,
+                                 struct brw_compact_instruction *dst,
+                                 struct brw_instruction *src);
+
+void brw_debug_compact_uncompact(struct intel_context *intel,
+                                struct brw_instruction *orig,
+                                struct brw_instruction *uncompacted);
+
 /* brw_optimize.c */
 void brw_optimize(struct brw_compile *p);
 void brw_remove_duplicate_mrf_moves(struct brw_compile *p);
diff --git a/src/mesa/drivers/dri/i965/brw_eu_compact.c b/src/mesa/drivers/dri/i965/brw_eu_compact.c
new file mode 100644 (file)
index 0000000..210657a
--- /dev/null
@@ -0,0 +1,558 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/** @file brw_eu_compact.c
+ *
+ * Instruction compaction is a feature of gm45 and newer hardware that allows
+ * for a smaller instruction encoding.
+ *
+ * The instruction cache is on the order of 32KB, and many programs generate
+ * far more instructions than that.  The instruction cache is built to barely
+ * keep up with instruction dispatch abaility in cache hit cases -- L1
+ * instruction cache misses that still hit in the next level could limit
+ * throughput by around 50%.
+ *
+ * The idea of instruction compaction is that most instructions use a tiny
+ * subset of the GPU functionality, so we can encode what would be a 16 byte
+ * instruction in 8 bytes using some lookup tables for various fields.
+ */
+
+#include "brw_context.h"
+#include "brw_eu.h"
+
+static const uint32_t gen6_control_index_table[32] = {
+   0b00000000000000000,
+   0b01000000000000000,
+   0b00110000000000000,
+   0b00000000100000000,
+   0b00010000000000000,
+   0b00001000100000000,
+   0b00000000100000010,
+   0b00000000000000010,
+   0b01000000100000000,
+   0b01010000000000000,
+   0b10110000000000000,
+   0b00100000000000000,
+   0b11010000000000000,
+   0b11000000000000000,
+   0b01001000100000000,
+   0b01000000000001000,
+   0b01000000000000100,
+   0b00000000000001000,
+   0b00000000000000100,
+   0b00111000100000000,
+   0b00001000100000010,
+   0b00110000100000000,
+   0b00110000000000001,
+   0b00100000000000001,
+   0b00110000000000010,
+   0b00110000000000101,
+   0b00110000000001001,
+   0b00110000000010000,
+   0b00110000000000011,
+   0b00110000000000100,
+   0b00110000100001000,
+   0b00100000000001001
+};
+
+static const uint32_t gen6_datatype_table[32] = {
+   0b001001110000000000,
+   0b001000110000100000,
+   0b001001110000000001,
+   0b001000000001100000,
+   0b001010110100101001,
+   0b001000000110101101,
+   0b001100011000101100,
+   0b001011110110101101,
+   0b001000000111101100,
+   0b001000000001100001,
+   0b001000110010100101,
+   0b001000000001000001,
+   0b001000001000110001,
+   0b001000001000101001,
+   0b001000000000100000,
+   0b001000001000110010,
+   0b001010010100101001,
+   0b001011010010100101,
+   0b001000000110100101,
+   0b001100011000101001,
+   0b001011011000101100,
+   0b001011010110100101,
+   0b001011110110100101,
+   0b001111011110111101,
+   0b001111011110111100,
+   0b001111011110111101,
+   0b001111011110011101,
+   0b001111011110111110,
+   0b001000000000100001,
+   0b001000000000100010,
+   0b001001111111011101,
+   0b001000001110111110,
+};
+
+static const uint32_t gen6_subreg_table[32] = {
+   0b000000000000000,
+   0b000000000000100,
+   0b000000110000000,
+   0b111000000000000,
+   0b011110000001000,
+   0b000010000000000,
+   0b000000000010000,
+   0b000110000001100,
+   0b001000000000000,
+   0b000001000000000,
+   0b000001010010100,
+   0b000000001010110,
+   0b010000000000000,
+   0b110000000000000,
+   0b000100000000000,
+   0b000000010000000,
+   0b000000000001000,
+   0b100000000000000,
+   0b000001010000000,
+   0b001010000000000,
+   0b001100000000000,
+   0b000000001010100,
+   0b101101010010100,
+   0b010100000000000,
+   0b000000010001111,
+   0b011000000000000,
+   0b111110000000000,
+   0b101000000000000,
+   0b000000000001111,
+   0b000100010001111,
+   0b001000010001111,
+   0b000110000000000,
+};
+
+static const uint32_t gen6_src_index_table[32] = {
+   0b000000000000,
+   0b010110001000,
+   0b010001101000,
+   0b001000101000,
+   0b011010010000,
+   0b000100100000,
+   0b010001101100,
+   0b010101110000,
+   0b011001111000,
+   0b001100101000,
+   0b010110001100,
+   0b001000100000,
+   0b010110001010,
+   0b000000000010,
+   0b010101010000,
+   0b010101101000,
+   0b111101001100,
+   0b111100101100,
+   0b011001110000,
+   0b010110001001,
+   0b010101011000,
+   0b001101001000,
+   0b010000101100,
+   0b010000000000,
+   0b001101110000,
+   0b001100010000,
+   0b001100000000,
+   0b010001101010,
+   0b001101111000,
+   0b000001110000,
+   0b001100100000,
+   0b001101010000,
+};
+
+static bool
+set_control_index(struct brw_compact_instruction *dst,
+                  struct brw_instruction *src)
+{
+   uint32_t *src_u32 = (uint32_t *)src;
+   uint32_t uncompacted = 0;
+
+   uncompacted |= ((src_u32[0] >> 8) & 0xffff) << 0;
+   uncompacted |= ((src_u32[0] >> 31) & 0x1) << 16;
+
+   for (int i = 0; i < ARRAY_SIZE(gen6_control_index_table); i++) {
+      if (gen6_control_index_table[i] == uncompacted) {
+         dst->dw0.control_index = i;
+         return true;
+      }
+   }
+
+   return false;
+}
+
+static bool
+set_datatype_index(struct brw_compact_instruction *dst,
+                   struct brw_instruction *src)
+{
+   uint32_t uncompacted = 0;
+
+   uncompacted |= src->bits1.ud & 0x7fff;
+   uncompacted |= (src->bits1.ud >> 29) << 15;
+
+   for (int i = 0; i < ARRAY_SIZE(gen6_datatype_table); i++) {
+      if (gen6_datatype_table[i] == uncompacted) {
+         dst->dw0.data_type_index = i;
+         return true;
+      }
+   }
+
+   return false;
+}
+
+static bool
+set_subreg_index(struct brw_compact_instruction *dst,
+                 struct brw_instruction *src)
+{
+   uint32_t uncompacted = 0;
+
+   uncompacted |= src->bits1.da1.dest_subreg_nr << 0;
+   uncompacted |= src->bits2.da1.src0_subreg_nr << 5;
+   uncompacted |= src->bits3.da1.src1_subreg_nr << 10;
+
+   for (int i = 0; i < ARRAY_SIZE(gen6_subreg_table); i++) {
+      if (gen6_subreg_table[i] == uncompacted) {
+         dst->dw0.sub_reg_index = i;
+         return true;
+      }
+   }
+
+   return false;
+}
+
+static bool
+get_src_index(uint32_t uncompacted,
+              uint32_t *compacted)
+{
+   for (int i = 0; i < ARRAY_SIZE(gen6_src_index_table); i++) {
+      if (gen6_src_index_table[i] == uncompacted) {
+         *compacted = i;
+         return true;
+      }
+   }
+
+   return false;
+}
+
+static bool
+set_src0_index(struct brw_compact_instruction *dst,
+               struct brw_instruction *src)
+{
+   uint32_t compacted, uncompacted = 0;
+
+   uncompacted |= (src->bits2.ud >> 13) & 0xfff;
+
+   if (!get_src_index(uncompacted, &compacted))
+      return false;
+
+   dst->dw0.src0_index = compacted & 0x3;
+   dst->dw1.src0_index = compacted >> 2;
+
+   return true;
+}
+
+static bool
+set_src1_index(struct brw_compact_instruction *dst,
+               struct brw_instruction *src)
+{
+   uint32_t compacted, uncompacted = 0;
+
+   uncompacted |= (src->bits3.ud >> 13) & 0xfff;
+
+   if (!get_src_index(uncompacted, &compacted))
+      return false;
+
+   dst->dw1.src1_index = compacted;
+
+   return true;
+}
+
+/**
+ * Tries to compact instruction src into dst.
+ *
+ * It doesn't modify dst unless src is compactable, which is relied on by
+ * brw_compact_instructions().
+ */
+bool
+brw_try_compact_instruction(struct brw_compile *p,
+                            struct brw_compact_instruction *dst,
+                            struct brw_instruction *src)
+{
+   struct brw_compact_instruction temp;
+
+   /* FINISHME: immediates */
+   if (src->bits1.da1.src0_reg_file == BRW_IMMEDIATE_VALUE ||
+       src->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE)
+      return false;
+
+   memset(&temp, 0, sizeof(temp));
+
+   temp.dw0.opcode = src->header.opcode;
+   temp.dw0.debug_control = src->header.debug_control;
+   if (!set_control_index(&temp, src))
+      return false;
+   if (!set_datatype_index(&temp, src))
+      return false;
+   if (!set_subreg_index(&temp, src))
+      return false;
+   temp.dw0.acc_wr_control = src->header.acc_wr_control;
+   temp.dw0.conditionalmod = src->header.destreg__conditionalmod;
+   temp.dw0.flag_reg_nr = src->bits2.da1.flag_reg_nr;
+   temp.dw0.cmpt_ctrl = 1;
+   if (!set_src0_index(&temp, src))
+      return false;
+   if (!set_src1_index(&temp, src))
+      return false;
+   temp.dw1.dst_reg_nr = src->bits1.da1.dest_reg_nr;
+   temp.dw1.src0_reg_nr = src->bits2.da1.src0_reg_nr;
+   temp.dw1.src1_reg_nr = src->bits3.da1.src1_reg_nr;
+
+   *dst = temp;
+
+   return true;
+}
+
+static void
+set_uncompacted_control(struct brw_instruction *dst,
+                        struct brw_compact_instruction *src)
+{
+   uint32_t *dst_u32 = (uint32_t *)dst;
+   uint32_t uncompacted = gen6_control_index_table[src->dw0.control_index];
+
+   dst_u32[0] |= ((uncompacted >> 0) & 0xffff) << 8;
+   dst_u32[0] |= ((uncompacted >> 16) & 0x1) << 31;
+}
+
+static void
+set_uncompacted_datatype(struct brw_instruction *dst,
+                         struct brw_compact_instruction *src)
+{
+   uint32_t uncompacted = gen6_datatype_table[src->dw0.data_type_index];
+
+   dst->bits1.ud &= ~(0x7 << 29);
+   dst->bits1.ud |= ((uncompacted >> 15) & 0x7) << 29;
+   dst->bits1.ud &= ~0x7fff;
+   dst->bits1.ud |= uncompacted & 0x7fff;
+}
+
+static void
+set_uncompacted_subreg(struct brw_instruction *dst,
+                       struct brw_compact_instruction *src)
+{
+   uint32_t uncompacted = gen6_subreg_table[src->dw0.sub_reg_index];
+
+   dst->bits1.da1.dest_subreg_nr = (uncompacted >> 0)  & 0x1f;
+   dst->bits2.da1.src0_subreg_nr = (uncompacted >> 5)  & 0x1f;
+   dst->bits3.da1.src1_subreg_nr = (uncompacted >> 10) & 0x1f;
+}
+
+static void
+set_uncompacted_src0(struct brw_instruction *dst,
+                     struct brw_compact_instruction *src)
+{
+   uint32_t compacted = src->dw0.src0_index | src->dw1.src0_index << 2;
+   uint32_t uncompacted = gen6_src_index_table[compacted];
+
+   dst->bits2.ud |= uncompacted << 13;
+}
+
+static void
+set_uncompacted_src1(struct brw_instruction *dst,
+                     struct brw_compact_instruction *src)
+{
+   uint32_t uncompacted = gen6_src_index_table[src->dw1.src1_index];
+
+   dst->bits3.ud |= uncompacted << 13;
+}
+
+void
+brw_uncompact_instruction(struct intel_context *intel,
+                          struct brw_instruction *dst,
+                          struct brw_compact_instruction *src)
+{
+   memset(dst, 0, sizeof(*dst));
+
+   dst->header.opcode = src->dw0.opcode;
+   dst->header.debug_control = src->dw0.debug_control;
+
+   set_uncompacted_control(dst, src);
+   set_uncompacted_datatype(dst, src);
+   set_uncompacted_subreg(dst, src);
+   dst->header.acc_wr_control = src->dw0.acc_wr_control;
+   dst->header.destreg__conditionalmod = src->dw0.conditionalmod;
+   dst->bits2.da1.flag_reg_nr = src->dw0.flag_reg_nr;
+   set_uncompacted_src0(dst, src);
+   set_uncompacted_src1(dst, src);
+   dst->bits1.da1.dest_reg_nr = src->dw1.dst_reg_nr;
+   dst->bits2.da1.src0_reg_nr = src->dw1.src0_reg_nr;
+   dst->bits3.da1.src1_reg_nr = src->dw1.src1_reg_nr;
+}
+
+void brw_debug_compact_uncompact(struct intel_context *intel,
+                                 struct brw_instruction *orig,
+                                 struct brw_instruction *uncompacted)
+{
+   fprintf(stderr, "Instruction compact/uncompact changed:\n");
+
+   fprintf(stderr, "  before: ");
+   brw_disasm(stderr, orig, intel->gen);
+
+   fprintf(stderr, "  after:  ");
+   brw_disasm(stderr, uncompacted, intel->gen);
+
+   uint32_t *before_bits = (uint32_t *)orig;
+   uint32_t *after_bits = (uint32_t *)uncompacted;
+   printf("  changed bits:\n");
+   for (int i = 0; i < 128; i++) {
+      uint32_t before = before_bits[i / 32] & (1 << (i & 31));
+      uint32_t after = after_bits[i / 32] & (1 << (i & 31));
+
+      if (before != after) {
+         printf("  bit %d, %s to %s\n", i,
+                before ? "set" : "unset",
+                after ? "set" : "unset");
+      }
+   }
+}
+
+void
+brw_compact_instructions(struct brw_compile *p)
+{
+   struct brw_context *brw = p->brw;
+   struct intel_context *intel = &brw->intel;
+   void *store = p->store;
+
+   assert(gen6_control_index_table[ARRAY_SIZE(gen6_control_index_table) - 1] != 0);
+   assert(gen6_datatype_table[ARRAY_SIZE(gen6_datatype_table) - 1] != 0);
+   assert(gen6_subreg_table[ARRAY_SIZE(gen6_subreg_table) - 1] != 0);
+   assert(gen6_src_index_table[ARRAY_SIZE(gen6_src_index_table) - 1] != 0);
+
+   if (intel->gen != 6)
+      return;
+
+   /* FINISHME: If we are going to compress instructions between flow control,
+    * we have to do fixups to flow control offsets to represent the new
+    * distances, since flow control uses (virtual address distance)/2, not a
+    * logical instruction count.  We can at least compress up until an IF
+    * instruction, but there's no instruction indicating the start of a
+    * do/while loop.
+    */
+   bool continue_compressing = true;
+   for (int i = 0; i < p->nr_insn; i++) {
+      if (p->store[i].header.opcode == BRW_OPCODE_WHILE)
+         return;
+   }
+
+   int src_offset;
+   int offset = 0;
+   for (src_offset = 0; src_offset < p->nr_insn * 16;) {
+      struct brw_instruction *src = store + src_offset;
+      void *dst = store + offset;
+
+      switch (src->header.opcode) {
+      case BRW_OPCODE_IF:
+      case BRW_OPCODE_HALT:
+      case BRW_OPCODE_JMPI:
+         continue_compressing = false;
+         break;
+      }
+
+      struct brw_instruction saved = *src;
+
+      if (continue_compressing &&
+          !src->header.cmpt_control &&
+          brw_try_compact_instruction(p, dst, src)) {
+
+         /* debug */
+         if (INTEL_DEBUG) {
+            struct brw_instruction uncompacted;
+            brw_uncompact_instruction(intel, &uncompacted, dst);
+            if (memcmp(&saved, &uncompacted, sizeof(uncompacted))) {
+               brw_debug_compact_uncompact(intel, &saved, &uncompacted);
+            }
+         }
+
+         offset += 8;
+         src_offset += 16;
+      } else {
+         int size = src->header.cmpt_control ? 8 : 16;
+
+         /* It appears that the end of thread SEND instruction needs to be
+          * aligned, or the GPU hangs.
+          */
+         if ((src->header.opcode == BRW_OPCODE_SEND ||
+              src->header.opcode == BRW_OPCODE_SENDC) &&
+             src->bits3.generic.end_of_thread &&
+             (offset & 8) != 0) {
+            struct brw_compact_instruction *align = store + offset;
+            memset(align, 0, sizeof(*align));
+            align->dw0.opcode = BRW_OPCODE_NOP;
+            align->dw0.cmpt_ctrl = 1;
+            offset += 8;
+            dst = store + offset;
+         }
+
+         /* If we didn't compact this instruction, we need to move it down into
+          * place.
+          */
+         if (offset != src_offset) {
+            memmove(dst, src, size);
+         }
+         offset += size;
+         src_offset += size;
+      }
+   }
+
+   /* p->nr_insn is counting the number of uncompacted instructions still, so
+    * divide.  We do want to be sure there's a valid instruction in any
+    * alignment padding, so that the next compression pass (for the FS 8/16
+    * compile passes) parses correctly.
+    */
+   if (offset & 8) {
+      struct brw_compact_instruction *align = store + offset;
+      memset(align, 0, sizeof(*align));
+      align->dw0.opcode = BRW_OPCODE_NOP;
+      align->dw0.cmpt_ctrl = 1;
+      offset += 8;
+   }
+   p->next_insn_offset = offset;
+   p->nr_insn = offset / 16;
+
+   if (0) {
+      fprintf(stdout, "dumping compacted program\n");
+      brw_dump_compile(p, stdout, 0, p->next_insn_offset);
+
+      int cmp = 0;
+      for (offset = 0; offset < p->next_insn_offset;) {
+         struct brw_instruction *insn = store + offset;
+
+         if (insn->header.cmpt_control) {
+            offset += 8;
+            cmp++;
+         } else {
+            offset += 16;
+         }
+      }
+      fprintf(stderr, "%db/%db saved (%d%%)\n", cmp * 8, offset + cmp * 8,
+              cmp * 8 * 100 / (offset + cmp * 8));
+   }
+}
index 99453afdcafd976f06e5b83e6446d20363383db0..a8e10a9edf6aa1b3a9871de74b43ca854a309bb7 100644 (file)
@@ -32,6 +32,7 @@
 
 #include "main/mtypes.h"
 #include "main/imports.h"
+#include "brw_context.h"
 #include "brw_eu.h"
 
 void brw_print_reg( struct brw_reg hwreg )
index cd6819176bcc533e6462b0b5ace7dd33f2dfd828..9ac2a49d94830e5a604f27c2fc2595dc5a4bce82 100644 (file)
@@ -1947,6 +1947,12 @@ fs_visitor::run()
    brw_wm_payload_setup(brw, c);
 
    if (c->dispatch_width == 16) {
+      /* We have to do a compaction pass now, or the one at the end of
+       * execution will squash down where our prog_offset start needs
+       * to be.
+       */
+      brw_compact_instructions(p);
+
       /* align to 64 byte boundary. */
       while ((c->func.nr_insn * sizeof(struct brw_instruction)) % 64) {
         brw_NOP(p);
index 465d2a28a8ea7ae24d20040ee233eb4f65865669..26def6e9054d667be9351ad1f85cd4ada2c563d3 100644 (file)
@@ -1048,6 +1048,8 @@ struct brw_instruction
         GLuint dest_subreg_nr:3;
         GLuint dest_reg_nr:8;
       } da3src;
+
+      uint32_t ud;
    } bits1;
 
 
@@ -1137,6 +1139,8 @@ struct brw_instruction
         GLuint src1_swizzle:8;
         GLuint src1_subreg_nr_low:2;
       } da3src;
+
+      uint32_t ud;
    } bits2;
 
    union
@@ -1534,5 +1538,27 @@ struct brw_instruction
    } bits3;
 };
 
+struct brw_compact_instruction {
+   struct {
+      unsigned opcode:7;          /*  0- 6 */
+      unsigned debug_control:1;   /*  7- 7 */
+      unsigned control_index:5;   /*  8-12 */
+      unsigned data_type_index:5; /* 13-17 */
+      unsigned sub_reg_index:5;   /* 18-22 */
+      unsigned acc_wr_control:1;  /* 23-23 */
+      unsigned conditionalmod:4;  /* 24-27 */
+      unsigned flag_reg_nr:1;     /* 28-28 */
+      unsigned cmpt_ctrl:1;       /* 29-29 */
+      unsigned src0_index:2;      /* 30-31 */
+   } dw0;
+
+   struct {
+      unsigned src0_index:3;  /* 32-24 */
+      unsigned src1_index:5;  /* 35-39 */
+      unsigned dst_reg_nr:8;  /* 40-47 */
+      unsigned src0_reg_nr:8; /* 48-55 */
+      unsigned src1_reg_nr:8; /* 56-63 */
+   } dw1;
+};
 
 #endif
diff --git a/src/mesa/drivers/dri/i965/test_eu_compact.c b/src/mesa/drivers/dri/i965/test_eu_compact.c
new file mode 100644 (file)
index 0000000..e9d4301
--- /dev/null
@@ -0,0 +1,296 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include "glsl/ralloc.h"
+#include "brw_context.h"
+#include "brw_eu.h"
+
+static bool
+test_compact_instruction(struct brw_compile *p, struct brw_instruction src)
+{
+   struct brw_context *brw = p->brw;
+   struct intel_context *intel = &brw->intel;
+
+   struct brw_compact_instruction dst;
+   memset(&dst, 0xd0, sizeof(dst));
+
+   if (brw_try_compact_instruction(p, &dst, &src)) {
+      struct brw_instruction uncompacted;
+
+      brw_uncompact_instruction(intel, &uncompacted, &dst);
+      if (memcmp(&uncompacted, &src, sizeof(src))) {
+        brw_debug_compact_uncompact(intel, &src, &uncompacted);
+        return false;
+      }
+   } else {
+      struct brw_compact_instruction unchanged;
+      memset(&unchanged, 0xd0, sizeof(unchanged));
+      /* It's not supposed to change dst unless it compacted. */
+      if (memcmp(&unchanged, &dst, sizeof(dst))) {
+        fprintf(stderr, "Failed to compact, but dst changed\n");
+        fprintf(stderr, "  Instruction: ");
+        brw_disasm(stderr, &src, intel->gen);
+        return false;
+      }
+   }
+
+   return true;
+}
+
+/**
+ * When doing fuzz testing, pad bits won't round-trip.
+ *
+ * This sort of a superset of skip_bit, which is testing for changing bits that
+ * aren't worth testing for fuzzing.  We also just want to clear bits that
+ * become meaningless once fuzzing twiddles a related bit.
+ */
+static void
+clear_pad_bits(struct brw_instruction *inst)
+{
+   if (inst->header.opcode != BRW_OPCODE_SEND &&
+       inst->header.opcode != BRW_OPCODE_SENDC &&
+       inst->header.opcode != BRW_OPCODE_BREAK &&
+       inst->header.opcode != BRW_OPCODE_CONTINUE &&
+       inst->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE &&
+       inst->bits1.da1.src1_reg_file != BRW_IMMEDIATE_VALUE) {
+      if (inst->bits3.da1.src1_address_mode)
+        inst->bits3.ia1.pad1 = 0;
+      else
+        inst->bits3.da1.pad0 = 0;
+   }
+}
+
+static bool
+skip_bit(struct brw_instruction *src, int bit)
+{
+   /* pad bit */
+   if (bit == 7)
+      return true;
+
+   /* The compact bit -- uncompacted can't have it set. */
+   if (bit == 29)
+      return true;
+
+   /* pad bit */
+   if (bit == 47)
+      return true;
+
+   /* pad bits */
+   if (bit >= 90 && bit <= 95)
+      return true;
+
+   /* sometimes these are pad bits. */
+   if (src->header.opcode != BRW_OPCODE_SEND &&
+       src->header.opcode != BRW_OPCODE_SENDC &&
+       src->header.opcode != BRW_OPCODE_BREAK &&
+       src->header.opcode != BRW_OPCODE_CONTINUE &&
+       src->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE &&
+       src->bits1.da1.src1_reg_file != BRW_IMMEDIATE_VALUE &&
+       bit >= 121) {
+      return true;
+   }
+
+   return false;
+}
+
+static bool
+test_fuzz_compact_instruction(struct brw_compile *p,
+                             struct brw_instruction src)
+{
+   for (int bit0 = 0; bit0 < 128; bit0++) {
+      if (skip_bit(&src, bit0))
+        continue;
+
+      for (int bit1 = 0; bit1 < 128; bit1++) {
+        struct brw_instruction instr = src;
+        uint32_t *bits = (uint32_t *)&instr;
+
+        if (skip_bit(&src, bit1))
+           continue;
+
+        bits[bit0 / 32] ^= (1 << (bit0 & 31));
+        bits[bit1 / 32] ^= (1 << (bit1 & 31));
+
+        clear_pad_bits(&instr);
+
+        if (!test_compact_instruction(p, instr)) {
+           printf("  twiddled bits for fuzzing %d, %d\n", bit0, bit1);
+           return false;
+        }
+      }
+   }
+
+   return true;
+}
+
+static void
+gen_ADD_GRF_GRF_GRF(struct brw_compile *p)
+{
+   struct brw_reg g0 = brw_vec8_grf(0, 0);
+   struct brw_reg g2 = brw_vec8_grf(2, 0);
+   struct brw_reg g4 = brw_vec8_grf(4, 0);
+
+   brw_ADD(p, g0, g2, g4);
+}
+
+static void
+gen_ADD_GRF_GRF_IMM(struct brw_compile *p)
+{
+   struct brw_reg g0 = brw_vec8_grf(0, 0);
+   struct brw_reg g2 = brw_vec8_grf(2, 0);
+
+   brw_ADD(p, g0, g2, brw_imm_f(1.0));
+}
+
+static void
+gen_ADD_GRF_GRF_IMM_d(struct brw_compile *p)
+{
+   struct brw_reg g0 = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_D);
+   struct brw_reg g2 = retype(brw_vec8_grf(2, 0), BRW_REGISTER_TYPE_D);
+
+   brw_ADD(p, g0, g2, brw_imm_d(1));
+}
+
+static void
+gen_MOV_GRF_GRF(struct brw_compile *p)
+{
+   struct brw_reg g0 = brw_vec8_grf(0, 0);
+   struct brw_reg g2 = brw_vec8_grf(2, 0);
+
+   brw_MOV(p, g0, g2);
+}
+
+static void
+gen_ADD_MRF_GRF_GRF(struct brw_compile *p)
+{
+   struct brw_reg m6 = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 6, 0);
+   struct brw_reg g2 = brw_vec8_grf(2, 0);
+   struct brw_reg g4 = brw_vec8_grf(4, 0);
+
+   brw_ADD(p, m6, g2, g4);
+}
+
+static void
+gen_ADD_vec1_GRF_GRF_GRF(struct brw_compile *p)
+{
+   struct brw_reg g0 = brw_vec1_grf(0, 0);
+   struct brw_reg g2 = brw_vec1_grf(2, 0);
+   struct brw_reg g4 = brw_vec1_grf(4, 0);
+
+   brw_ADD(p, g0, g2, g4);
+}
+
+static void
+gen_PLN_MRF_GRF_GRF(struct brw_compile *p)
+{
+   struct brw_reg m6 = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 6, 0);
+   struct brw_reg interp = brw_vec1_grf(2, 0);
+   struct brw_reg g4 = brw_vec8_grf(4, 0);
+
+   brw_PLN(p, m6, interp, g4);
+}
+
+static void
+gen_f0_MOV_GRF_GRF(struct brw_compile *p)
+{
+   struct brw_reg g0 = brw_vec8_grf(0, 0);
+   struct brw_reg g2 = brw_vec8_grf(2, 0);
+
+   brw_push_insn_state(p);
+   brw_set_predicate_control(p, true);
+   brw_MOV(p, g0, g2);
+   brw_pop_insn_state(p);
+}
+
+/* The handling of f1 vs f0 changes between gen6 and gen7.  Explicitly test
+ * it, so that we run the fuzzing can run over all the other bits that might
+ * interact with it.
+ */
+static void
+gen_f1_MOV_GRF_GRF(struct brw_compile *p)
+{
+   struct brw_reg g0 = brw_vec8_grf(0, 0);
+   struct brw_reg g2 = brw_vec8_grf(2, 0);
+
+   brw_push_insn_state(p);
+   brw_set_predicate_control(p, true);
+   current_insn(p)->bits2.da1.flag_reg_nr = 1;
+   brw_MOV(p, g0, g2);
+   brw_pop_insn_state(p);
+}
+
+struct {
+   void (*func)(struct brw_compile *p);
+} tests[] = {
+   { gen_MOV_GRF_GRF },
+   { gen_ADD_GRF_GRF_GRF },
+   { gen_ADD_GRF_GRF_IMM },
+   { gen_ADD_GRF_GRF_IMM_d },
+   { gen_ADD_MRF_GRF_GRF },
+   { gen_ADD_vec1_GRF_GRF_GRF },
+   { gen_PLN_MRF_GRF_GRF },
+   { gen_f0_MOV_GRF_GRF },
+   { gen_f1_MOV_GRF_GRF },
+};
+
+int
+main(int argc, char **argv)
+{
+   struct brw_context *brw = calloc(1, sizeof(*brw));
+   struct intel_context *intel = &brw->intel;
+   intel->gen = 6;
+   int ret = 0;
+
+   for (int i = 0; i < ARRAY_SIZE(tests); i++) {
+      for (int align_16 = 0; align_16 <= 1; align_16++) {
+        struct brw_compile *p = rzalloc(NULL, struct brw_compile);
+        brw_init_compile(brw, p, p);
+
+        brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+        if (align_16)
+           brw_set_access_mode(p, BRW_ALIGN_16);
+        else
+           brw_set_access_mode(p, BRW_ALIGN_1);
+
+        tests[i].func(p);
+        assert(p->nr_insn == 1);
+
+        if (!test_compact_instruction(p, p->store[0])) {
+           ret = 1;
+           continue;
+        }
+
+        if (!test_fuzz_compact_instruction(p, p->store[0])) {
+           ret = 1;
+           continue;
+        }
+
+        ralloc_free(p);
+      }
+   }
+
+   return ret;
+}