pan/bi: Fix vector handling of readmasks
author: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Thu, 12 Mar 2020 01:41:57 +0000 (21:41 -0400)
committer: Marge Bot <eric+marge@anholt.net>
Thu, 12 Mar 2020 12:41:08 +0000 (12:41 +0000)
The issue was messing with liveness analysis. With Midgard, we look at
the writemask to decide how the instruction behaves. Here, since our ALU
is scalar (except for subdivision, which doesn't have proper writemasks
anyway), we just look at the component count directly -- either 4 for
vector instructions (essentially; for smaller loads we can replicate
manually without much burden), or 1 for scalar.

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4158>

src/panfrost/bifrost/bi_tables.c
src/panfrost/bifrost/bir.c
src/panfrost/bifrost/compiler.h

index e33b89aae55550b3f3ec7fe0eff26ec23ad67487..a0734e10ec97acdf416fa52d11e50760cdbcc8d9 100644 (file)
 
 unsigned bi_class_props[BI_NUM_CLASSES] = {
         [BI_ADD]               = BI_GENERIC | BI_MODS | BI_SCHED_ALL,
-        [BI_ATEST]             = BI_SCHED_HI_LATENCY,
+        [BI_ATEST]             = BI_SCHED_HI_LATENCY | BI_VECTOR,
         [BI_BRANCH]            = BI_SCHED_HI_LATENCY,
         [BI_CMP]               = BI_GENERIC | BI_MODS | BI_SCHED_ALL,
-        [BI_BLEND]             = BI_SCHED_HI_LATENCY,
+        [BI_BLEND]             = BI_SCHED_HI_LATENCY | BI_VECTOR,
         [BI_BITWISE]           = BI_GENERIC | BI_SCHED_ALL,
         [BI_CONVERT]           = BI_SCHED_ALL | BI_SWIZZLABLE,
         [BI_CSEL]              = BI_SCHED_FMA,
@@ -39,18 +39,18 @@ unsigned bi_class_props[BI_NUM_CLASSES] = {
         [BI_FMA]               = BI_ROUNDMODE | BI_SCHED_FMA,
         [BI_FREXP]             = BI_SCHED_ALL,
         [BI_ISUB]              = BI_GENERIC | BI_SCHED_ALL,
-        [BI_LOAD]              = BI_SCHED_HI_LATENCY,
-        [BI_LOAD_UNIFORM]      = BI_SCHED_HI_LATENCY,
-        [BI_LOAD_ATTR]                 = BI_SCHED_HI_LATENCY,
-        [BI_LOAD_VAR]          = BI_SCHED_HI_LATENCY,
+        [BI_LOAD]              = BI_SCHED_HI_LATENCY | BI_VECTOR,
+        [BI_LOAD_UNIFORM]      = BI_SCHED_HI_LATENCY | BI_VECTOR,
+        [BI_LOAD_ATTR]                 = BI_SCHED_HI_LATENCY | BI_VECTOR,
+        [BI_LOAD_VAR]          = BI_SCHED_HI_LATENCY | BI_VECTOR,
         [BI_LOAD_VAR_ADDRESS]  = BI_SCHED_HI_LATENCY,
         [BI_MINMAX]            = BI_GENERIC | BI_SCHED_ALL,
         [BI_MOV]               = BI_MODS | BI_SCHED_ALL,
         [BI_SHIFT]             = BI_SCHED_ALL,
-        [BI_STORE]             = BI_SCHED_HI_LATENCY,
-        [BI_STORE_VAR]                 = BI_SCHED_HI_LATENCY,
+        [BI_STORE]             = BI_SCHED_HI_LATENCY | BI_VECTOR,
+        [BI_STORE_VAR]                 = BI_SCHED_HI_LATENCY | BI_VECTOR,
         [BI_SPECIAL]           = BI_SCHED_ADD | BI_SCHED_SLOW,
         [BI_SWIZZLE]            = BI_SCHED_ALL | BI_SWIZZLABLE,
-        [BI_TEX]               = BI_SCHED_HI_LATENCY,
+        [BI_TEX]               = BI_SCHED_HI_LATENCY | BI_VECTOR,
         [BI_ROUND]             = BI_GENERIC | BI_ROUNDMODE | BI_SCHED_ALL,
 };
index 48c06ab776e9d2a15cb10493ff172e0e5e93468f..496a394acfa9243003ed2f930c27e0105c5da052 100644 (file)
@@ -75,10 +75,41 @@ bi_has_arg(bi_instruction *ins, unsigned arg)
         return false;
 }
 
+/* Collapses a 16-bit bytemask into a per-component mask. The bytemask is
+ * walked in groups of `bytes` bits starting at bit 0; bit d of the result is
+ * set iff the first bit of group d is set. Every bit within a group is
+ * asserted to agree with the first bit of that group, i.e. a component must
+ * be either entirely present or entirely absent in the bytemask.
+ *
+ * `bytes` must be nonzero. */
+uint16_t
+bi_from_bytemask(uint16_t bytemask, unsigned bytes)
+{
+        unsigned value = 0;
+
+        /* A zero stride would never advance the loop below */
+        assert(bytes != 0);
+
+        for (unsigned c = 0, d = 0; c < 16; c += bytes, ++d) {
+                bool a = (bytemask & (1u << c)) != 0;
+
+                /* Validate the whole group [c, c + bytes), not just the
+                 * first group; stop at the 16 bits the mask actually has */
+                for (unsigned q = c; q < (c + bytes) && q < 16; ++q)
+                        assert(((bytemask & (1u << q)) != 0) == a);
+
+                value |= ((unsigned) a << d);
+        }
+
+        return value;
+}
+
+/* Returns how many components the instruction is treated as operating on:
+ * always 4 for classes flagged BI_VECTOR, otherwise 32 divided by the
+ * destination type size (clamped below at 8). */
+unsigned
+bi_get_component_count(bi_instruction *ins)
+{
+        /* Vector classes are handled as 4 components unconditionally */
+        if (bi_class_props[ins->type] & BI_VECTOR)
+                return 4;
+
+        /* Stores imply VECTOR, so anything reaching this point is expected
+         * to carry a destination type */
+        assert(ins->dest_type);
+
+        /* NOTE(review): size is presumably in bits, making the result the
+         * number of lanes in a 32-bit word -- confirm against NIR */
+        unsigned size = MAX2(nir_alu_type_get_type_size(ins->dest_type), 8);
+
+        return 32 / size;
+}
+
 uint16_t
 bi_bytemask_of_read_components(bi_instruction *ins, unsigned node)
 {
         uint16_t mask = 0x0;
+        unsigned component_count = bi_get_component_count(ins);
 
         bi_foreach_src(ins, s) {
                 if (ins->src[s] != node) continue;
@@ -87,7 +118,7 @@ bi_bytemask_of_read_components(bi_instruction *ins, unsigned node)
                 unsigned bytes = (MAX2(size, 8) / 8);
                 unsigned cmask = (1 << bytes) - 1;
 
-                for (unsigned i = 0; i < ARRAY_SIZE(ins->swizzle[s]); ++i) {
+                for (unsigned i = 0; i < component_count; ++i) {
                         unsigned c = ins->swizzle[s][i];
                         mask |= (cmask << (c * bytes));
                 }
index 7427a426f73aa5f8cd69f5284feebe768d0df25f..920ca7d7e06dc306e6418014e540e6a813566fbd 100644 (file)
@@ -110,6 +110,9 @@ extern unsigned bi_class_props[BI_NUM_CLASSES];
  * the end of a clause. Implies ADD */
 #define BI_SCHED_HI_LATENCY ((1 << 7) | BI_SCHED_ADD)
 
+/* Intrinsic is vectorized and should read 4 components regardless of writemask */
+#define BI_VECTOR (1 << 8)
+
 /* It can't get any worse than csel4... can it? */
 #define BIR_SRC_COUNT 4
 
@@ -497,6 +500,8 @@ bool bi_has_outmod(bi_instruction *ins);
 bool bi_has_source_mods(bi_instruction *ins);
 bool bi_is_src_swizzled(bi_instruction *ins, unsigned s);
 bool bi_has_arg(bi_instruction *ins, unsigned arg);
+uint16_t bi_from_bytemask(uint16_t bytemask, unsigned bytes);
+unsigned bi_get_component_count(bi_instruction *ins);
 uint16_t bi_bytemask_of_read_components(bi_instruction *ins, unsigned node);
 
 /* BIR passes */