From: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Date: Thu, 12 Mar 2020 01:41:57 +0000 (-0400)
Subject: pan/bi: Fix vector handling of readmasks
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=e1d95339254361d4a481b35b3d2adeb4ae417d03;p=mesa.git

pan/bi: Fix vector handling of readmasks

The issue was messing with liveness analysis... with Midgard we look at
the writemask to decide how the instruction behaves. Here, since our ALU
is scalar (except for subdivision which doesn't have proper writemasks
anyway) we just look at the component count directly -- either 4 for
vector instructions (essentially - for smaller loads we can replicate
manually without much burden), or 1 for scalar.

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4158>
---

diff --git a/src/panfrost/bifrost/bi_tables.c b/src/panfrost/bifrost/bi_tables.c
index e33b89aae55..a0734e10ec9 100644
--- a/src/panfrost/bifrost/bi_tables.c
+++ b/src/panfrost/bifrost/bi_tables.c
@@ -28,10 +28,10 @@
 
 unsigned bi_class_props[BI_NUM_CLASSES] = {
         [BI_ADD] 		= BI_GENERIC | BI_MODS | BI_SCHED_ALL,
-        [BI_ATEST] 		= BI_SCHED_HI_LATENCY,
+        [BI_ATEST] 		= BI_SCHED_HI_LATENCY | BI_VECTOR,
         [BI_BRANCH] 		= BI_SCHED_HI_LATENCY,
         [BI_CMP] 		= BI_GENERIC | BI_MODS | BI_SCHED_ALL,
-        [BI_BLEND] 		= BI_SCHED_HI_LATENCY,
+        [BI_BLEND] 		= BI_SCHED_HI_LATENCY | BI_VECTOR,
         [BI_BITWISE] 		= BI_GENERIC | BI_SCHED_ALL,
         [BI_CONVERT] 		= BI_SCHED_ALL | BI_SWIZZLABLE,
         [BI_CSEL] 		= BI_SCHED_FMA,
@@ -39,18 +39,18 @@ unsigned bi_class_props[BI_NUM_CLASSES] = {
         [BI_FMA] 		= BI_ROUNDMODE | BI_SCHED_FMA,
         [BI_FREXP] 		= BI_SCHED_ALL,
         [BI_ISUB] 		= BI_GENERIC | BI_SCHED_ALL,
-        [BI_LOAD] 		= BI_SCHED_HI_LATENCY,
-        [BI_LOAD_UNIFORM]	= BI_SCHED_HI_LATENCY,
-        [BI_LOAD_ATTR] 		= BI_SCHED_HI_LATENCY,
-        [BI_LOAD_VAR] 		= BI_SCHED_HI_LATENCY,
+        [BI_LOAD] 		= BI_SCHED_HI_LATENCY | BI_VECTOR,
+        [BI_LOAD_UNIFORM]	= BI_SCHED_HI_LATENCY | BI_VECTOR,
+        [BI_LOAD_ATTR] 		= BI_SCHED_HI_LATENCY | BI_VECTOR,
+        [BI_LOAD_VAR] 		= BI_SCHED_HI_LATENCY | BI_VECTOR,
         [BI_LOAD_VAR_ADDRESS] 	= BI_SCHED_HI_LATENCY,
         [BI_MINMAX] 		= BI_GENERIC | BI_SCHED_ALL,
         [BI_MOV] 		= BI_MODS | BI_SCHED_ALL,
         [BI_SHIFT] 		= BI_SCHED_ALL,
-        [BI_STORE] 		= BI_SCHED_HI_LATENCY,
-        [BI_STORE_VAR] 		= BI_SCHED_HI_LATENCY,
+        [BI_STORE] 		= BI_SCHED_HI_LATENCY | BI_VECTOR,
+        [BI_STORE_VAR] 		= BI_SCHED_HI_LATENCY | BI_VECTOR,
         [BI_SPECIAL] 		= BI_SCHED_ADD | BI_SCHED_SLOW,
         [BI_SWIZZLE]            = BI_SCHED_ALL | BI_SWIZZLABLE,
-        [BI_TEX] 		= BI_SCHED_HI_LATENCY,
+        [BI_TEX] 		= BI_SCHED_HI_LATENCY | BI_VECTOR,
         [BI_ROUND] 		= BI_GENERIC | BI_ROUNDMODE | BI_SCHED_ALL,
 };
diff --git a/src/panfrost/bifrost/bir.c b/src/panfrost/bifrost/bir.c
index 48c06ab776e..496a394acfa 100644
--- a/src/panfrost/bifrost/bir.c
+++ b/src/panfrost/bifrost/bir.c
@@ -75,10 +75,41 @@ bi_has_arg(bi_instruction *ins, unsigned arg)
         return false;
 }
 
+uint16_t
+bi_from_bytemask(uint16_t bytemask, unsigned bytes)
+{
+        unsigned value = 0;
+        /* Fold each group of `bytes` mask bits into one bit of the result */
+        for (unsigned c = 0, d = 0; c < 16; c += bytes, ++d) {
+                bool a = (bytemask & (1 << c)) != 0;
+                /* All bits in the group [c, c + bytes) must match the first */
+                for (unsigned q = c; q < c + bytes; ++q)
+                        assert(((bytemask & (1 << q)) != 0) == a);
+                /* Group d maps to bit d of the result */
+                value |= (a << d);
+        }
+
+        return value;
+}
+
+unsigned
+bi_get_component_count(bi_instruction *ins)
+{
+        /* Classes flagged BI_VECTOR always count as four components */
+        if (bi_class_props[ins->type] & BI_VECTOR)
+                return 4;
+
+        /* Stores imply VECTOR, so a destination type must exist here */
+        assert(ins->dest_type);
+        unsigned size = MAX2(nir_alu_type_get_type_size(ins->dest_type), 8);
+        return 32 / size;
+}
+
 uint16_t
 bi_bytemask_of_read_components(bi_instruction *ins, unsigned node)
 {
         uint16_t mask = 0x0;
+        unsigned component_count = bi_get_component_count(ins);
 
         bi_foreach_src(ins, s) {
                 if (ins->src[s] != node) continue;
@@ -87,7 +118,7 @@ bi_bytemask_of_read_components(bi_instruction *ins, unsigned node)
                 unsigned bytes = (MAX2(size, 8) / 8);
                 unsigned cmask = (1 << bytes) - 1;
 
-                for (unsigned i = 0; i < ARRAY_SIZE(ins->swizzle[s]); ++i) {
+                for (unsigned i = 0; i < component_count; ++i) {
                         unsigned c = ins->swizzle[s][i];
                         mask |= (cmask << (c * bytes));
                 }
diff --git a/src/panfrost/bifrost/compiler.h b/src/panfrost/bifrost/compiler.h
index 7427a426f73..920ca7d7e06 100644
--- a/src/panfrost/bifrost/compiler.h
+++ b/src/panfrost/bifrost/compiler.h
@@ -110,6 +110,9 @@ extern unsigned bi_class_props[BI_NUM_CLASSES];
  * the end of a clause. Implies ADD */
 #define BI_SCHED_HI_LATENCY ((1 << 7) | BI_SCHED_ADD)
 
+/* Class is vectorized: instructions read 4 components regardless of writemask */
+#define BI_VECTOR (1 << 8)
+
 /* It can't get any worse than csel4... can it? */
 #define BIR_SRC_COUNT 4
 
@@ -497,6 +500,8 @@ bool bi_has_outmod(bi_instruction *ins);
 bool bi_has_source_mods(bi_instruction *ins);
 bool bi_is_src_swizzled(bi_instruction *ins, unsigned s);
 bool bi_has_arg(bi_instruction *ins, unsigned arg);
+uint16_t bi_from_bytemask(uint16_t bytemask, unsigned bytes);
+unsigned bi_get_component_count(bi_instruction *ins);
 uint16_t bi_bytemask_of_read_components(bi_instruction *ins, unsigned node);
 
 /* BIR passes */