From e1d95339254361d4a481b35b3d2adeb4ae417d03 Mon Sep 17 00:00:00 2001
From: Alyssa Rosenzweig
Date: Wed, 11 Mar 2020 21:41:57 -0400
Subject: [PATCH] pan/bi: Fix vector handling of readmasks

The issue was messing with liveness analysis... with Midgard we look at
the writemask to decide how the instruction behaves. Here, since our ALU
is scalar (except for subdivision which doesn't have proper writemasks
anyway) we just look at the component count directly -- either 4 for
vector instructions (essentially - for smaller loads we can replicate
manually without much burden), or 1 for scalar.

Signed-off-by: Alyssa Rosenzweig
Part-of:
---
Review notes (text between the three-dash line and the diffstat is not
part of the commit message):

 * bi_from_bytemask() collapses a 16-bit per-byte mask into a
   per-component mask, one output bit per group of `bytes` input bytes,
   asserting that all bytes of a group agree. The inner consistency loop
   is now bounded by `c + bytes`; with the previous bound (`q < bytes`)
   it only ever ran for the first component.
 * bi_get_component_count() returns 4 for BI_VECTOR classes and
   otherwise 32 divided by the destination bit size. The local holds a
   bit size, so it is named `bits`, and the result is clamped to at
   least 1 so a destination wider than 32 bits no longer reports zero
   read components.
 * Both changes edit existing added lines in place, so hunk headers and
   the diffstat are unchanged. The post-image blob id on the bir.c index
   line predates these two edits.
 * Leading whitespace inside the hunks was re-created from text whose
   line breaks and indentation had been collapsed; if the target tree
   indents differently, apply with `git apply --ignore-whitespace`.

 src/panfrost/bifrost/bi_tables.c | 18 ++++++++---------
 src/panfrost/bifrost/bir.c       | 33 +++++++++++++++++++++++++++++++-
 src/panfrost/bifrost/compiler.h  |  5 +++++
 3 files changed, 46 insertions(+), 10 deletions(-)

diff --git a/src/panfrost/bifrost/bi_tables.c b/src/panfrost/bifrost/bi_tables.c
index e33b89aae55..a0734e10ec9 100644
--- a/src/panfrost/bifrost/bi_tables.c
+++ b/src/panfrost/bifrost/bi_tables.c
@@ -28,10 +28,10 @@
 
 unsigned bi_class_props[BI_NUM_CLASSES] = {
         [BI_ADD] = BI_GENERIC | BI_MODS | BI_SCHED_ALL,
-        [BI_ATEST] = BI_SCHED_HI_LATENCY,
+        [BI_ATEST] = BI_SCHED_HI_LATENCY | BI_VECTOR,
         [BI_BRANCH] = BI_SCHED_HI_LATENCY,
         [BI_CMP] = BI_GENERIC | BI_MODS | BI_SCHED_ALL,
-        [BI_BLEND] = BI_SCHED_HI_LATENCY,
+        [BI_BLEND] = BI_SCHED_HI_LATENCY | BI_VECTOR,
         [BI_BITWISE] = BI_GENERIC | BI_SCHED_ALL,
         [BI_CONVERT] = BI_SCHED_ALL | BI_SWIZZLABLE,
         [BI_CSEL] = BI_SCHED_FMA,
@@ -39,18 +39,18 @@ unsigned bi_class_props[BI_NUM_CLASSES] = {
         [BI_FMA] = BI_ROUNDMODE | BI_SCHED_FMA,
         [BI_FREXP] = BI_SCHED_ALL,
         [BI_ISUB] = BI_GENERIC | BI_SCHED_ALL,
-        [BI_LOAD] = BI_SCHED_HI_LATENCY,
-        [BI_LOAD_UNIFORM] = BI_SCHED_HI_LATENCY,
-        [BI_LOAD_ATTR] = BI_SCHED_HI_LATENCY,
-        [BI_LOAD_VAR] = BI_SCHED_HI_LATENCY,
+        [BI_LOAD] = BI_SCHED_HI_LATENCY | BI_VECTOR,
+        [BI_LOAD_UNIFORM] = BI_SCHED_HI_LATENCY | BI_VECTOR,
+        [BI_LOAD_ATTR] = BI_SCHED_HI_LATENCY | BI_VECTOR,
+        [BI_LOAD_VAR] = BI_SCHED_HI_LATENCY | BI_VECTOR,
         [BI_LOAD_VAR_ADDRESS] = BI_SCHED_HI_LATENCY,
         [BI_MINMAX] = BI_GENERIC | BI_SCHED_ALL,
         [BI_MOV] = BI_MODS | BI_SCHED_ALL,
         [BI_SHIFT] = BI_SCHED_ALL,
-        [BI_STORE] = BI_SCHED_HI_LATENCY,
-        [BI_STORE_VAR] = BI_SCHED_HI_LATENCY,
+        [BI_STORE] = BI_SCHED_HI_LATENCY | BI_VECTOR,
+        [BI_STORE_VAR] = BI_SCHED_HI_LATENCY | BI_VECTOR,
         [BI_SPECIAL] = BI_SCHED_ADD | BI_SCHED_SLOW,
         [BI_SWIZZLE] = BI_SCHED_ALL | BI_SWIZZLABLE,
-        [BI_TEX] = BI_SCHED_HI_LATENCY,
+        [BI_TEX] = BI_SCHED_HI_LATENCY | BI_VECTOR,
         [BI_ROUND] = BI_GENERIC | BI_ROUNDMODE | BI_SCHED_ALL,
 };
diff --git a/src/panfrost/bifrost/bir.c b/src/panfrost/bifrost/bir.c
index 48c06ab776e..496a394acfa 100644
--- a/src/panfrost/bifrost/bir.c
+++ b/src/panfrost/bifrost/bir.c
@@ -75,10 +75,41 @@ bi_has_arg(bi_instruction *ins, unsigned arg)
         return false;
 }
 
+uint16_t
+bi_from_bytemask(uint16_t bytemask, unsigned bytes)
+{
+        unsigned value = 0;
+
+        for (unsigned c = 0, d = 0; c < 16; c += bytes, ++d) {
+                bool a = (bytemask & (1 << c)) != 0;
+
+                for (unsigned q = c; q < c + bytes; ++q)
+                        assert(((bytemask & (1 << q)) != 0) == a);
+
+                value |= (a << d);
+        }
+
+        return value;
+}
+
+unsigned
+bi_get_component_count(bi_instruction *ins)
+{
+        if (bi_class_props[ins->type] & BI_VECTOR) {
+                return 4;
+        } else {
+                /* Stores imply VECTOR */
+                assert(ins->dest_type);
+                unsigned bits = MAX2(nir_alu_type_get_type_size(ins->dest_type), 8);
+                return MAX2(32 / bits, 1);
+        }
+}
+
 uint16_t
 bi_bytemask_of_read_components(bi_instruction *ins, unsigned node)
 {
         uint16_t mask = 0x0;
+        unsigned component_count = bi_get_component_count(ins);
 
         bi_foreach_src(ins, s) {
                 if (ins->src[s] != node) continue;
@@ -87,7 +118,7 @@ bi_bytemask_of_read_components(bi_instruction *ins, unsigned node)
                 unsigned bytes = (MAX2(size, 8) / 8);
                 unsigned cmask = (1 << bytes) - 1;
 
-                for (unsigned i = 0; i < ARRAY_SIZE(ins->swizzle[s]); ++i) {
+                for (unsigned i = 0; i < component_count; ++i) {
                         unsigned c = ins->swizzle[s][i];
                         mask |= (cmask << (c * bytes));
                 }
diff --git a/src/panfrost/bifrost/compiler.h b/src/panfrost/bifrost/compiler.h
index 7427a426f73..920ca7d7e06 100644
--- a/src/panfrost/bifrost/compiler.h
+++ b/src/panfrost/bifrost/compiler.h
@@ -110,6 +110,9 @@ extern unsigned bi_class_props[BI_NUM_CLASSES];
  * the end of a clause. Implies ADD */
 #define BI_SCHED_HI_LATENCY ((1 << 7) | BI_SCHED_ADD)
 
+/* Intrinsic is vectorized and should read 4 components regardless of writemask */
+#define BI_VECTOR (1 << 8)
+
 /* It can't get any worse than csel4... can it? */
 #define BIR_SRC_COUNT 4
 
@@ -497,6 +500,8 @@ bool bi_has_outmod(bi_instruction *ins);
 bool bi_has_source_mods(bi_instruction *ins);
 bool bi_is_src_swizzled(bi_instruction *ins, unsigned s);
 bool bi_has_arg(bi_instruction *ins, unsigned arg);
+uint16_t bi_from_bytemask(uint16_t bytemask, unsigned bytes);
+unsigned bi_get_component_count(bi_instruction *ins);
 uint16_t bi_bytemask_of_read_components(bi_instruction *ins, unsigned node);
 
 /* BIR passes */
-- 
2.30.2