From: Alyssa Rosenzweig
Date: Thu, 12 Mar 2020 01:41:57 +0000 (-0400)
Subject: pan/bi: Fix vector handling of readmasks
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=e1d95339254361d4a481b35b3d2adeb4ae417d03;p=mesa.git

pan/bi: Fix vector handling of readmasks

The issue was messing with liveness analysis... with Midgard we look at
the writemask to decide how the instruction behaves. Here, since our ALU
is scalar (except for subdivision which doesn't have proper writemasks
anyway) we just look at the component count directly -- either 4 for
vector instructions (essentially - for smaller loads we can replicate
manually without much burden), or 1 for scalar.

Signed-off-by: Alyssa Rosenzweig
Part-of:
---
Reviewer notes (below the "---" separator, not part of the commit message):
 - bi_from_bytemask: the per-component consistency loop now covers
   [c, c + bytes) instead of stopping at `bytes`, so every component is
   checked rather than only the first one. Also assert bytes > 0, since
   the outer loop advances by it.
 - bi_get_component_count: local renamed from `bytes` to `bits`; the
   value is a bit size that gets divided into 32.
 - Header comments added on both new functions. The bir.c hunk headers
   are adjusted for the extra lines; the index blob ids are still those
   of the original commit.
 - Whitespace inside the hunks is reconstructed (8-space indent assumed).

diff --git a/src/panfrost/bifrost/bi_tables.c b/src/panfrost/bifrost/bi_tables.c
index e33b89aae55..a0734e10ec9 100644
--- a/src/panfrost/bifrost/bi_tables.c
+++ b/src/panfrost/bifrost/bi_tables.c
@@ -28,10 +28,10 @@
 
 unsigned bi_class_props[BI_NUM_CLASSES] = {
         [BI_ADD] = BI_GENERIC | BI_MODS | BI_SCHED_ALL,
-        [BI_ATEST] = BI_SCHED_HI_LATENCY,
+        [BI_ATEST] = BI_SCHED_HI_LATENCY | BI_VECTOR,
         [BI_BRANCH] = BI_SCHED_HI_LATENCY,
         [BI_CMP] = BI_GENERIC | BI_MODS | BI_SCHED_ALL,
-        [BI_BLEND] = BI_SCHED_HI_LATENCY,
+        [BI_BLEND] = BI_SCHED_HI_LATENCY | BI_VECTOR,
         [BI_BITWISE] = BI_GENERIC | BI_SCHED_ALL,
         [BI_CONVERT] = BI_SCHED_ALL | BI_SWIZZLABLE,
         [BI_CSEL] = BI_SCHED_FMA,
@@ -39,18 +39,18 @@ unsigned bi_class_props[BI_NUM_CLASSES] = {
         [BI_FMA] = BI_ROUNDMODE | BI_SCHED_FMA,
         [BI_FREXP] = BI_SCHED_ALL,
         [BI_ISUB] = BI_GENERIC | BI_SCHED_ALL,
-        [BI_LOAD] = BI_SCHED_HI_LATENCY,
-        [BI_LOAD_UNIFORM] = BI_SCHED_HI_LATENCY,
-        [BI_LOAD_ATTR] = BI_SCHED_HI_LATENCY,
-        [BI_LOAD_VAR] = BI_SCHED_HI_LATENCY,
+        [BI_LOAD] = BI_SCHED_HI_LATENCY | BI_VECTOR,
+        [BI_LOAD_UNIFORM] = BI_SCHED_HI_LATENCY | BI_VECTOR,
+        [BI_LOAD_ATTR] = BI_SCHED_HI_LATENCY | BI_VECTOR,
+        [BI_LOAD_VAR] = BI_SCHED_HI_LATENCY | BI_VECTOR,
         [BI_LOAD_VAR_ADDRESS] = BI_SCHED_HI_LATENCY,
         [BI_MINMAX] = BI_GENERIC | BI_SCHED_ALL,
         [BI_MOV] = BI_MODS | BI_SCHED_ALL,
         [BI_SHIFT] = BI_SCHED_ALL,
-        [BI_STORE] = BI_SCHED_HI_LATENCY,
-        [BI_STORE_VAR] = BI_SCHED_HI_LATENCY,
+        [BI_STORE] = BI_SCHED_HI_LATENCY | BI_VECTOR,
+        [BI_STORE_VAR] = BI_SCHED_HI_LATENCY | BI_VECTOR,
         [BI_SPECIAL] = BI_SCHED_ADD | BI_SCHED_SLOW,
         [BI_SWIZZLE] = BI_SCHED_ALL | BI_SWIZZLABLE,
-        [BI_TEX] = BI_SCHED_HI_LATENCY,
+        [BI_TEX] = BI_SCHED_HI_LATENCY | BI_VECTOR,
         [BI_ROUND] = BI_GENERIC | BI_ROUNDMODE | BI_SCHED_ALL,
 };
diff --git a/src/panfrost/bifrost/bir.c b/src/panfrost/bifrost/bir.c
index 48c06ab776e..496a394acfa 100644
--- a/src/panfrost/bifrost/bir.c
+++ b/src/panfrost/bifrost/bir.c
@@ -75,10 +75,51 @@ bi_has_arg(bi_instruction *ins, unsigned arg)
         return false;
 }
 
+/* Converts a 16-bit per-byte mask into a per-component mask, where each
+ * component spans `bytes` consecutive bytes: bit d of the result reflects
+ * component d. All bytes of a component must agree (asserted). `bytes` must
+ * be nonzero, since the outer loop advances by it. */
+uint16_t
+bi_from_bytemask(uint16_t bytemask, unsigned bytes)
+{
+        unsigned value = 0;
+
+        assert(bytes > 0);
+
+        for (unsigned c = 0, d = 0; c < 16; c += bytes, ++d) {
+                bool a = (bytemask & (1 << c)) != 0;
+
+                /* Check every byte of this component, [c, c + bytes) */
+                for (unsigned q = c; q < c + bytes; ++q)
+                        assert(((bytemask & (1 << q)) != 0) == a);
+
+                value |= (a << d);
+        }
+
+        return value;
+}
+
+/* Number of components an instruction operates on: 4 for classes flagged
+ * BI_VECTOR, otherwise however many fit in 32 bits at the destination type
+ * size, with sizes below 8 bits treated as 8 bits. */
+unsigned
+bi_get_component_count(bi_instruction *ins)
+{
+        if (bi_class_props[ins->type] & BI_VECTOR) {
+                return 4;
+        } else {
+                /* Stores imply VECTOR */
+                assert(ins->dest_type);
+                unsigned bits = MAX2(nir_alu_type_get_type_size(ins->dest_type), 8);
+                return 32 / bits;
+        }
+}
+
 uint16_t
 bi_bytemask_of_read_components(bi_instruction *ins, unsigned node)
 {
         uint16_t mask = 0x0;
+        unsigned component_count = bi_get_component_count(ins);
 
         bi_foreach_src(ins, s) {
                 if (ins->src[s] != node) continue;
@@ -87,7 +128,7 @@ bi_bytemask_of_read_components(bi_instruction *ins, unsigned node)
                 unsigned bytes = (MAX2(size, 8) / 8);
                 unsigned cmask = (1 << bytes) - 1;
 
-                for (unsigned i = 0; i < ARRAY_SIZE(ins->swizzle[s]); ++i) {
+                for (unsigned i = 0; i < component_count; ++i) {
                         unsigned c = ins->swizzle[s][i];
                         mask |= (cmask << (c * bytes));
                 }
diff --git a/src/panfrost/bifrost/compiler.h b/src/panfrost/bifrost/compiler.h
index 7427a426f73..920ca7d7e06 100644
--- a/src/panfrost/bifrost/compiler.h
+++ b/src/panfrost/bifrost/compiler.h
@@ -110,6 +110,9 @@ extern unsigned bi_class_props[BI_NUM_CLASSES];
  * the end of a clause. Implies ADD */
 #define BI_SCHED_HI_LATENCY ((1 << 7) | BI_SCHED_ADD)
 
+/* Intrinsic is vectorized and should read 4 components regardless of writemask */
+#define BI_VECTOR (1 << 8)
+
 /* It can't get any worse than csel4... can it? */
 #define BIR_SRC_COUNT 4
 
@@ -497,6 +500,8 @@ bool bi_has_outmod(bi_instruction *ins);
 bool bi_has_source_mods(bi_instruction *ins);
 bool bi_is_src_swizzled(bi_instruction *ins, unsigned s);
 bool bi_has_arg(bi_instruction *ins, unsigned arg);
+uint16_t bi_from_bytemask(uint16_t bytemask, unsigned bytes);
+unsigned bi_get_component_count(bi_instruction *ins);
 uint16_t bi_bytemask_of_read_components(bi_instruction *ins, unsigned node);
 
 /* BIR passes */