unsigned bi_class_props[BI_NUM_CLASSES] = {
        [BI_ADD] = BI_GENERIC | BI_MODS | BI_SCHED_ALL,
-        [BI_ATEST] = BI_SCHED_HI_LATENCY,
+        [BI_ATEST] = BI_SCHED_HI_LATENCY | BI_VECTOR,
        [BI_BRANCH] = BI_SCHED_HI_LATENCY,
        [BI_CMP] = BI_GENERIC | BI_MODS | BI_SCHED_ALL,
-        [BI_BLEND] = BI_SCHED_HI_LATENCY,
+        [BI_BLEND] = BI_SCHED_HI_LATENCY | BI_VECTOR,
        [BI_BITWISE] = BI_GENERIC | BI_SCHED_ALL,
        [BI_CONVERT] = BI_SCHED_ALL | BI_SWIZZLABLE,
        [BI_CSEL] = BI_SCHED_FMA,
        [BI_FMA] = BI_ROUNDMODE | BI_SCHED_FMA,
        [BI_FREXP] = BI_SCHED_ALL,
        [BI_ISUB] = BI_GENERIC | BI_SCHED_ALL,
-        [BI_LOAD] = BI_SCHED_HI_LATENCY,
-        [BI_LOAD_UNIFORM] = BI_SCHED_HI_LATENCY,
-        [BI_LOAD_ATTR] = BI_SCHED_HI_LATENCY,
-        [BI_LOAD_VAR] = BI_SCHED_HI_LATENCY,
+        [BI_LOAD] = BI_SCHED_HI_LATENCY | BI_VECTOR,
+        [BI_LOAD_UNIFORM] = BI_SCHED_HI_LATENCY | BI_VECTOR,
+        [BI_LOAD_ATTR] = BI_SCHED_HI_LATENCY | BI_VECTOR,
+        [BI_LOAD_VAR] = BI_SCHED_HI_LATENCY | BI_VECTOR,
        [BI_LOAD_VAR_ADDRESS] = BI_SCHED_HI_LATENCY,
        [BI_MINMAX] = BI_GENERIC | BI_SCHED_ALL,
        [BI_MOV] = BI_MODS | BI_SCHED_ALL,
        [BI_SHIFT] = BI_SCHED_ALL,
-        [BI_STORE] = BI_SCHED_HI_LATENCY,
-        [BI_STORE_VAR] = BI_SCHED_HI_LATENCY,
+        [BI_STORE] = BI_SCHED_HI_LATENCY | BI_VECTOR,
+        [BI_STORE_VAR] = BI_SCHED_HI_LATENCY | BI_VECTOR,
        [BI_SPECIAL] = BI_SCHED_ADD | BI_SCHED_SLOW,
        [BI_SWIZZLE] = BI_SCHED_ALL | BI_SWIZZLABLE,
-        [BI_TEX] = BI_SCHED_HI_LATENCY,
+        [BI_TEX] = BI_SCHED_HI_LATENCY | BI_VECTOR,
        [BI_ROUND] = BI_GENERIC | BI_ROUNDMODE | BI_SCHED_ALL,
};
        return false;
}
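+
+/* Converts a bytemask (one bit per byte) into a per-component mask for
+ * components of the given size in bytes, asserting that the bytemask does
+ * not partially cover any component. For example, bi_from_bytemask(0x00FF, 2)
+ * yields 0xF, selecting the low four 16-bit components. */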
+uint16_t
+bi_from_bytemask(uint16_t bytemask, unsigned bytes)
+{
+        unsigned value = 0;
+
+        for (unsigned c = 0, d = 0; c < 16; c += bytes, ++d) {
+                bool a = (bytemask & (1 << c)) != 0;
+
+                /* Every byte of the component must agree with its first byte */
+                for (unsigned q = c; q < c + bytes; ++q)
+                        assert(((bytemask & (1 << q)) != 0) == a);
+
+                value |= (a << d);
+        }
+
+        return value;
+}
+
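+/* Returns the number of components an instruction accesses: a full vec4 for
+ * BI_VECTOR instructions, otherwise however many components of the
+ * destination type fit in a 32-bit word (e.g. two components for fp16) */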
+unsigned
+bi_get_component_count(bi_instruction *ins)
+{
+        if (bi_class_props[ins->type] & BI_VECTOR) {
+                return 4;
+        } else {
+                /* Stores have no dest_type, but they imply VECTOR and so
+                 * take the path above */
+                assert(ins->dest_type);
+
+                /* nir_alu_type_get_type_size returns a size in bits */
+                unsigned size = MAX2(nir_alu_type_get_type_size(ins->dest_type), 8);
+                return 32 / size;
+        }
+}
+
uint16_t
bi_bytemask_of_read_components(bi_instruction *ins, unsigned node)
{
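+        /* Vector instructions read a full vec4 regardless of the writemask,
+         * so derive the read width from the instruction class rather than
+         * from the size of the swizzle array */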
        uint16_t mask = 0x0;
+        unsigned component_count = bi_get_component_count(ins);
        bi_foreach_src(ins, s) {
                if (ins->src[s] != node) continue;
                unsigned size = nir_alu_type_get_type_size(ins->src_types[s]);
                unsigned bytes = (MAX2(size, 8) / 8);
                unsigned cmask = (1 << bytes) - 1;
-                for (unsigned i = 0; i < ARRAY_SIZE(ins->swizzle[s]); ++i) {
+                for (unsigned i = 0; i < component_count; ++i) {
                        unsigned c = ins->swizzle[s][i];
                        mask |= (cmask << (c * bytes));
                }
* the end of a clause. Implies ADD */
#define BI_SCHED_HI_LATENCY ((1 << 7) | BI_SCHED_ADD)
+/* Intrinsic is vectorized and should read all 4 components regardless of the
+ * writemask */
+#define BI_VECTOR (1 << 8)
+
/* It can't get any worse than csel4... can it? */
#define BIR_SRC_COUNT 4
bool bi_has_source_mods(bi_instruction *ins);
bool bi_is_src_swizzled(bi_instruction *ins, unsigned s);
bool bi_has_arg(bi_instruction *ins, unsigned arg);
+uint16_t bi_from_bytemask(uint16_t bytemask, unsigned bytes);
+unsigned bi_get_component_count(bi_instruction *ins);
uint16_t bi_bytemask_of_read_components(bi_instruction *ins, unsigned node);
/* BIR passes */