/* number of sync bits: */
uint16_t ss, sy;
+
+ /* estimate of number of cycles stalled on (ss) */
+ uint16_t sstall;
+
+ uint16_t last_baryf; /* instruction # of last varying fetch */
};
struct ir3_register {
} flags;
+ /* used for cat5 instructions, but also for internal/IR level
+ * tracking of what registers are read/written by an instruction.
+ * wrmask may be a bad name since it is used to represent both
+ * src and dst that touch multiple adjacent registers.
+ */
+ unsigned wrmask : 16; /* up to vec16 */
+
+ /* for relative addressing, 32bits for array size is too small,
+ * but otoh we don't need to deal with disjoint sets, so instead
+ * use a simple size field (number of scalar components).
+ *
+ * Note the size field isn't important for relative const (since
+ * we don't have to do register allocation for constants).
+ */
+ unsigned size : 15;
+
bool merged : 1; /* half-regs conflict with full regs (ie >= a6xx) */
/* normal registers:
* back to a previous instruction that we depend on).
*/
struct ir3_instruction *instr;
-
- union {
- /* used for cat5 instructions, but also for internal/IR level
- * tracking of what registers are read/written by an instruction.
- * wrmask may be a bad name since it is used to represent both
- * src and dst that touch multiple adjacent registers.
- */
- unsigned wrmask;
- /* for relative addressing, 32bits for array size is too small,
- * but otoh we don't need to deal with disjoint sets, so instead
- * use a simple size field (number of scalar components).
- */
- unsigned size;
- };
};
/*
IR3_INSTR_S2EN = 0x200,
IR3_INSTR_G = 0x400,
IR3_INSTR_SAT = 0x800,
+ /* (cat5/cat6) Bindless */
+ IR3_INSTR_B = 0x1000,
+ /* (cat5-only) Get some parts of the encoding from a1.x */
+ IR3_INSTR_A1EN = 0x2000,
/* meta-flags, for intermediate stages of IR, ie.
* before register assignment is done:
*/
- IR3_INSTR_MARK = 0x1000,
- IR3_INSTR_UNUSED= 0x2000,
+ IR3_INSTR_MARK = 0x4000,
+ IR3_INSTR_UNUSED= 0x8000,
} flags;
uint8_t repeat;
uint8_t nop;
} cat2;
struct {
unsigned samp, tex;
+ unsigned tex_base : 3;
type_t type;
} cat5;
struct {
int iim_val : 3; /* for ldgb/stgb, # of components */
unsigned d : 3;
bool typed : 1;
+ unsigned base : 3;
} cat6;
struct {
unsigned w : 1; /* write */
struct {
int off; /* component/offset */
} split;
+ struct {
+ /* for output collects, this maps back to the entry in the
+ * ir3_shader_variant::outputs table.
+ */
+ int outidx;
+ } collect;
struct {
unsigned samp, tex;
unsigned input_offset;
+ unsigned samp_base : 3;
+ unsigned tex_base : 3;
} prefetch;
struct {
+ /* maps back to entry in ir3_shader_variant::inputs table: */
+ int inidx;
/* for sysvals, identifies the sysval type. Mostly so we can
* identify the special cases where a sysval should not be DCE'd
* (currently, just pre-fs texture fetch)
*/
void *data;
+ /**
+ * Valid if pass calls ir3_find_ssa_uses().. see foreach_ssa_use()
+ */
+ struct set *uses;
+
int sun; /* Sethi–Ullman number, used by sched */
int use_count; /* currently just updated/used by cp */
#ifdef DEBUG
uint32_t serialno;
#endif
+
+ // TODO only computerator/assembler:
+ int line;
};
static inline struct ir3_instruction *
struct ir3_compiler *compiler;
gl_shader_stage type;
- unsigned ninputs, noutputs;
- struct ir3_instruction **inputs;
- struct ir3_instruction **outputs;
+ DECLARE_ARRAY(struct ir3_instruction *, inputs);
+ DECLARE_ARRAY(struct ir3_instruction *, outputs);
/* Track bary.f (and ldlv) instructions.. this is needed in
* scheduling to ensure that all varying fetches happen before
* convenient list of instructions that reference some address
* register simplifies this.
*/
- DECLARE_ARRAY(struct ir3_instruction *, indirects);
+ DECLARE_ARRAY(struct ir3_instruction *, a0_users);
+
+ /* same for a1.x: */
+ DECLARE_ARRAY(struct ir3_instruction *, a1_users);
/* and same for instructions that consume predicate register: */
DECLARE_ARRAY(struct ir3_instruction *, predicates);
unsigned base; /* base vreg name */
unsigned reg; /* base physical reg */
uint16_t start_ip, end_ip;
+
+ /* Indicates if half-precision */
+ bool half;
};
struct ir3_array * ir3_lookup_array(struct ir3 *ir, unsigned id);
#endif
}
-struct ir3 * ir3_create(struct ir3_compiler *compiler,
- gl_shader_stage type, unsigned nin, unsigned nout);
+struct ir3 * ir3_create(struct ir3_compiler *compiler, gl_shader_stage type);
void ir3_destroy(struct ir3 *shader);
void * ir3_assemble(struct ir3 *shader,
struct ir3_info *info, uint32_t gpu_id);
unsigned ir3_count_instructions(struct ir3 *ir);
-static inline int ir3_instr_regno(struct ir3_instruction *instr,
- struct ir3_register *reg)
-{
- unsigned i;
- for (i = 0; i < instr->regs_count; i++)
- if (reg == instr->regs[i])
- return i;
- return -1;
-}
+void ir3_find_ssa_uses(struct ir3 *ir, void *mem_ctx, bool falsedeps);
+#include "util/set.h"
+#define foreach_ssa_use(__use, __instr) \
+ for (struct ir3_instruction *__use = (void *)~0; \
+ __use && (__instr)->uses; __use = NULL) \
+ set_foreach ((__instr)->uses, __entry) \
+ if ((__use = (void *)__entry->key))
#define MAX_ARRAYS 16
return reg->num & 0x3;
}
+#define INVALID_REG regid(63, 0)
+#define VALIDREG(r) ((r) != INVALID_REG)
+#define CONDREG(r, val) COND(VALIDREG(r), (val))
+
static inline bool is_flow(struct ir3_instruction *instr)
{
return (opc_cat(instr->opc) == 0);
static inline bool is_kill(struct ir3_instruction *instr)
{
- return instr->opc == OPC_KILL || instr->opc == OPC_CONDEND;
+ return instr->opc == OPC_KILL;
}
static inline bool is_nop(struct ir3_instruction *instr)
dst = instr->regs[0];
- /* mov's that write to a0.x or p0.x are special: */
+ /* mov's that write to a0 or p0.x are special: */
if (dst->num == regid(REG_P0, 0))
return false;
- if (dst->num == regid(REG_A0, 0))
+ if (reg_num(dst) == REG_A0)
return false;
if (dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY))
return true;
}
+/* A move from const, which changes size but not type, can also be
+ * folded into dest instruction in some cases.
+ */
+static inline bool is_const_mov(struct ir3_instruction *instr)
+{
+	if (instr->opc != OPC_MOV)
+		return false;
+
+	if (!(instr->regs[1]->flags & IR3_REG_CONST))
+		return false;
+
+	type_t src_type = instr->cat1.src_type;
+	type_t dst_type = instr->cat1.dst_type;
+
+	/* src and dst must stay within the same type class (float/uint/sint);
+	 * only the size may differ:
+	 */
+	return (type_float(src_type) && type_float(dst_type)) ||
+		(type_uint(src_type) && type_uint(dst_type)) ||
+		(type_sint(src_type) && type_sint(dst_type));
+}
+
static inline bool is_alu(struct ir3_instruction *instr)
{
return (1 <= opc_cat(instr->opc)) && (opc_cat(instr->opc) <= 3);
return (opc_cat(instr->opc) == 5);
}
+/* cat5 texture instruction, or the meta pre-fs texture-prefetch: */
+static inline bool is_tex_or_prefetch(struct ir3_instruction *instr)
+{
+	return is_tex(instr) || (instr->opc == OPC_META_TEX_PREFETCH);
+}
+
static inline bool is_mem(struct ir3_instruction *instr)
{
return (opc_cat(instr->opc) == 6);
return (opc_cat(instr->opc) == 7);
}
+/* true if the instruction's dst (regs[0]) is a half-precision register: */
+static inline bool
+is_half(struct ir3_instruction *instr)
+{
+	return !!(instr->regs[0]->flags & IR3_REG_HALF);
+}
+
+/* true if the instruction's dst (regs[0]) is a "high" register: */
+static inline bool
+is_high(struct ir3_instruction *instr)
+{
+	return !!(instr->regs[0]->flags & IR3_REG_HIGH);
+}
+
static inline bool
is_store(struct ir3_instruction *instr)
{
static inline unsigned dest_regs(struct ir3_instruction *instr)
{
- if ((instr->regs_count == 0) || is_store(instr))
+ if ((instr->regs_count == 0) || is_store(instr) || is_flow(instr))
return 0;
return util_last_bit(instr->regs[0]->wrmask);
}
-static inline bool writes_addr(struct ir3_instruction *instr)
+/* Does the instruction write the address register a0.x? */
+static inline bool writes_addr0(struct ir3_instruction *instr)
{
	if (instr->regs_count > 0) {
		struct ir3_register *dst = instr->regs[0];
-		return reg_num(dst) == REG_A0;
+		return dst->num == regid(REG_A0, 0);
+	}
+	return false;
+}
+
+/* Does the instruction write a1.x (ie. regid(REG_A0, 1))? */
+static inline bool writes_addr1(struct ir3_instruction *instr)
+{
+	if (instr->regs_count > 0) {
+		struct ir3_register *dst = instr->regs[0];
+		return dst->num == regid(REG_A0, 1);
	}
	return false;
}
/* iterators for shader inputs: */
#define foreach_input_n(__ininstr, __cnt, __ir) \
- for (unsigned __cnt = 0; __cnt < (__ir)->ninputs; __cnt++) \
+ for (unsigned __cnt = 0; __cnt < (__ir)->inputs_count; __cnt++) \
if ((__ininstr = (__ir)->inputs[__cnt]))
#define foreach_input(__ininstr, __ir) \
foreach_input_n(__ininstr, __i, __ir)
/* iterators for shader outputs: */
#define foreach_output_n(__outinstr, __cnt, __ir) \
- for (unsigned __cnt = 0; __cnt < (__ir)->noutputs; __cnt++) \
+ for (unsigned __cnt = 0; __cnt < (__ir)->outputs_count; __cnt++) \
if ((__outinstr = (__ir)->outputs[__cnt]))
#define foreach_output(__outinstr, __ir) \
foreach_output_n(__outinstr, __i, __ir)
+/* iterators for instructions: */
+#define foreach_instr(__instr, __list) \
+ list_for_each_entry(struct ir3_instruction, __instr, __list, node)
+#define foreach_instr_rev(__instr, __list) \
+ list_for_each_entry_rev(struct ir3_instruction, __instr, __list, node)
+#define foreach_instr_safe(__instr, __list) \
+ list_for_each_entry_safe(struct ir3_instruction, __instr, __list, node)
+
+/* iterators for blocks: */
+#define foreach_block(__block, __list) \
+ list_for_each_entry(struct ir3_block, __block, __list, node)
+#define foreach_block_safe(__block, __list) \
+ list_for_each_entry_safe(struct ir3_block, __block, __list, node)
+
+/* iterators for arrays: */
+#define foreach_array(__array, __list) \
+ list_for_each_entry(struct ir3_array, __array, __list, node)
+
/* dump: */
void ir3_print(struct ir3 *ir);
void ir3_print_instr(struct ir3_instruction *instr);
+/* delay calculation: */
+int ir3_delayslots(struct ir3_instruction *assigner,
+ struct ir3_instruction *consumer, unsigned n, bool soft);
+unsigned ir3_delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
+ bool soft, bool pred);
+void ir3_remove_nops(struct ir3 *ir);
+
/* depth calculation: */
struct ir3_shader_variant;
-int ir3_delayslots(struct ir3_instruction *assigner,
- struct ir3_instruction *consumer, unsigned n);
-void ir3_insert_by_depth(struct ir3_instruction *instr, struct list_head *list);
void ir3_depth(struct ir3 *ir, struct ir3_shader_variant *so);
+/* fp16 conversion folding */
+void ir3_cf(struct ir3 *ir);
+
/* copy-propagate: */
void ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so);
void ir3_sched_add_deps(struct ir3 *ir);
int ir3_sched(struct ir3 *ir);
-void ir3_a6xx_fixup_atomic_dests(struct ir3 *ir, struct ir3_shader_variant *so);
+struct ir3_context;
+int ir3_postsched(struct ir3_context *ctx);
+
+bool ir3_a6xx_fixup_atomic_dests(struct ir3 *ir, struct ir3_shader_variant *so);
/* register assignment: */
struct ir3_ra_reg_set * ir3_ra_alloc_reg_set(struct ir3_compiler *compiler);
int ir3_ra(struct ir3_shader_variant *v, struct ir3_instruction **precolor, unsigned nprecolor);
/* legalize: */
-void ir3_legalize(struct ir3 *ir, bool *has_ssbo, bool *need_pixlod, int *max_bary);
+void ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary);
+
+/* Does the shader contain any instruction with latency worth hiding
+ * (texture fetch/prefetch, or non-local memory load)?
+ */
+static inline bool
+ir3_has_latency_to_hide(struct ir3 *ir)
+{
+	/* VS/GS/TCS/TESS co-exist with frag shader invocations, but we don't
+	 * know the nature of the fragment shader. Just assume it will have
+	 * latency to hide:
+	 */
+	if (ir->type != MESA_SHADER_FRAGMENT)
+		return true;
+
+	foreach_block (block, &ir->block_list) {
+		foreach_instr (instr, &block->instr_list) {
+			if (is_tex_or_prefetch(instr))
+				return true;
+
+			if (is_load(instr)) {
+				switch (instr->opc) {
+				case OPC_LDLV:
+				case OPC_LDL:
+				case OPC_LDLW:
+					/* local/varying loads are not counted as
+					 * latency worth hiding -- NOTE(review):
+					 * presumably because they are low-latency;
+					 * confirm.
+					 */
+					break;
+				default:
+					return true;
+				}
+			}
+		}
+	}
+
+	return false;
+}
/* ************************************************************************* */
/* instruction helpers */
mov->cat1.src_type = type;
mov->cat1.dst_type = type;
__ssa_dst(mov)->flags |= flags;
- ir3_reg_create(mov, 0, IR3_REG_IMMED)->uim_val = val;
+ ir3_reg_create(mov, 0, IR3_REG_IMMED | flags)->uim_val = val;
return mov;
}
#define INSTR4(name) __INSTR4(0, name, OPC_##name)
/* cat0 instructions: */
-INSTR0(BR)
+INSTR1(BR)
INSTR0(JUMP)
INSTR1(KILL)
INSTR0(END)
INSTR0(CHSH)
INSTR0(CHMASK)
-INSTR1(CONDEND)
-INSTR0(ENDPATCH)
+INSTR1(IF)
+INSTR0(ELSE)
+INSTR0(ENDIF)
/* cat2 instructions, most 2 src but some 1 src: */
INSTR2(ADD_F)
INSTR3(MAD_S24)
INSTR3(MAD_F16)
INSTR3(MAD_F32)
+/* NOTE: SEL_B32 checks for zero vs nonzero */
INSTR3(SEL_B16)
INSTR3(SEL_B32)
INSTR3(SEL_S16)
/* cat4 instructions: */
INSTR1(RCP)
INSTR1(RSQ)
+INSTR1(HRSQ)
INSTR1(LOG2)
+INSTR1(HLOG2)
INSTR1(EXP2)
+INSTR1(HEXP2)
INSTR1(SIN)
INSTR1(COS)
INSTR1(SQRT)
/* cat5 instructions: */
INSTR1(DSX)
+INSTR1(DSXPP_1)
INSTR1(DSY)
+INSTR1(DSYPP_1)
INSTR1F(3D, DSX)
INSTR1F(3D, DSY)
INSTR1(RGETPOS)
struct ir3_instruction *sam;
sam = ir3_instr_create(block, opc);
- sam->flags |= flags | IR3_INSTR_S2EN;
+ sam->flags |= flags;
__ssa_dst(sam)->wrmask = wrmask;
- __ssa_src(sam, samp_tex, IR3_REG_HALF);
+ if (flags & IR3_INSTR_S2EN) {
+ __ssa_src(sam, samp_tex, IR3_REG_HALF);
+ }
if (src0) {
__ssa_src(sam, src0, 0)->wrmask = (1 << (src0->regs_count - 1)) - 1;
}
INSTR2(ATOMIC_AND)
INSTR2(ATOMIC_OR)
INSTR2(ATOMIC_XOR)
+INSTR2(LDC)
#if GPU >= 600
INSTR3(STIB);
INSTR2(LDIB);
/* split this out or find some helper to use.. like main/bitset.h.. */
#include <string.h>
+#include "util/bitset.h"
#define MAX_REG 256
-typedef uint8_t regmask_t[2 * MAX_REG / 8];
+typedef BITSET_DECLARE(regmask_t, 2 * MAX_REG);
-static inline unsigned regmask_idx(struct ir3_register *reg)
+static inline bool
+__regmask_get(regmask_t *regmask, struct ir3_register *reg, unsigned n)
{
- unsigned num = (reg->flags & IR3_REG_RELATIV) ? reg->array.offset : reg->num;
- debug_assert(num < MAX_REG);
- if (reg->flags & IR3_REG_HALF) {
- if (reg->merged) {
- num /= 2;
+ if (reg->merged) {
+ /* a6xx+ case, with merged register file, we track things in terms
+		 * of half-precision registers, with a full precision register
+ * using two half-precision slots:
+ */
+ if (reg->flags & IR3_REG_HALF) {
+ return BITSET_TEST(*regmask, n);
} else {
- num += MAX_REG;
+ n *= 2;
+ return BITSET_TEST(*regmask, n) || BITSET_TEST(*regmask, n+1);
}
+ } else {
+ /* pre a6xx case, with separate register file for half and full
+ * precision:
+ */
+ if (reg->flags & IR3_REG_HALF)
+ n += MAX_REG;
+ return BITSET_TEST(*regmask, n);
+ }
+}
+
+static inline void
+__regmask_set(regmask_t *regmask, struct ir3_register *reg, unsigned n)
+{
+ if (reg->merged) {
+ /* a6xx+ case, with merged register file, we track things in terms
+		 * of half-precision registers, with a full precision register
+ * using two half-precision slots:
+ */
+ if (reg->flags & IR3_REG_HALF) {
+ BITSET_SET(*regmask, n);
+ } else {
+ n *= 2;
+ BITSET_SET(*regmask, n);
+ BITSET_SET(*regmask, n+1);
+ }
+ } else {
+ /* pre a6xx case, with separate register file for half and full
+ * precision:
+ */
+ if (reg->flags & IR3_REG_HALF)
+ n += MAX_REG;
+ BITSET_SET(*regmask, n);
}
- return num;
}
static inline void regmask_init(regmask_t *regmask)
static inline void regmask_set(regmask_t *regmask, struct ir3_register *reg)
{
- unsigned idx = regmask_idx(reg);
if (reg->flags & IR3_REG_RELATIV) {
- unsigned i;
- for (i = 0; i < reg->size; i++, idx++)
- (*regmask)[idx / 8] |= 1 << (idx % 8);
+ for (unsigned i = 0; i < reg->size; i++)
+ __regmask_set(regmask, reg, reg->array.offset + i);
} else {
- unsigned mask;
- for (mask = reg->wrmask; mask; mask >>= 1, idx++)
+ for (unsigned mask = reg->wrmask, n = reg->num; mask; mask >>= 1, n++)
if (mask & 1)
- (*regmask)[idx / 8] |= 1 << (idx % 8);
+ __regmask_set(regmask, reg, n);
}
}
(*dst)[i] = (*a)[i] | (*b)[i];
}
-/* set bits in a if not set in b, conceptually:
- * a |= (reg & ~b)
- */
-static inline void regmask_set_if_not(regmask_t *a,
- struct ir3_register *reg, regmask_t *b)
-{
- unsigned idx = regmask_idx(reg);
- if (reg->flags & IR3_REG_RELATIV) {
- unsigned i;
- for (i = 0; i < reg->size; i++, idx++)
- if (!((*b)[idx / 8] & (1 << (idx % 8))))
- (*a)[idx / 8] |= 1 << (idx % 8);
- } else {
- unsigned mask;
- for (mask = reg->wrmask; mask; mask >>= 1, idx++)
- if (mask & 1)
- if (!((*b)[idx / 8] & (1 << (idx % 8))))
- (*a)[idx / 8] |= 1 << (idx % 8);
- }
-}
-
static inline bool regmask_get(regmask_t *regmask,
struct ir3_register *reg)
{
- unsigned idx = regmask_idx(reg);
if (reg->flags & IR3_REG_RELATIV) {
- unsigned i;
- for (i = 0; i < reg->size; i++, idx++)
- if ((*regmask)[idx / 8] & (1 << (idx % 8)))
+ for (unsigned i = 0; i < reg->size; i++)
+ if (__regmask_get(regmask, reg, reg->array.offset + i))
return true;
} else {
- unsigned mask;
- for (mask = reg->wrmask; mask; mask >>= 1, idx++)
+ for (unsigned mask = reg->wrmask, n = reg->num; mask; mask >>= 1, n++)
if (mask & 1)
- if ((*regmask)[idx / 8] & (1 << (idx % 8)))
+ if (__regmask_get(regmask, reg, n))
return true;
}
return false;