/* number of sync bits: */
uint16_t ss, sy;
+
+ /* estimate of number of cycles stalled on (ss) */
+ uint16_t sstall;
+
+ uint16_t last_baryf; /* instruction # of last varying fetch */
};
struct ir3_register {
} flags;
+ /* used for cat5 instructions, but also for internal/IR level
+ * tracking of what registers are read/written by an instruction.
+ * wrmask may be a bad name since it is used to represent both
+ * src and dst that touch multiple adjacent registers.
+ */
+ unsigned wrmask : 16; /* up to vec16 */
+
+ /* for relative addressing, 32bits for array size is too small,
+ * but otoh we don't need to deal with disjoint sets, so instead
+ * use a simple size field (number of scalar components).
+ *
+ * Note the size field isn't important for relative const (since
+ * we don't have to do register allocation for constants).
+ */
+ unsigned size : 15;
+
bool merged : 1; /* half-regs conflict with full regs (ie >= a6xx) */
/* normal registers:
* back to a previous instruction that we depend on).
*/
struct ir3_instruction *instr;
-
- union {
- /* used for cat5 instructions, but also for internal/IR level
- * tracking of what registers are read/written by an instruction.
- * wrmask may be a bad name since it is used to represent both
- * src and dst that touch multiple adjacent registers.
- */
- unsigned wrmask;
- /* for relative addressing, 32bits for array size is too small,
- * but otoh we don't need to deal with disjoint sets, so instead
- * use a simple size field (number of scalar components).
- */
- unsigned size;
- };
};
/*
IR3_INSTR_S2EN = 0x200,
IR3_INSTR_G = 0x400,
IR3_INSTR_SAT = 0x800,
+ /* (cat5/cat6) Bindless */
+ IR3_INSTR_B = 0x1000,
+ /* (cat5-only) Get some parts of the encoding from a1.x */
+ IR3_INSTR_A1EN = 0x2000,
/* meta-flags, for intermediate stages of IR, ie.
* before register assignment is done:
*/
- IR3_INSTR_MARK = 0x1000,
- IR3_INSTR_UNUSED= 0x2000,
+ IR3_INSTR_MARK = 0x4000,
+ IR3_INSTR_UNUSED= 0x8000,
} flags;
uint8_t repeat;
uint8_t nop;
} cat2;
struct {
unsigned samp, tex;
+ unsigned tex_base : 3;
type_t type;
} cat5;
struct {
int iim_val : 3; /* for ldgb/stgb, # of components */
unsigned d : 3;
bool typed : 1;
+ unsigned base : 3;
} cat6;
struct {
unsigned w : 1; /* write */
struct {
int off; /* component/offset */
} split;
+ struct {
+ /* for output collects, this maps back to the entry in the
+ * ir3_shader_variant::outputs table.
+ */
+ int outidx;
+ } collect;
struct {
unsigned samp, tex;
unsigned input_offset;
+ unsigned samp_base : 3;
+ unsigned tex_base : 3;
} prefetch;
struct {
+ /* maps back to entry in ir3_shader_variant::inputs table: */
+ int inidx;
/* for sysvals, identifies the sysval type. Mostly so we can
* identify the special cases where a sysval should not be DCE'd
* (currently, just pre-fs texture fetch)
*/
void *data;
+ /**
+ * Valid if pass calls ir3_find_ssa_uses().. see foreach_ssa_use()
+ */
+ struct set *uses;
+
int sun; /* Sethi–Ullman number, used by sched */
int use_count; /* currently just updated/used by cp */
#ifdef DEBUG
uint32_t serialno;
#endif
+
+ // TODO only computerator/assembler:
+ int line;
};
static inline struct ir3_instruction *
struct ir3_compiler *compiler;
gl_shader_stage type;
- unsigned ninputs, noutputs;
- struct ir3_instruction **inputs;
- struct ir3_instruction **outputs;
+ DECLARE_ARRAY(struct ir3_instruction *, inputs);
+ DECLARE_ARRAY(struct ir3_instruction *, outputs);
/* Track bary.f (and ldlv) instructions.. this is needed in
* scheduling to ensure that all varying fetches happen before
* convenient list of instructions that reference some address
* register simplifies this.
*/
- DECLARE_ARRAY(struct ir3_instruction *, indirects);
+ DECLARE_ARRAY(struct ir3_instruction *, a0_users);
+
+ /* same for a1.x: */
+ DECLARE_ARRAY(struct ir3_instruction *, a1_users);
/* and same for instructions that consume predicate register: */
DECLARE_ARRAY(struct ir3_instruction *, predicates);
unsigned base; /* base vreg name */
unsigned reg; /* base physical reg */
uint16_t start_ip, end_ip;
+
+ /* Indicates if half-precision */
+ bool half;
};
struct ir3_array * ir3_lookup_array(struct ir3 *ir, unsigned id);
#endif
}
-struct ir3 * ir3_create(struct ir3_compiler *compiler,
- gl_shader_stage type, unsigned nin, unsigned nout);
+struct ir3 * ir3_create(struct ir3_compiler *compiler, gl_shader_stage type);
void ir3_destroy(struct ir3 *shader);
void * ir3_assemble(struct ir3 *shader,
struct ir3_info *info, uint32_t gpu_id);
unsigned ir3_count_instructions(struct ir3 *ir);
-static inline int ir3_instr_regno(struct ir3_instruction *instr,
- struct ir3_register *reg)
-{
- unsigned i;
- for (i = 0; i < instr->regs_count; i++)
- if (reg == instr->regs[i])
- return i;
- return -1;
-}
+void ir3_find_ssa_uses(struct ir3 *ir, void *mem_ctx, bool falsedeps);
+#include "util/set.h"
+#define foreach_ssa_use(__use, __instr) \
+ for (struct ir3_instruction *__use = (void *)~0; \
+ __use && (__instr)->uses; __use = NULL) \
+ set_foreach ((__instr)->uses, __entry) \
+ if ((__use = (void *)__entry->key))
#define MAX_ARRAYS 16
return reg->num & 0x3;
}
+#define INVALID_REG regid(63, 0)
+#define VALIDREG(r) ((r) != INVALID_REG)
+#define CONDREG(r, val) COND(VALIDREG(r), (val))
+
static inline bool is_flow(struct ir3_instruction *instr)
{
return (opc_cat(instr->opc) == 0);
static inline bool is_kill(struct ir3_instruction *instr)
{
- return instr->opc == OPC_KILL || instr->opc == OPC_CONDEND;
+ return instr->opc == OPC_KILL;
}
static inline bool is_nop(struct ir3_instruction *instr)
dst = instr->regs[0];
- /* mov's that write to a0.x or p0.x are special: */
+ /* mov's that write to a0 or p0.x are special: */
if (dst->num == regid(REG_P0, 0))
return false;
- if (dst->num == regid(REG_A0, 0))
+ if (reg_num(dst) == REG_A0)
return false;
if (dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY))
return true;
}
+/* A move from const, which changes size but not type, can also be
+ * folded into dest instruction in some cases.
+ */
+static inline bool is_const_mov(struct ir3_instruction *instr)
+{
+	if (instr->opc != OPC_MOV)
+		return false;
+
+	if (!(instr->regs[1]->flags & IR3_REG_CONST))
+		return false;
+
+	type_t src_type = instr->cat1.src_type;
+	type_t dst_type = instr->cat1.dst_type;
+
+	/* src and dst must stay within the same type class (float/uint/sint);
+	 * only the size may differ:
+	 */
+	return (type_float(src_type) && type_float(dst_type)) ||
+		(type_uint(src_type) && type_uint(dst_type)) ||
+		(type_sint(src_type) && type_sint(dst_type));
+}
+
static inline bool is_alu(struct ir3_instruction *instr)
{
return (1 <= opc_cat(instr->opc)) && (opc_cat(instr->opc) <= 3);
return (opc_cat(instr->opc) == 5);
}
+/* cat5 texture instruction, or the meta pre-fs texture-prefetch: */
+static inline bool is_tex_or_prefetch(struct ir3_instruction *instr)
+{
+	return is_tex(instr) || (instr->opc == OPC_META_TEX_PREFETCH);
+}
+
static inline bool is_mem(struct ir3_instruction *instr)
{
return (opc_cat(instr->opc) == 6);
return (opc_cat(instr->opc) == 7);
}
+/* true if the instruction's dst (regs[0]) is a half-precision register: */
+static inline bool
+is_half(struct ir3_instruction *instr)
+{
+	return !!(instr->regs[0]->flags & IR3_REG_HALF);
+}
+
+/* true if the instruction's dst (regs[0]) is a "high" register: */
+static inline bool
+is_high(struct ir3_instruction *instr)
+{
+	return !!(instr->regs[0]->flags & IR3_REG_HIGH);
+}
+
static inline bool
is_store(struct ir3_instruction *instr)
{
static inline unsigned dest_regs(struct ir3_instruction *instr)
{
- if ((instr->regs_count == 0) || is_store(instr))
+ if ((instr->regs_count == 0) || is_store(instr) || is_flow(instr))
return 0;
return util_last_bit(instr->regs[0]->wrmask);
}
-static inline bool writes_addr(struct ir3_instruction *instr)
+/* Does the instruction write the address register a0.x? */
+static inline bool writes_addr0(struct ir3_instruction *instr)
{
	if (instr->regs_count > 0) {
		struct ir3_register *dst = instr->regs[0];
-		return reg_num(dst) == REG_A0;
+		return dst->num == regid(REG_A0, 0);
+	}
+	return false;
+}
+
+/* Does the instruction write a1.x (ie. regid(REG_A0, 1))? */
+static inline bool writes_addr1(struct ir3_instruction *instr)
+{
+	if (instr->regs_count > 0) {
+		struct ir3_register *dst = instr->regs[0];
+		return dst->num == regid(REG_A0, 1);
	}
	return false;
}
/* iterators for shader inputs: */
#define foreach_input_n(__ininstr, __cnt, __ir) \
- for (unsigned __cnt = 0; __cnt < (__ir)->ninputs; __cnt++) \
+ for (unsigned __cnt = 0; __cnt < (__ir)->inputs_count; __cnt++) \
if ((__ininstr = (__ir)->inputs[__cnt]))
#define foreach_input(__ininstr, __ir) \
foreach_input_n(__ininstr, __i, __ir)
/* iterators for shader outputs: */
#define foreach_output_n(__outinstr, __cnt, __ir) \
- for (unsigned __cnt = 0; __cnt < (__ir)->noutputs; __cnt++) \
+ for (unsigned __cnt = 0; __cnt < (__ir)->outputs_count; __cnt++) \
if ((__outinstr = (__ir)->outputs[__cnt]))
#define foreach_output(__outinstr, __ir) \
foreach_output_n(__outinstr, __i, __ir)
+/* iterators for instructions: */
+#define foreach_instr(__instr, __list) \
+ list_for_each_entry(struct ir3_instruction, __instr, __list, node)
+#define foreach_instr_rev(__instr, __list) \
+ list_for_each_entry_rev(struct ir3_instruction, __instr, __list, node)
+#define foreach_instr_safe(__instr, __list) \
+ list_for_each_entry_safe(struct ir3_instruction, __instr, __list, node)
+
+/* iterators for blocks: */
+#define foreach_block(__block, __list) \
+ list_for_each_entry(struct ir3_block, __block, __list, node)
+#define foreach_block_safe(__block, __list) \
+ list_for_each_entry_safe(struct ir3_block, __block, __list, node)
+
+/* iterators for arrays: */
+#define foreach_array(__array, __list) \
+ list_for_each_entry(struct ir3_array, __array, __list, node)
+
/* dump: */
void ir3_print(struct ir3 *ir);
void ir3_print_instr(struct ir3_instruction *instr);
+/* delay calculation: */
+int ir3_delayslots(struct ir3_instruction *assigner,
+ struct ir3_instruction *consumer, unsigned n, bool soft);
+unsigned ir3_delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
+ bool soft, bool pred);
+void ir3_remove_nops(struct ir3 *ir);
+
/* depth calculation: */
struct ir3_shader_variant;
-int ir3_delayslots(struct ir3_instruction *assigner,
- struct ir3_instruction *consumer, unsigned n);
-void ir3_insert_by_depth(struct ir3_instruction *instr, struct list_head *list);
void ir3_depth(struct ir3 *ir, struct ir3_shader_variant *so);
+/* fp16 conversion folding */
+void ir3_cf(struct ir3 *ir);
+
/* copy-propagate: */
void ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so);
void ir3_sched_add_deps(struct ir3 *ir);
int ir3_sched(struct ir3 *ir);
-void ir3_a6xx_fixup_atomic_dests(struct ir3 *ir, struct ir3_shader_variant *so);
+struct ir3_context;
+int ir3_postsched(struct ir3_context *ctx);
+
+bool ir3_a6xx_fixup_atomic_dests(struct ir3 *ir, struct ir3_shader_variant *so);
/* register assignment: */
struct ir3_ra_reg_set * ir3_ra_alloc_reg_set(struct ir3_compiler *compiler);
int ir3_ra(struct ir3_shader_variant *v, struct ir3_instruction **precolor, unsigned nprecolor);
/* legalize: */
-void ir3_legalize(struct ir3 *ir, bool *has_ssbo, bool *need_pixlod, int *max_bary);
+void ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary);
+
+/* Does the shader contain any instruction with latency worth hiding
+ * (texture fetch/prefetch, or non-local memory load)?
+ */
+static inline bool
+ir3_has_latency_to_hide(struct ir3 *ir)
+{
+	/* VS/GS/TCS/TESS co-exist with frag shader invocations, but we don't
+	 * know the nature of the fragment shader. Just assume it will have
+	 * latency to hide:
+	 */
+	if (ir->type != MESA_SHADER_FRAGMENT)
+		return true;
+
+	foreach_block (block, &ir->block_list) {
+		foreach_instr (instr, &block->instr_list) {
+			if (is_tex_or_prefetch(instr))
+				return true;
+
+			if (is_load(instr)) {
+				switch (instr->opc) {
+				case OPC_LDLV:
+				case OPC_LDL:
+				case OPC_LDLW:
+					/* local/varying loads are not counted as
+					 * latency worth hiding -- NOTE(review):
+					 * presumably because they are low-latency;
+					 * confirm.
+					 */
+					break;
+				default:
+					return true;
+				}
+			}
+		}
+	}
+
+	return false;
+}
/* ************************************************************************* */
/* instruction helpers */
mov->cat1.src_type = type;
mov->cat1.dst_type = type;
__ssa_dst(mov)->flags |= flags;
- ir3_reg_create(mov, 0, IR3_REG_IMMED)->uim_val = val;
+ ir3_reg_create(mov, 0, IR3_REG_IMMED | flags)->uim_val = val;
return mov;
}
#define INSTR4(name) __INSTR4(0, name, OPC_##name)
/* cat0 instructions: */
-INSTR0(BR)
+INSTR1(BR)
INSTR0(JUMP)
INSTR1(KILL)
INSTR0(END)
INSTR0(CHSH)
INSTR0(CHMASK)
-INSTR1(CONDEND)
-INSTR0(ENDPATCH)
+INSTR1(IF)
+INSTR0(ELSE)
+INSTR0(ENDIF)
/* cat2 instructions, most 2 src but some 1 src: */
INSTR2(ADD_F)
INSTR3(MAD_S24)
INSTR3(MAD_F16)
INSTR3(MAD_F32)
+/* NOTE: SEL_B32 checks for zero vs nonzero */
INSTR3(SEL_B16)
INSTR3(SEL_B32)
INSTR3(SEL_S16)
/* cat4 instructions: */
INSTR1(RCP)
INSTR1(RSQ)
+INSTR1(HRSQ)
INSTR1(LOG2)
+INSTR1(HLOG2)
INSTR1(EXP2)
+INSTR1(HEXP2)
INSTR1(SIN)
INSTR1(COS)
INSTR1(SQRT)
/* cat5 instructions: */
INSTR1(DSX)
+INSTR1(DSXPP_1)
INSTR1(DSY)
+INSTR1(DSYPP_1)
INSTR1F(3D, DSX)
INSTR1F(3D, DSY)
INSTR1(RGETPOS)
struct ir3_instruction *sam;
sam = ir3_instr_create(block, opc);
- sam->flags |= flags | IR3_INSTR_S2EN;
+ sam->flags |= flags;
__ssa_dst(sam)->wrmask = wrmask;
- __ssa_src(sam, samp_tex, IR3_REG_HALF);
+ if (flags & IR3_INSTR_S2EN) {
+ __ssa_src(sam, samp_tex, IR3_REG_HALF);
+ }
if (src0) {
__ssa_src(sam, src0, 0)->wrmask = (1 << (src0->regs_count - 1)) - 1;
}
INSTR2(ATOMIC_AND)
INSTR2(ATOMIC_OR)
INSTR2(ATOMIC_XOR)
+INSTR2(LDC)
#if GPU >= 600
INSTR3(STIB);
INSTR2(LDIB);
/* split this out or find some helper to use.. like main/bitset.h.. */
#include <string.h>
+#include "util/bitset.h"
#define MAX_REG 256
-typedef uint8_t regmask_t[2 * MAX_REG / 8];
+typedef BITSET_DECLARE(regmask_t, 2 * MAX_REG);
-static inline unsigned regmask_idx(struct ir3_register *reg)
+static inline bool
+__regmask_get(regmask_t *regmask, struct ir3_register *reg, unsigned n)
{
- unsigned num = (reg->flags & IR3_REG_RELATIV) ? reg->array.offset : reg->num;
- debug_assert(num < MAX_REG);
- if (reg->flags & IR3_REG_HALF) {
- if (reg->merged) {
- num /= 2;
+ if (reg->merged) {
+ /* a6xx+ case, with merged register file, we track things in terms
+		 * of half-precision registers, with a full precision register
+ * using two half-precision slots:
+ */
+ if (reg->flags & IR3_REG_HALF) {
+ return BITSET_TEST(*regmask, n);
} else {
- num += MAX_REG;
+ n *= 2;
+ return BITSET_TEST(*regmask, n) || BITSET_TEST(*regmask, n+1);
}
+ } else {
+ /* pre a6xx case, with separate register file for half and full
+ * precision:
+ */
+ if (reg->flags & IR3_REG_HALF)
+ n += MAX_REG;
+ return BITSET_TEST(*regmask, n);
+ }
+}
+
+static inline void
+__regmask_set(regmask_t *regmask, struct ir3_register *reg, unsigned n)
+{
+ if (reg->merged) {
+ /* a6xx+ case, with merged register file, we track things in terms
+		 * of half-precision registers, with a full precision register
+ * using two half-precision slots:
+ */
+ if (reg->flags & IR3_REG_HALF) {
+ BITSET_SET(*regmask, n);
+ } else {
+ n *= 2;
+ BITSET_SET(*regmask, n);
+ BITSET_SET(*regmask, n+1);
+ }
+ } else {
+ /* pre a6xx case, with separate register file for half and full
+ * precision:
+ */
+ if (reg->flags & IR3_REG_HALF)
+ n += MAX_REG;
+ BITSET_SET(*regmask, n);
}
- return num;
}
static inline void regmask_init(regmask_t *regmask)
static inline void regmask_set(regmask_t *regmask, struct ir3_register *reg)
{
- unsigned idx = regmask_idx(reg);
if (reg->flags & IR3_REG_RELATIV) {
- unsigned i;
- for (i = 0; i < reg->size; i++, idx++)
- (*regmask)[idx / 8] |= 1 << (idx % 8);
+ for (unsigned i = 0; i < reg->size; i++)
+ __regmask_set(regmask, reg, reg->array.offset + i);
} else {
- unsigned mask;
- for (mask = reg->wrmask; mask; mask >>= 1, idx++)
+ for (unsigned mask = reg->wrmask, n = reg->num; mask; mask >>= 1, n++)
if (mask & 1)
- (*regmask)[idx / 8] |= 1 << (idx % 8);
+ __regmask_set(regmask, reg, n);
}
}
(*dst)[i] = (*a)[i] | (*b)[i];
}
-/* set bits in a if not set in b, conceptually:
- * a |= (reg & ~b)
- */
-static inline void regmask_set_if_not(regmask_t *a,
- struct ir3_register *reg, regmask_t *b)
-{
- unsigned idx = regmask_idx(reg);
- if (reg->flags & IR3_REG_RELATIV) {
- unsigned i;
- for (i = 0; i < reg->size; i++, idx++)
- if (!((*b)[idx / 8] & (1 << (idx % 8))))
- (*a)[idx / 8] |= 1 << (idx % 8);
- } else {
- unsigned mask;
- for (mask = reg->wrmask; mask; mask >>= 1, idx++)
- if (mask & 1)
- if (!((*b)[idx / 8] & (1 << (idx % 8))))
- (*a)[idx / 8] |= 1 << (idx % 8);
- }
-}
-
static inline bool regmask_get(regmask_t *regmask,
struct ir3_register *reg)
{
- unsigned idx = regmask_idx(reg);
if (reg->flags & IR3_REG_RELATIV) {
- unsigned i;
- for (i = 0; i < reg->size; i++, idx++)
- if ((*regmask)[idx / 8] & (1 << (idx % 8)))
+ for (unsigned i = 0; i < reg->size; i++)
+ if (__regmask_get(regmask, reg, reg->array.offset + i))
return true;
} else {
- unsigned mask;
- for (mask = reg->wrmask; mask; mask >>= 1, idx++)
+ for (unsigned mask = reg->wrmask, n = reg->num; mask; mask >>= 1, n++)
if (mask & 1)
- if ((*regmask)[idx / 8] & (1 << (idx % 8)))
+ if (__regmask_get(regmask, reg, n))
return true;
}
return false;