int const_reg; /**< GP register pointing to constants buffer */
int builtins_reg; /**< GP register pointint to built-in constants */
+ int offset_reg; /**< used to reduce redundant li instructions */
+ int offset_value;
+
int one_vec; /**< vector register with {1.0, 1.0, 1.0, 1.0} */
int bit31_vec; /**< vector register with {1<<31, 1<<31, 1<<31, 1<<31} */
};
+/**
+ * Initialize code generation context.
+ */
+static void
+init_gen_context(struct gen_context *gen, struct ppc_function *func)
+{
+ memset(gen, 0, sizeof(*gen));
+ gen->f = func;
+ gen->inputs_reg = ppc_reserve_register(func, 3); /* first function param */
+ gen->outputs_reg = ppc_reserve_register(func, 4); /* second function param */
+ gen->temps_reg = ppc_reserve_register(func, 5); /* ... */
+ gen->immed_reg = ppc_reserve_register(func, 6);
+ gen->const_reg = ppc_reserve_register(func, 7);
+ gen->builtins_reg = ppc_reserve_register(func, 8);
+ gen->one_vec = -1;
+ gen->bit31_vec = -1;
+ gen->offset_reg = -1;
+ gen->offset_value = -9999999;
+}
+
+
+/**
+ * All PPC vector load/store instructions form an effective address
+ * by adding the contents of two registers. For example:
+ * lvx v2,r8,r9 # v2 = memory[r8 + r9]
+ * stvx v2,r8,r9 # memory[r8 + r9] = v2;
+ * So our lvx/stvx instructions are typically preceded by an 'li' instruction
+ * to load r9 (above) with an immediate (an offset).
+ * This code emits that 'li' instruction, but only if the offset value is
+ * different than the previous 'li'.
+ * This optimization seems to save about 10% in the instruction count.
+ * Note that we need to unconditionally emit an 'li' inside basic blocks
+ * (such as inside loops).
+ */
+static int
+emit_li_offset(struct gen_context *gen, int offset)
+{
+ if (gen->offset_reg <= 0) {
+ /* allocate a GP register for storing load/store offset */
+ gen->offset_reg = ppc_allocate_register(gen->f);
+ }
+
+ /* emit new 'li' if offset is changing */
+ if (gen->offset_value < 0 || gen->offset_value != offset) {
+ gen->offset_value = offset;
+ ppc_li(gen->f, gen->offset_reg, offset);
+ }
+
+ return gen->offset_reg;
+}
+
+
+/**
+ * Forces subsequent emit_li_offset() calls to emit an 'li'.
+ * To be called at the top of basic blocks.
+ */
+static int
+reset_li_offset(struct gen_context *gen)
+{
+ gen->offset_value = -9999999;
+}
+
+
+
/**
* Load the given vector register with {value, value, value, value}.
* The value must be in the ppu_builtin_constants[] array.
uint pos;
for (pos = 0; pos < Elements(ppc_builtin_constants); pos++) {
if (ppc_builtin_constants[pos] == value) {
- int offset_reg = ppc_allocate_register(gen->f);
int offset = pos * 4;
+ int offset_reg = emit_li_offset(gen, offset);
- ppc_li(gen->f, offset_reg, offset);
/* Load 4-byte word into vector register.
* The vector slot depends on the effective address we load from.
* We know that our builtins start at a 16-byte boundary so we
ppc_lvewx(gen->f, dst_vec, gen->builtins_reg, offset_reg);
/* splat word[pos % 4] across the vector reg */
ppc_vspltw(gen->f, dst_vec, dst_vec, pos % 4);
- ppc_release_register(gen->f, offset_reg);
return;
}
}
switch (reg->SrcRegister.File) {
case TGSI_FILE_INPUT:
{
- int offset_reg = ppc_allocate_register(gen->f);
int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
- ppc_li(gen->f, offset_reg, offset);
+ int offset_reg = emit_li_offset(gen, offset);
ppc_lvx(gen->f, dst_vec, gen->inputs_reg, offset_reg);
- ppc_release_register(gen->f, offset_reg);
}
break;
case TGSI_FILE_TEMPORARY:
{
- int offset_reg = ppc_allocate_register(gen->f);
int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
- ppc_li(gen->f, offset_reg, offset);
+ int offset_reg = emit_li_offset(gen, offset);
ppc_lvx(gen->f, dst_vec, gen->temps_reg, offset_reg);
- ppc_release_register(gen->f, offset_reg);
}
break;
case TGSI_FILE_IMMEDIATE:
{
- int offset_reg = ppc_allocate_register(gen->f);
int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
- ppc_li(gen->f, offset_reg, offset);
+ int offset_reg = emit_li_offset(gen, offset);
ppc_lvx(gen->f, dst_vec, gen->immed_reg, offset_reg);
- ppc_release_register(gen->f, offset_reg);
}
break;
case TGSI_FILE_CONSTANT:
{
- int offset_reg = ppc_allocate_register(gen->f);
int offset = (reg->SrcRegister.Index * 4 + swizzle) * 4;
- ppc_li(gen->f, offset_reg, offset);
+ int offset_reg = emit_li_offset(gen, offset);
/* Load 4-byte word into vector register.
* The vector slot depends on the effective address we load from.
* We know that our constants start at a 16-byte boundary so we
ppc_lvewx(gen->f, dst_vec, gen->const_reg, offset_reg);
/* splat word[swizzle] across the vector reg */
ppc_vspltw(gen->f, dst_vec, dst_vec, swizzle);
- ppc_release_register(gen->f, offset_reg);
}
break;
default:
switch (reg->DstRegister.File) {
case TGSI_FILE_OUTPUT:
{
- int offset_reg = ppc_allocate_register(gen->f);
int offset = (reg->DstRegister.Index * 4 + chan_index) * 16;
- ppc_li(gen->f, offset_reg, offset);
+ int offset_reg = emit_li_offset(gen, offset);
ppc_stvx(gen->f, src_vec, gen->outputs_reg, offset_reg);
- ppc_release_register(gen->f, offset_reg);
}
break;
case TGSI_FILE_TEMPORARY:
{
- int offset_reg = ppc_allocate_register(gen->f);
int offset = (reg->DstRegister.Index * 4 + chan_index) * 16;
- ppc_li(gen->f, offset_reg, offset);
+ int offset_reg = emit_li_offset(gen, offset);
ppc_stvx(gen->f, src_vec, gen->temps_reg, offset_reg);
- ppc_release_register(gen->f, offset_reg);
}
break;
#if 0
util_init_math();
- memset(&gen, 0, sizeof(gen));
- gen.f = func;
- gen.inputs_reg = ppc_reserve_register(func, 3); /* first function param */
- gen.outputs_reg = ppc_reserve_register(func, 4); /* second function param */
- gen.temps_reg = ppc_reserve_register(func, 5); /* ... */
- gen.immed_reg = ppc_reserve_register(func, 6);
- gen.const_reg = ppc_reserve_register(func, 7);
- gen.builtins_reg = ppc_reserve_register(func, 8);
- gen.one_vec = -1;
- gen.bit31_vec = -1;
+ init_gen_context(&gen, func);
emit_prologue(func);