nv_print_program(pc);
#endif
+ pc->opt_reload_elim = ti->store_to_memory ? FALSE : TRUE;
+
/* optimization */
ret = nv_pc_exec_pass0(pc);
if (ret)
struct nv_fixup *fixups;
int num_fixups;
+
+ /* optimization enables */
+ boolean opt_reload_elim;
};
void nvbb_insert_tail(struct nv_basic_block *, struct nv_instruction *);
}
static void
-set_ld_st_size(struct nv_pc *pc, ubyte type)
+set_ld_st_size(struct nv_pc *pc, int s, ubyte type)
{
switch (type) {
case NV_TYPE_F64:
- pc->emit[1] |= 0x8000;
+ pc->emit[1] |= 0x8000 << s;
break;
case NV_TYPE_F32:
case NV_TYPE_S32:
case NV_TYPE_U32:
- pc->emit[1] |= 0xc000;
+ pc->emit[1] |= 0xc000 << s;
break;
case NV_TYPE_S16:
- pc->emit[1] |= 0x6000;
+ pc->emit[1] |= 0x6000 << s;
break;
case NV_TYPE_U16:
- pc->emit[1] |= 0x4000;
+ pc->emit[1] |= 0x4000 << s;
break;
case NV_TYPE_S8:
- pc->emit[1] |= 0x2000;
+ pc->emit[1] |= 0x2000 << s;
break;
default:
break;
if (sf == NV_FILE_MEM_L) {
pc->emit[0] = 0xd0000001;
pc->emit[1] = 0x40000000;
+
+ set_addr(pc, i);
} else {
NOUVEAU_ERR("invalid ld source file\n");
abort();
}
- set_ld_st_size(pc, STYPE(i, 0));
+ set_ld_st_size(pc, (sf == NV_FILE_MEM_L) ? 8 : 0, STYPE(i, 0));
set_dst(pc, i->def[0]);
set_pred_wr(pc, i);
static void
emit_st(struct nv_pc *pc, struct nv_instruction *i)
{ /* Encode a store instruction: source 1 (a GPR) is written to the l[] location given by source 0. */
+ assert(SFILE(i, 1) == NV_FILE_GPR); /* the value being stored must live in a GPR */
+ assert(SFILE(i, 0) == NV_FILE_MEM_L); /* only l[] memory destinations are handled here */
+
+ pc->emit[0] = 0xd0000001;
+ pc->emit[1] = 0x60000000; /* NOTE(review): presumably the store opcode bits; meaning of the constant is not shown here - confirm */
+ SID(pc, i->src[1], 2); /* presumably encodes the id of the stored value at bit 2 - confirm against SID */
+ SID(pc, i->src[0], 9); /* presumably encodes the id of the l[] location at bit 9 - confirm against SID */
+
+ set_ld_st_size(pc, 8, STYPE(i, 1)); /* size bits are chosen from the stored value's type and shifted left by 8 */
+
+ set_addr(pc, i);
+ set_pred(pc, i);
}
static int
static INLINE boolean
inst_cullable(struct nv_instruction *nvi)
{
+ if (nvi->opcode == NV_OP_STA)
+ return FALSE;
return (!(nvi->is_terminator || nvi->is_join ||
nvi->target ||
nvi->fixed ||
int alloc;
};
+/* TODO: properly handle loads from l[] memory in the presence of stores */
static int
nv_pass_reload_elim(struct nv_pass_reld_elim *ctx, struct nv_basic_block *b)
{
if (ret)
return ret;
- reldelim = CALLOC_STRUCT(nv_pass_reld_elim);
- reldelim->pc = pc;
- pc->pass_seq++;
- ret = nv_pass_reload_elim(reldelim, root);
- FREE(reldelim);
- if (ret)
- return ret;
+ if (pc->opt_reload_elim) {
+ reldelim = CALLOC_STRUCT(nv_pass_reld_elim);
+ reldelim->pc = pc;
+ pc->pass_seq++;
+ ret = nv_pass_reload_elim(reldelim, root);
+ FREE(reldelim);
+ if (ret)
+ return ret;
+ }
pc->pass_seq++;
ret = nv_pass_cse(&pass, root);
case NV_FILE_FLAGS:
PRINT(" %s%cc%i", mgta, reg_pfx, nv_value_id(value));
break;
+ case NV_FILE_MEM_L:
+ nv_print_address('l', -1, ind, 4 * nv_value_id(value));
+ break;
case NV_FILE_MEM_S:
nv_print_address('s', -1, ind, 4 * nv_value_id(value));
break;
inst->Src[0].Register.File == TGSI_FILE_INPUT &&
dst->Index == ti->edgeflag_out)
ti->p->vp.edgeflag = inst->Src[0].Register.Index;
+ } else
+ if (inst->Dst[0].Register.File == TGSI_FILE_TEMPORARY) {
+ if (inst->Dst[0].Register.Indirect)
+ ti->store_to_memory = TRUE;
}
for (s = 0; s < inst->Instruction.NumSrcRegs; ++s) {
src = &inst->Src[s].Register;
+ if (src->File == TGSI_FILE_TEMPORARY)
+ if (inst->Src[s].Register.Indirect)
+ ti->store_to_memory = TRUE;
if (src->File != TGSI_FILE_INPUT)
continue;
mask = nv50_tgsi_src_mask(inst, s);
int output_access[PIPE_MAX_SHADER_OUTPUTS][4];
boolean indirect_inputs;
boolean indirect_outputs;
+ boolean store_to_memory;
struct tgsi_shader_info scan;
uint32_t *immd32;
unsigned immd32_nr;
uint64_t value;
unsigned chipset = dev->chipset;
unsigned tesla_class = 0;
- unsigned stack_size;
+ unsigned stack_size, local_size, max_warps;
int ret, i;
const unsigned rl = NOUVEAU_BO_VRAM | NOUVEAU_BO_RD;
/* shader stack */
nouveau_device_get_param(dev, NOUVEAU_GETPARAM_GRAPH_UNITS, &value);
- stack_size = util_bitcount(value & 0xffff);
- stack_size *= util_bitcount((value >> 24) & 0xf);
- stack_size *= 32 * 64 * 8;
+ max_warps = util_bitcount(value & 0xffff);
+ max_warps *= util_bitcount((value >> 24) & 0xf) * 32;
+
+ stack_size = max_warps * 64 * 8;
ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 16,
stack_size, &screen->stack_bo);
OUT_RELOCl(chan, screen->stack_bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
OUT_RING (chan, 4);
+ local_size = (NV50_CAP_MAX_PROGRAM_TEMPS * 16) * max_warps * 32;
+
+ ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 16,
+ local_size, &screen->local_bo);
+ if (ret) {
+ nv50_screen_destroy(pscreen);
+ return NULL;
+ }
+
+ local_size = NV50_CAP_MAX_PROGRAM_TEMPS * 16;
+
+ BEGIN_RING(chan, screen->tesla, NV50TCL_LOCAL_ADDRESS_HIGH, 3);
+ OUT_RELOCh(chan, screen->local_bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+ OUT_RELOCl(chan, screen->local_bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+ OUT_RING (chan, util_unsigned_logbase2(local_size / 8));
+
/* Vertex array limits - max them out */
for (i = 0; i < 16; i++) {
BEGIN_RING(chan, screen->tesla,
struct nouveau_bo *tic;
struct nouveau_bo *tsc;
- struct nouveau_bo *stack_bo;
+ struct nouveau_bo *stack_bo; /* control flow stack */
+ struct nouveau_bo *local_bo; /* l[] memory */
boolean force_push;
};
return bld_def(insn, 0, new_value(bld->pc, NV_FILE_GPR, src0->reg.type));
}
+static void
+bld_lmem_store(struct bld_context *bld, struct nv_value *ptr, int ofst,
+ struct nv_value *val)
+{ /* Create an NV_OP_STA instruction storing val to a new l[] value at ofst, with ptr attached as source 4. */
+ struct nv_instruction *insn = new_instruction(bld->pc, NV_OP_STA);
+ struct nv_value *loc;
+
+ loc = new_value(bld->pc, NV_FILE_MEM_L, NV_TYPE_U32);
+
+ loc->reg.id = ofst * 4; /* NOTE(review): emit_store passes (register index * 4 + channel) as ofst; the unit of reg.id after this scaling is not evident here - confirm */
+
+ nv_reference(bld->pc, &insn->src[0], loc); /* source 0: the l[] destination */
+ nv_reference(bld->pc, &insn->src[1], val); /* source 1: the value to store */
+ nv_reference(bld->pc, &insn->src[4], ptr); /* source 4: ptr; emit_store passes NULL for non-indirect destinations - presumably nv_reference accepts NULL, confirm */
+}
+
+static struct nv_value *
+bld_lmem_load(struct bld_context *bld, struct nv_value *ptr, int ofst)
+{ /* Build an NV_OP_LDA from a new l[] value at ofst, attach ptr as source 4, and return the loaded value. */
+ struct nv_value *loc, *val;
+
+ loc = new_value(bld->pc, NV_FILE_MEM_L, NV_TYPE_U32);
+
+ loc->reg.id = ofst * 4; /* same scaling as bld_lmem_store; NOTE(review): unit of reg.id is not evident here - confirm */
+
+ val = bld_insn_1(bld, NV_OP_LDA, loc); /* val is the result of the new load instruction */
+
+ nv_reference(bld->pc, &val->insn->src[4], ptr); /* source 4 of the instruction reached through val->insn receives ptr */
+
+ return val;
+}
+
#define BLD_INSN_1_EX(d, op, dt, s0, s0t) \
do { \
(d) = bld_insn_1(bld, (NV_OP_##op), (s0)); \
static void
emit_store(struct bld_context *bld, const struct tgsi_full_instruction *inst,
- unsigned chan, struct nv_value *value)
+ unsigned chan, struct nv_value *value)
{
+ struct nv_value *ptr;
const struct tgsi_full_dst_register *reg = &inst->Dst[0];
+ if (reg->Register.Indirect) {
+ ptr = FETCH_ADDR(reg->Indirect.Index,
+ tgsi_util_get_src_register_swizzle(&reg->Indirect, 0));
+ } else {
+ ptr = NULL;
+ }
+
assert(chan < 4);
if (inst->Instruction.Opcode != TGSI_OPCODE_MOV)
value->reg.file = NV_FILE_GPR;
if (value->insn->bb != bld->pc->current_block)
value = bld_insn_1(bld, NV_OP_MOV, value);
- STORE_TEMP(reg->Register.Index, chan, value);
+
+ if (bld->ti->store_to_memory)
+ bld_lmem_store(bld, ptr, reg->Register.Index * 4 + chan, value);
+ else
+ STORE_TEMP(reg->Register.Index, chan, value);
break;
case TGSI_FILE_ADDRESS:
assert(reg->Register.Index < BLD_MAX_ADDRS);
bld->saved_inputs[bld->ti->input_map[idx][swz]] = res;
break;
case TGSI_FILE_TEMPORARY:
- /* this should be load from l[], with reload elimination later on */
- res = bld_fetch_global(bld, &bld->tvs[idx][swz]);
+ if (bld->ti->store_to_memory)
+ res = bld_lmem_load(bld, ptr, idx * 4 + swz);
+ else
+ res = bld_fetch_global(bld, &bld->tvs[idx][swz]);
break;
case TGSI_FILE_ADDRESS:
res = bld_fetch_global(bld, &bld->avs[idx][swz]);