From f30810cb68a53c4fef360778a230126ed0ee0ee3 Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Thu, 9 Sep 2010 19:12:54 +0200 Subject: [PATCH] nv50: use actual loads/stores if TEMPs are accessed indirectly --- src/gallium/drivers/nv50/nv50_pc.c | 2 + src/gallium/drivers/nv50/nv50_pc.h | 3 ++ src/gallium/drivers/nv50/nv50_pc_emit.c | 28 ++++++++--- src/gallium/drivers/nv50/nv50_pc_optimize.c | 19 +++++--- src/gallium/drivers/nv50/nv50_pc_print.c | 3 ++ src/gallium/drivers/nv50/nv50_program.c | 7 +++ src/gallium/drivers/nv50/nv50_program.h | 1 + src/gallium/drivers/nv50/nv50_screen.c | 25 ++++++++-- src/gallium/drivers/nv50/nv50_screen.h | 3 +- src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 54 +++++++++++++++++++-- 10 files changed, 122 insertions(+), 23 deletions(-) diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c index c54f16e4c53..637b3cf2fe3 100644 --- a/src/gallium/drivers/nv50/nv50_pc.c +++ b/src/gallium/drivers/nv50/nv50_pc.c @@ -414,6 +414,8 @@ nv50_generate_code(struct nv50_translation_info *ti) nv_print_program(pc); #endif + pc->opt_reload_elim = ti->store_to_memory ? 
FALSE : TRUE; + /* optimization */ ret = nv_pc_exec_pass0(pc); if (ret) diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h index d9cc775572e..ba32ab08ab2 100644 --- a/src/gallium/drivers/nv50/nv50_pc.h +++ b/src/gallium/drivers/nv50/nv50_pc.h @@ -345,6 +345,9 @@ struct nv_pc { struct nv_fixup *fixups; int num_fixups; + + /* optimization enables */ + boolean opt_reload_elim; }; void nvbb_insert_tail(struct nv_basic_block *, struct nv_instruction *); diff --git a/src/gallium/drivers/nv50/nv50_pc_emit.c b/src/gallium/drivers/nv50/nv50_pc_emit.c index bb0a6f32d1e..8c64b198756 100644 --- a/src/gallium/drivers/nv50/nv50_pc_emit.c +++ b/src/gallium/drivers/nv50/nv50_pc_emit.c @@ -412,25 +412,25 @@ emit_form_IMM(struct nv_pc *pc, struct nv_instruction *i, ubyte mod_mask) } static void -set_ld_st_size(struct nv_pc *pc, ubyte type) +set_ld_st_size(struct nv_pc *pc, int s, ubyte type) { switch (type) { case NV_TYPE_F64: - pc->emit[1] |= 0x8000; + pc->emit[1] |= 0x8000 << s; break; case NV_TYPE_F32: case NV_TYPE_S32: case NV_TYPE_U32: - pc->emit[1] |= 0xc000; + pc->emit[1] |= 0xc000 << s; break; case NV_TYPE_S16: - pc->emit[1] |= 0x6000; + pc->emit[1] |= 0x6000 << s; break; case NV_TYPE_U16: - pc->emit[1] |= 0x4000; + pc->emit[1] |= 0x4000 << s; break; case NV_TYPE_S8: - pc->emit[1] |= 0x2000; + pc->emit[1] |= 0x2000 << s; break; default: break; @@ -473,12 +473,14 @@ emit_ld(struct nv_pc *pc, struct nv_instruction *i) if (sf == NV_FILE_MEM_L) { pc->emit[0] = 0xd0000001; pc->emit[1] = 0x40000000; + + set_addr(pc, i); } else { NOUVEAU_ERR("invalid ld source file\n"); abort(); } - set_ld_st_size(pc, STYPE(i, 0)); + set_ld_st_size(pc, (sf == NV_FILE_MEM_L) ? 
8 : 0, STYPE(i, 0)); set_dst(pc, i->def[0]); set_pred_wr(pc, i); @@ -495,7 +497,19 @@ emit_ld(struct nv_pc *pc, struct nv_instruction *i) static void emit_st(struct nv_pc *pc, struct nv_instruction *i) { + assert(SFILE(i, 1) == NV_FILE_GPR); + assert(SFILE(i, 0) == NV_FILE_MEM_L); + + pc->emit[0] = 0xd0000001; + pc->emit[1] = 0x60000000; + SID(pc, i->src[1], 2); + SID(pc, i->src[0], 9); + + set_ld_st_size(pc, 8, STYPE(i, 1)); + + set_addr(pc, i); + set_pred(pc, i); } static int diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c index 4f5bdc1f9fb..09d232abda0 100644 --- a/src/gallium/drivers/nv50/nv50_pc_optimize.c +++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c @@ -82,6 +82,8 @@ inst_commutation_legal(struct nv_instruction *a, static INLINE boolean inst_cullable(struct nv_instruction *nvi) { + if (nvi->opcode == NV_OP_STA) + return FALSE; return (!(nvi->is_terminator || nvi->is_join || nvi->target || nvi->fixed || @@ -739,6 +741,7 @@ struct nv_pass_reld_elim { int alloc; }; +/* TODO: properly handle loads from l[] memory in the presence of stores */ static int nv_pass_reload_elim(struct nv_pass_reld_elim *ctx, struct nv_basic_block *b) { @@ -1074,13 +1077,15 @@ nv_pc_pass0(struct nv_pc *pc, struct nv_basic_block *root) if (ret) return ret; - reldelim = CALLOC_STRUCT(nv_pass_reld_elim); - reldelim->pc = pc; - pc->pass_seq++; - ret = nv_pass_reload_elim(reldelim, root); - FREE(reldelim); - if (ret) - return ret; + if (pc->opt_reload_elim) { + reldelim = CALLOC_STRUCT(nv_pass_reld_elim); + reldelim->pc = pc; + pc->pass_seq++; + ret = nv_pass_reload_elim(reldelim, root); + FREE(reldelim); + if (ret) + return ret; + } pc->pass_seq++; ret = nv_pass_cse(&pass, root); diff --git a/src/gallium/drivers/nv50/nv50_pc_print.c b/src/gallium/drivers/nv50/nv50_pc_print.c index 01a6f009979..74c3970f404 100644 --- a/src/gallium/drivers/nv50/nv50_pc_print.c +++ b/src/gallium/drivers/nv50/nv50_pc_print.c @@ -217,6 +217,9 @@ 
nv_print_value(struct nv_value *value, struct nv_value *ind, ubyte type) case NV_FILE_FLAGS: PRINT(" %s%cc%i", mgta, reg_pfx, nv_value_id(value)); break; + case NV_FILE_MEM_L: + nv_print_address('l', -1, ind, 4 * nv_value_id(value)); + break; case NV_FILE_MEM_S: nv_print_address('s', -1, ind, 4 * nv_value_id(value)); break; diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c index 925028700cd..24952f70f14 100644 --- a/src/gallium/drivers/nv50/nv50_program.c +++ b/src/gallium/drivers/nv50/nv50_program.c @@ -168,10 +168,17 @@ prog_inst(struct nv50_translation_info *ti, inst->Src[0].Register.File == TGSI_FILE_INPUT && dst->Index == ti->edgeflag_out) ti->p->vp.edgeflag = inst->Src[0].Register.Index; + } else + if (inst->Dst[0].Register.File == TGSI_FILE_TEMPORARY) { + if (inst->Dst[0].Register.Indirect) + ti->store_to_memory = TRUE; } for (s = 0; s < inst->Instruction.NumSrcRegs; ++s) { src = &inst->Src[s].Register; + if (src->File == TGSI_FILE_TEMPORARY) + if (inst->Src[s].Register.Indirect) + ti->store_to_memory = TRUE; if (src->File != TGSI_FILE_INPUT) continue; mask = nv50_tgsi_src_mask(inst, s); diff --git a/src/gallium/drivers/nv50/nv50_program.h b/src/gallium/drivers/nv50/nv50_program.h index 918baf325f5..a1b2bde97bf 100644 --- a/src/gallium/drivers/nv50/nv50_program.h +++ b/src/gallium/drivers/nv50/nv50_program.h @@ -116,6 +116,7 @@ struct nv50_translation_info { int output_access[PIPE_MAX_SHADER_OUTPUTS][4]; boolean indirect_inputs; boolean indirect_outputs; + boolean store_to_memory; struct tgsi_shader_info scan; uint32_t *immd32; unsigned immd32_nr; diff --git a/src/gallium/drivers/nv50/nv50_screen.c b/src/gallium/drivers/nv50/nv50_screen.c index c1efa443daf..24a6d8055c8 100644 --- a/src/gallium/drivers/nv50/nv50_screen.c +++ b/src/gallium/drivers/nv50/nv50_screen.c @@ -274,7 +274,7 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev) uint64_t value; unsigned chipset = dev->chipset; unsigned 
tesla_class = 0; - unsigned stack_size; + unsigned stack_size, local_size, max_warps; int ret, i; const unsigned rl = NOUVEAU_BO_VRAM | NOUVEAU_BO_RD; @@ -495,9 +495,10 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev) /* shader stack */ nouveau_device_get_param(dev, NOUVEAU_GETPARAM_GRAPH_UNITS, &value); - stack_size = util_bitcount(value & 0xffff); - stack_size *= util_bitcount((value >> 24) & 0xf); - stack_size *= 32 * 64 * 8; + max_warps = util_bitcount(value & 0xffff); + max_warps *= util_bitcount((value >> 24) & 0xf) * 32; + + stack_size = max_warps * 64 * 8; ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 16, stack_size, &screen->stack_bo); @@ -510,6 +511,22 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev) OUT_RELOCl(chan, screen->stack_bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); OUT_RING (chan, 4); + local_size = (NV50_CAP_MAX_PROGRAM_TEMPS * 16) * max_warps * 32; + + ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 16, + local_size, &screen->local_bo); + if (ret) { + nv50_screen_destroy(pscreen); + return NULL; + } + + local_size = NV50_CAP_MAX_PROGRAM_TEMPS * 16; + + BEGIN_RING(chan, screen->tesla, NV50TCL_LOCAL_ADDRESS_HIGH, 3); + OUT_RELOCh(chan, screen->local_bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + OUT_RELOCl(chan, screen->local_bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + OUT_RING (chan, util_unsigned_logbase2(local_size / 8)); + /* Vertex array limits - max them out */ for (i = 0; i < 16; i++) { BEGIN_RING(chan, screen->tesla, diff --git a/src/gallium/drivers/nv50/nv50_screen.h b/src/gallium/drivers/nv50/nv50_screen.h index 1517f5608f2..ad6bdeb27c8 100644 --- a/src/gallium/drivers/nv50/nv50_screen.h +++ b/src/gallium/drivers/nv50/nv50_screen.h @@ -25,7 +25,8 @@ struct nv50_screen { struct nouveau_bo *tic; struct nouveau_bo *tsc; - struct nouveau_bo *stack_bo; + struct nouveau_bo *stack_bo; /* control flow stack */ + struct nouveau_bo *local_bo; /* l[] memory */ boolean force_push; }; diff --git 
a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c index 983fcb2fbf3..f4fee4e0f23 100644 --- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c +++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c @@ -558,6 +558,38 @@ bld_insn_3(struct bld_context *bld, uint opcode, return bld_def(insn, 0, new_value(bld->pc, NV_FILE_GPR, src0->reg.type)); } +static void +bld_lmem_store(struct bld_context *bld, struct nv_value *ptr, int ofst, + struct nv_value *val) +{ + struct nv_instruction *insn = new_instruction(bld->pc, NV_OP_STA); + struct nv_value *loc; + + loc = new_value(bld->pc, NV_FILE_MEM_L, NV_TYPE_U32); + + loc->reg.id = ofst * 4; + + nv_reference(bld->pc, &insn->src[0], loc); + nv_reference(bld->pc, &insn->src[1], val); + nv_reference(bld->pc, &insn->src[4], ptr); +} + +static struct nv_value * +bld_lmem_load(struct bld_context *bld, struct nv_value *ptr, int ofst) +{ + struct nv_value *loc, *val; + + loc = new_value(bld->pc, NV_FILE_MEM_L, NV_TYPE_U32); + + loc->reg.id = ofst * 4; + + val = bld_insn_1(bld, NV_OP_LDA, loc); + + nv_reference(bld->pc, &val->insn->src[4], ptr); + + return val; +} + #define BLD_INSN_1_EX(d, op, dt, s0, s0t) \ do { \ (d) = bld_insn_1(bld, (NV_OP_##op), (s0)); \ @@ -854,10 +886,18 @@ infer_dst_type(unsigned opcode) static void emit_store(struct bld_context *bld, const struct tgsi_full_instruction *inst, - unsigned chan, struct nv_value *value) + unsigned chan, struct nv_value *value) { + struct nv_value *ptr; const struct tgsi_full_dst_register *reg = &inst->Dst[0]; + if (reg->Register.Indirect) { + ptr = FETCH_ADDR(reg->Indirect.Index, + tgsi_util_get_src_register_swizzle(&reg->Indirect, 0)); + } else { + ptr = NULL; + } + assert(chan < 4); if (inst->Instruction.Opcode != TGSI_OPCODE_MOV) @@ -893,7 +933,11 @@ emit_store(struct bld_context *bld, const struct tgsi_full_instruction *inst, value->reg.file = NV_FILE_GPR; if (value->insn->bb != bld->pc->current_block) value = bld_insn_1(bld, NV_OP_MOV, value); -
STORE_TEMP(reg->Register.Index, chan, value); + + if (bld->ti->store_to_memory) + bld_lmem_store(bld, ptr, reg->Register.Index * 4 + chan, value); + else + STORE_TEMP(reg->Register.Index, chan, value); break; case TGSI_FILE_ADDRESS: assert(reg->Register.Index < BLD_MAX_ADDRS); @@ -1064,8 +1108,10 @@ emit_fetch(struct bld_context *bld, const struct tgsi_full_instruction *insn, bld->saved_inputs[bld->ti->input_map[idx][swz]] = res; break; case TGSI_FILE_TEMPORARY: - /* this should be load from l[], with reload elimination later on */ - res = bld_fetch_global(bld, &bld->tvs[idx][swz]); + if (bld->ti->store_to_memory) + res = bld_lmem_load(bld, ptr, idx * 4 + swz); + else + res = bld_fetch_global(bld, &bld->tvs[idx][swz]); break; case TGSI_FILE_ADDRESS: res = bld_fetch_global(bld, &bld->avs[idx][swz]); -- 2.30.2