X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fdrivers%2Fnvfx%2Fnvfx_fragprog.c;h=dbd7c77346512d4ae00afa0d1308c1a1ce3f59ab;hb=358795bc0bacdba9a36bb010ef18ee1b2d086f2d;hp=47df71f2325ab385d116cb82ead5832517e5018e;hpb=d21be6ee2cae2daeb83583a5d3798f5104c00d73;p=mesa.git diff --git a/src/gallium/drivers/nvfx/nvfx_fragprog.c b/src/gallium/drivers/nvfx/nvfx_fragprog.c index 47df71f2325..dbd7c773465 100644 --- a/src/gallium/drivers/nvfx/nvfx_fragprog.c +++ b/src/gallium/drivers/nvfx/nvfx_fragprog.c @@ -1,3 +1,4 @@ +#include #include "pipe/p_context.h" #include "pipe/p_defines.h" #include "pipe/p_state.h" @@ -14,30 +15,25 @@ #include "nvfx_shader.h" #include "nvfx_resource.h" -#define MAX_CONSTS 128 -#define MAX_IMM 32 - struct nvfx_fpc { struct nvfx_pipe_fragment_program* pfp; struct nvfx_fragment_program *fp; + unsigned max_temps; unsigned long long r_temps; unsigned long long r_temps_discard; struct nvfx_reg r_result[PIPE_MAX_SHADER_OUTPUTS]; struct nvfx_reg *r_temp; + unsigned sprite_coord_temp; int num_regs; unsigned inst_offset; unsigned have_const; - struct { - int pipe; - float vals[4]; - } consts[MAX_CONSTS]; - int nr_consts; + struct util_dynarray imm_data; - struct nvfx_reg imm[MAX_IMM]; + struct nvfx_reg* r_imm; unsigned nr_imm; unsigned char generic_to_slot[256]; /* semantic idx for each input semantic */ @@ -50,9 +46,9 @@ struct nvfx_fpc { static INLINE struct nvfx_reg temp(struct nvfx_fpc *fpc) { - int idx = ffsll(~fpc->r_temps) - 1; + int idx = __builtin_ctzll(~fpc->r_temps); - if (idx < 0) { + if (idx >= fpc->max_temps) { NOUVEAU_ERR("out of temps!!\n"); assert(0); return nvfx_reg(NVFXSR_TEMP, 0); @@ -70,19 +66,14 @@ release_temps(struct nvfx_fpc *fpc) fpc->r_temps_discard = 0ULL; } -static INLINE struct nvfx_reg -constant(struct nvfx_fpc *fpc, int pipe, float vals[4]) +static inline struct nvfx_reg +nvfx_fp_imm(struct nvfx_fpc *fpc, float a, float b, float c, float d) { - int idx; + float v[4] = {a, b, c, d}; + int idx = fpc->imm_data.size >> 4; - if (fpc->nr_consts == MAX_CONSTS) - assert(0); - idx = fpc->nr_consts++; - - fpc->consts[idx].pipe = pipe; - if (pipe == -1) - memcpy(fpc->consts[idx].vals, vals, 4 * sizeof(float)); - return nvfx_reg(NVFXSR_CONST, idx); + memcpy(util_dynarray_grow(&fpc->imm_data, sizeof(float) * 4), v, 4 * sizeof(float)); + return nvfx_reg(NVFXSR_IMM, idx); } static void @@ -114,30 +105,40 @@ emit_src(struct nvfx_fpc *fpc, int pos, struct nvfx_src src) sr |= (src.reg.index << NVFX_FP_REG_SRC_SHIFT); break; case NVFXSR_RELOCATED: - sr |= (NVFX_FP_REG_TYPE_INPUT << NVFX_FP_REG_TYPE_SHIFT); + sr |= (NVFX_FP_REG_TYPE_TEMP << NVFX_FP_REG_TYPE_SHIFT); + sr |= (fpc->sprite_coord_temp << NVFX_FP_REG_SRC_SHIFT); //printf("adding relocation at %x for %x\n", fpc->inst_offset, src.index); - util_dynarray_append(&fpc->fp->slot_relocations[src.reg.index], unsigned, fpc->inst_offset); + util_dynarray_append(&fpc->fp->slot_relocations[src.reg.index], unsigned, fpc->inst_offset + pos + 1); + break; + case NVFXSR_IMM: + if (!fpc->have_const) { + grow_insns(fpc, 4); + hw = &fp->insn[fpc->inst_offset]; + fpc->have_const = 1; + } + + memcpy(&fp->insn[fpc->inst_offset + 4], + (float*)fpc->imm_data.data + src.reg.index * 4, + sizeof(uint32_t) * 4); + + sr |= (NVFX_FP_REG_TYPE_CONST << NVFX_FP_REG_TYPE_SHIFT); break; case NVFXSR_CONST: if (!fpc->have_const) { grow_insns(fpc, 4); + hw = &fp->insn[fpc->inst_offset]; fpc->have_const = 1; } - hw = &fp->insn[fpc->inst_offset]; - if (fpc->consts[src.reg.index].pipe >= 0) { + { struct nvfx_fragment_program_data *fpd; fp->consts = realloc(fp->consts, ++fp->nr_consts * sizeof(*fpd)); fpd = &fp->consts[fp->nr_consts - 1]; fpd->offset = fpc->inst_offset + 4; - fpd->index = fpc->consts[src.reg.index].pipe; + fpd->index = src.reg.index; memset(&fp->insn[fpd->offset], 0, sizeof(uint32_t) * 4); - } else { - memcpy(&fp->insn[fpc->inst_offset + 4], - fpc->consts[src.reg.index].vals, - sizeof(uint32_t) * 4); } sr |= (NVFX_FP_REG_TYPE_CONST << NVFX_FP_REG_TYPE_SHIFT); @@ -204,7 +205,7 @@ nvfx_fp_emit(struct nvfx_fpc *fpc, struct nvfx_insn insn) memset(hw, 0, sizeof(uint32_t) * 4); if (insn.op == NVFX_FP_OP_OPCODE_KIL) - fp->fp_control |= NV34TCL_FP_CONTROL_USES_KIL; + fp->fp_control |= NV30_3D_FP_CONTROL_USES_KIL; hw[0] |= (insn.op << NVFX_FP_OP_OPCODE_SHIFT); hw[0] |= (insn.mask << NVFX_FP_OP_OUTMASK_SHIFT); hw[2] |= (insn.scale << NVFX_FP_OP_DST_SCALE_SHIFT); @@ -405,11 +406,11 @@ tgsi_src(struct nvfx_fpc *fpc, const struct tgsi_full_src_register *fsrc) } break; case TGSI_FILE_CONSTANT: - src.reg = constant(fpc, fsrc->Register.Index, NULL); + src.reg = nvfx_reg(NVFXSR_CONST, fsrc->Register.Index); break; case TGSI_FILE_IMMEDIATE: assert(fsrc->Register.Index < fpc->nr_imm); - src.reg = fpc->imm[fsrc->Register.Index]; + src.reg = fpc->r_imm[fsrc->Register.Index]; break; case TGSI_FILE_TEMPORARY: src.reg = fpc->r_temp[fsrc->Register.Index]; @@ -431,6 +432,9 @@ tgsi_src(struct nvfx_fpc *fpc, const struct tgsi_full_src_register *fsrc) src.swz[1] = fsrc->Register.SwizzleY; src.swz[2] = fsrc->Register.SwizzleZ; src.swz[3] = fsrc->Register.SwizzleW; + src.indirect = 0; + src.indirect_reg = 0; + src.indirect_swz = 0; return src; } @@ -467,7 +471,7 @@ nvfx_fragprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_fpc *fpc, { const struct nvfx_src none = nvfx_src(nvfx_reg(NVFXSR_NONE, 0)); struct nvfx_insn insn; - struct nvfx_src src[3], tmp, tmp2; + struct nvfx_src src[3], tmp; struct nvfx_reg dst; int mask, sat, unit = 0; int ai = -1, ci = -1, ii = -1; @@ -492,7 +496,21 @@ nvfx_fragprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_fpc *fpc, switch (fsrc->Register.File) { case TGSI_FILE_INPUT: - if (ai == -1 || ai == fsrc->Register.Index) { + if(fpc->pfp->info.input_semantic_name[fsrc->Register.Index] == TGSI_SEMANTIC_FOG && (0 + || fsrc->Register.SwizzleX == PIPE_SWIZZLE_ALPHA + || fsrc->Register.SwizzleY == PIPE_SWIZZLE_ALPHA + || fsrc->Register.SwizzleZ == PIPE_SWIZZLE_ALPHA + || fsrc->Register.SwizzleW == PIPE_SWIZZLE_ALPHA + )) { + /* hardware puts 0 in fogcoord.w, but GL/Gallium want 1 there */ + struct nvfx_src addend = nvfx_src(nvfx_fp_imm(fpc, 0, 0, 0, 1)); + addend.swz[0] = fsrc->Register.SwizzleX; + addend.swz[1] = fsrc->Register.SwizzleY; + addend.swz[2] = fsrc->Register.SwizzleZ; + addend.swz[3] = fsrc->Register.SwizzleW; + src[i] = nvfx_src(temp(fpc)); + nvfx_fp_emit(fpc, arith(0, ADD, src[i].reg, NVFX_FP_MASK_ALL, tgsi_src(fpc, fsrc), addend, none)); + } else if (ai == -1 || ai == fsrc->Register.Index) { ai = fsrc->Register.Index; src[i] = tgsi_src(fpc, fsrc); } else { @@ -626,7 +644,27 @@ nvfx_fragprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_fpc *fpc, case TGSI_OPCODE_LG2: nvfx_fp_emit(fpc, arith(sat, LG2, dst, mask, src[0], none, none)); break; -// case TGSI_OPCODE_LIT: + case TGSI_OPCODE_LIT: + if(!nvfx->is_nv4x) + nvfx_fp_emit(fpc, arith(sat, LIT_NV30, dst, mask, src[0], src[1], src[2])); + else { + /* we use FLT_MIN, so that log2 never gives -infinity, and thus multiplication by + * specular 0 always gives 0, so that ex2 gives 1, to satisfy the 0^0 = 1 requirement + * + * NOTE: if we start using half precision, we might need an fp16 FLT_MIN here instead + */ + struct nvfx_src maxs = nvfx_src(nvfx_fp_imm(fpc, 0, FLT_MIN, 0, 0)); + tmp = nvfx_src(temp(fpc)); + if (ci>= 0 || ii >= 0) { + nvfx_fp_emit(fpc, arith(0, MOV, tmp.reg, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, maxs, none, none)); + maxs = tmp; + } + nvfx_fp_emit(fpc, arith(0, MAX, tmp.reg, NVFX_FP_MASK_Y | NVFX_FP_MASK_W, swz(src[0], X, X, X, Y), swz(maxs, X, X, Y, Y), none)); + nvfx_fp_emit(fpc, arith(0, LG2, tmp.reg, NVFX_FP_MASK_W, swz(tmp, W, W, W, W), none, none)); + nvfx_fp_emit(fpc, arith(0, MUL, tmp.reg, NVFX_FP_MASK_W, swz(tmp, W, W, W, W), swz(src[0], W, W, W, W), none)); + nvfx_fp_emit(fpc, arith(sat, LITEX2_NV40, dst, mask, swz(tmp, Y, Y, W, W), none, none)); + } + break; case TGSI_OPCODE_LRP: if(!nvfx->is_nv4x) nvfx_fp_emit(fpc, arith(sat, LRP_NV30, dst, mask, src[0], src[1], src[2])); @@ -732,12 +770,24 @@ nvfx_fragprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_fpc *fpc, nvfx_fp_emit(fpc, arith(sat, SNE, dst, mask, src[0], src[1], none)); break; case TGSI_OPCODE_SSG: - tmp = nvfx_src(temp(fpc)); - tmp2 = nvfx_src(temp(fpc)); - nvfx_fp_emit(fpc, arith(0, SGT, tmp.reg, mask, src[0], nvfx_src(nvfx_reg(NVFXSR_CONST, 0)), none)); - nvfx_fp_emit(fpc, arith(0, SLT, tmp.reg, mask, src[0], nvfx_src(nvfx_reg(NVFXSR_CONST, 0)), none)); - nvfx_fp_emit(fpc, arith(sat, ADD, dst, mask, tmp, neg(tmp2), none)); + { + struct nvfx_src minones = swz(nvfx_src(nvfx_fp_imm(fpc, -1, -1, -1, -1)), X, X, X, X); + + insn = arith(sat, MOV, dst, mask, src[0], none, none); + insn.cc_update = 1; + nvfx_fp_emit(fpc, insn); + + insn = arith(0, STR, dst, mask, none, none, none); + insn.cc_test = NVFX_COND_GT; + nvfx_fp_emit(fpc, insn); + + if(!sat) { + insn = arith(0, MOV, dst, mask, minones, none, none); + insn.cc_test = NVFX_COND_LT; + nvfx_fp_emit(fpc, insn); + } break; + } case TGSI_OPCODE_STR: nvfx_fp_emit(fpc, arith(sat, STR, dst, mask, src[0], src[1], none)); break; @@ -781,7 +831,7 @@ nvfx_fragprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_fpc *fpc, case TGSI_OPCODE_IF: // MOVRC0 R31 (TR0.xyzw), R: // IF (NE.xxxx) ELSE END - if(!nvfx->is_nv4x) + if(!nvfx->use_nv4x) goto nv3x_cflow; nv40_fp_if(fpc, src[0]); break; @@ -789,7 +839,7 @@ nvfx_fragprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_fpc *fpc, case TGSI_OPCODE_ELSE: { uint32_t *hw; - if(!nvfx->is_nv4x) + if(!nvfx->use_nv4x) goto nv3x_cflow; assert(util_dynarray_contains(&fpc->if_stack, unsigned)); hw = &fpc->fp->insn[util_dynarray_top(&fpc->if_stack, unsigned)]; @@ -800,7 +850,7 @@ nvfx_fragprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_fpc *fpc, case TGSI_OPCODE_ENDIF: { uint32_t *hw; - if(!nvfx->is_nv4x) + if(!nvfx->use_nv4x) goto nv3x_cflow; assert(util_dynarray_contains(&fpc->if_stack, unsigned)); hw = &fpc->fp->insn[util_dynarray_pop(&fpc->if_stack, unsigned)]; @@ -823,19 +873,19 @@ nvfx_fragprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_fpc *fpc, break; case TGSI_OPCODE_CAL: - if(!nvfx->is_nv4x) + if(!nvfx->use_nv4x) goto nv3x_cflow; nv40_fp_cal(fpc, finst->Label.Label); break; case TGSI_OPCODE_RET: - if(!nvfx->is_nv4x) + if(!nvfx->use_nv4x) goto nv3x_cflow; nv40_fp_ret(fpc); break; case TGSI_OPCODE_BGNLOOP: - if(!nvfx->is_nv4x) + if(!nvfx->use_nv4x) goto nv3x_cflow; /* TODO: we should support using two nested REPs to allow a > 255 iteration count */ nv40_fp_rep(fpc, 255, finst->Label.Label); @@ -845,7 +895,7 @@ nvfx_fragprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_fpc *fpc, break; case TGSI_OPCODE_BRK: - if(!nvfx->is_nv4x) + if(!nvfx->use_nv4x) goto nv3x_cflow; nv40_fp_brk(fpc); break; @@ -900,7 +950,7 @@ nvfx_fragprog_parse_decl_output(struct nvfx_context* nvfx, struct nvfx_fpc *fpc, case 2: hw = 3; break; case 3: hw = 4; break; } - if(hw > ((nvfx->is_nv4x) ? 4 : 2)) { + if(hw > ((nvfx->use_nv4x) ? 4 : 2)) { NOUVEAU_ERR("bad rcol index\n"); return FALSE; } @@ -921,19 +971,17 @@ nvfx_fragprog_prepare(struct nvfx_context* nvfx, struct nvfx_fpc *fpc) struct tgsi_parse_context p; int high_temp = -1, i; struct util_semantic_set set; - float const0v[4] = {0, 0, 0, 0}; - struct nvfx_reg const0; + unsigned num_texcoords = nvfx->use_nv4x ? 10 : 8; fpc->fp->num_slots = util_semantic_set_from_program_file(&set, fpc->pfp->pipe.tokens, TGSI_FILE_INPUT); - if(fpc->fp->num_slots > 8) + if(fpc->fp->num_slots > num_texcoords) return FALSE; - util_semantic_layout_from_set(fpc->fp->slot_to_generic, &set, 0, 8); - util_semantic_table_from_layout(fpc->generic_to_slot, fpc->fp->slot_to_generic, 0, 8); + util_semantic_layout_from_set(fpc->fp->slot_to_generic, &set, 0, num_texcoords); + util_semantic_table_from_layout(fpc->generic_to_slot, fpc->fp->slot_to_generic, 0, num_texcoords); memset(fpc->fp->slot_to_fp_input, 0xff, sizeof(fpc->fp->slot_to_fp_input)); - const0 = constant(fpc, -1, const0v); - assert(const0.index == 0); + fpc->r_imm = CALLOC(fpc->pfp->info.immediate_count, sizeof(struct nvfx_reg)); tgsi_parse_init(&p, fpc->pfp->pipe.tokens); while (!tgsi_parse_end_of_tokens(&p)) { @@ -964,19 +1012,14 @@ nvfx_fragprog_prepare(struct nvfx_context* nvfx, struct nvfx_fpc *fpc) case TGSI_TOKEN_TYPE_IMMEDIATE: { struct tgsi_full_immediate *imm; - float vals[4]; imm = &p.FullToken.FullImmediate; assert(imm->Immediate.DataType == TGSI_IMM_FLOAT32); - assert(fpc->nr_imm < MAX_IMM); + assert(fpc->nr_imm < fpc->pfp->info.immediate_count); - vals[0] = imm->u[0].Float; - vals[1] = imm->u[1].Float; - vals[2] = imm->u[2].Float; - vals[3] = imm->u[3].Float; - fpc->imm[fpc->nr_imm++] = constant(fpc, -1, vals); - } + fpc->r_imm[fpc->nr_imm++] = nvfx_fp_imm(fpc, imm->u[0].Float, imm->u[1].Float, imm->u[2].Float, imm->u[3].Float); break; + } default: break; } @@ -993,8 +1036,10 @@ nvfx_fragprog_prepare(struct nvfx_context* nvfx, struct nvfx_fpc *fpc) return TRUE; out_err: - if (fpc->r_temp) + if (fpc->r_temp) { FREE(fpc->r_temp); + fpc->r_temp = NULL; + } tgsi_parse_free(&p); return FALSE; } @@ -1003,7 +1048,8 @@ DEBUG_GET_ONCE_BOOL_OPTION(nvfx_dump_fp, "NVFX_DUMP_FP", FALSE) static struct nvfx_fragment_program* nvfx_fragprog_translate(struct nvfx_context *nvfx, - struct nvfx_pipe_fragment_program *pfp) + struct nvfx_pipe_fragment_program *pfp, + boolean emulate_sprite_flipping) { struct tgsi_parse_context parse; struct nvfx_fpc *fpc = NULL; @@ -1019,16 +1065,38 @@ nvfx_fragprog_translate(struct nvfx_context *nvfx, if (!fpc) goto out_err; + fpc->max_temps = nvfx->use_nv4x ? 48 : 32; fpc->pfp = pfp; fpc->fp = fp; fpc->num_regs = 2; + for (unsigned i = 0; i < pfp->info.num_properties; ++i) { + if (pfp->info.properties[i].name == TGSI_PROPERTY_FS_COORD_ORIGIN) { + if(pfp->info.properties[i].data[0]) + fp->coord_conventions |= NV30_3D_COORD_CONVENTIONS_ORIGIN_INVERTED; + } else if (pfp->info.properties[i].name == TGSI_PROPERTY_FS_COORD_PIXEL_CENTER) { + if(pfp->info.properties[i].data[0]) + fp->coord_conventions |= NV30_3D_COORD_CONVENTIONS_CENTER_INTEGER; + } + } + if (!nvfx_fragprog_prepare(nvfx, fpc)) goto out_err; tgsi_parse_init(&parse, pfp->pipe.tokens); - util_dynarray_init(&insns); + + if(emulate_sprite_flipping) + { + struct nvfx_reg reg = temp(fpc); + struct nvfx_src sprite_input = nvfx_src(nvfx_reg(NVFXSR_RELOCATED, fp->num_slots)); + struct nvfx_src imm = nvfx_src(nvfx_fp_imm(fpc, 1, -1, 0, 0)); + + fpc->sprite_coord_temp = reg.index; + fpc->r_temps_discard = 0ULL; + nvfx_fp_emit(fpc, arith(0, MAD, reg, NVFX_FP_MASK_ALL, sprite_input, swz(imm, X, Y, X, X), swz(imm, Z, X, Z, Z))); + } + while (!tgsi_parse_end_of_tokens(&parse)) { tgsi_parse_token(&parse); @@ -1059,7 +1127,7 @@ nvfx_fragprog_translate(struct nvfx_context *nvfx, if(!nvfx->is_nv4x) fp->fp_control |= (fpc->num_regs-1)/2; else - fp->fp_control |= fpc->num_regs << NV40TCL_FP_CONTROL_TEMP_COUNT_SHIFT; + fp->fp_control |= fpc->num_regs << NV40_3D_FP_CONTROL_TEMP_COUNT__SHIFT; /* Terminate final instruction */ if(fp->insn) @@ -1100,6 +1168,7 @@ out: FREE(fpc->r_temp); util_dynarray_fini(&fpc->if_stack); util_dynarray_fini(&fpc->label_relocs); + util_dynarray_fini(&fpc->imm_data); //util_dynarray_fini(&fpc->loop_stack); FREE(fpc); } @@ -1120,12 +1189,12 @@ out_err: static inline void nvfx_fp_memcpy(void* dst, const void* src, size_t len) { -#ifndef WORDS_BIGENDIAN +#ifndef PIPE_ARCH_BIG_ENDIAN memcpy(dst, src, len); #else size_t i; for(i = 0; i < len; i += 4) { - uint32_t v = (uint32_t*)((char*)src + i); + uint32_t v = *(uint32_t*)((char*)src + i); *(uint32_t*)((char*)dst + i) = (v >> 16) | (v << 16); } #endif @@ -1164,16 +1233,21 @@ void nvfx_fragprog_validate(struct nvfx_context *nvfx) { struct nouveau_channel* chan = nvfx->screen->base.channel; + struct nouveau_grobj *eng3d = nvfx->screen->eng3d; struct nvfx_pipe_fragment_program *pfp = nvfx->fragprog; struct nvfx_vertex_program* vp; - unsigned sprite_coord_enable; - unsigned key = 0; + + // TODO: the multiplication by point_quad_rasterization is probably superfluous + unsigned sprite_coord_enable = nvfx->rasterizer->pipe.point_quad_rasterization * nvfx->rasterizer->pipe.sprite_coord_enable; + + boolean emulate_sprite_flipping = sprite_coord_enable && nvfx->rasterizer->pipe.sprite_coord_mode; + unsigned key = emulate_sprite_flipping; struct nvfx_fragment_program* fp; fp = pfp->fps[key]; if (!fp) { - fp = nvfx_fragprog_translate(nvfx, pfp); + fp = nvfx_fragprog_translate(nvfx, pfp, emulate_sprite_flipping); if(!fp) { @@ -1193,7 +1267,8 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx) } } - fp = nvfx_fragprog_translate(nvfx, nvfx->dummy_fs); + fp = nvfx_fragprog_translate(nvfx, nvfx->dummy_fs, FALSE); + emulate_sprite_flipping = FALSE; if(!fp) { @@ -1205,26 +1280,24 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx) pfp->fps[key] = fp; } - nvfx->hw_fragprog = fp; - - vp = nvfx->render_mode == HW ? nvfx->vertprog : nvfx->swtnl.vertprog; - sprite_coord_enable = nvfx->rasterizer->pipe.point_quad_rasterization * nvfx->rasterizer->pipe.sprite_coord_enable; + vp = nvfx->hw_vertprog; if (fp->last_vp_id != vp->id || fp->last_sprite_coord_enable != sprite_coord_enable) { - int sprite_input = -1; + int sprite_real_input = -1; + int sprite_reloc_input; unsigned i; fp->last_vp_id = vp->id; fp->last_sprite_coord_enable = sprite_coord_enable; if(sprite_coord_enable) { - sprite_input = vp->sprite_fp_input; - if(sprite_input < 0) + sprite_real_input = vp->sprite_fp_input; + if(sprite_real_input < 0) { unsigned used_texcoords = 0; for(unsigned i = 0; i < fp->num_slots; ++i) { unsigned generic = fp->slot_to_generic[i]; - if(!((1 << generic) & sprite_coord_enable)) + if((generic < 32) && !((1 << generic) & sprite_coord_enable)) { unsigned char slot_mask = vp->generic_to_fp_input[generic]; if(slot_mask >= 0xf0) @@ -1232,19 +1305,24 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx) } } - sprite_input = NVFX_FP_OP_INPUT_SRC_TC(__builtin_ctz(~used_texcoords)); + sprite_real_input = NVFX_FP_OP_INPUT_SRC_TC(__builtin_ctz(~used_texcoords)); } - fp->point_sprite_control |= (1 << (sprite_input - NVFX_FP_OP_INPUT_SRC_TC0 + 8)); + fp->point_sprite_control |= (1 << (sprite_real_input - NVFX_FP_OP_INPUT_SRC_TC0 + 8)); } else fp->point_sprite_control = 0; + if(emulate_sprite_flipping) + sprite_reloc_input = 0; + else + sprite_reloc_input = sprite_real_input; + for(i = 0; i < fp->num_slots; ++i) { unsigned generic = fp->slot_to_generic[i]; - if((1 << generic) & sprite_coord_enable) + if((generic < 32) && ((1 << generic) & sprite_coord_enable)) { - if(fp->slot_to_fp_input[i] != sprite_input) + if(fp->slot_to_fp_input[i] != sprite_reloc_input) goto update_slots; } else @@ -1255,6 +1333,12 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx) } } + if(emulate_sprite_flipping) + { + if(fp->slot_to_fp_input[fp->num_slots] != sprite_real_input) + goto update_slots; + } + if(0) { update_slots: @@ -1262,22 +1346,24 @@ update_slots: for(; i < fp->num_slots; ++i) { unsigned generic = fp->slot_to_generic[i]; - if((1 << generic) & sprite_coord_enable) - fp->slot_to_fp_input[i] = sprite_input; + if((generic < 32) && ((1 << generic) & sprite_coord_enable)) + fp->slot_to_fp_input[i] = sprite_reloc_input; else fp->slot_to_fp_input[i] = vp->generic_to_fp_input[generic] & 0xf; } + fp->slot_to_fp_input[fp->num_slots] = sprite_real_input; + if(nvfx->is_nv4x) { fp->or = 0; - for(i = 0; i < fp->num_slots; ++i) { + for(i = 0; i <= fp->num_slots; ++i) { unsigned fp_input = fp->slot_to_fp_input[i]; if(fp_input == NVFX_FP_OP_INPUT_SRC_TC(8)) fp->or |= (1 << 12); else if(fp_input == NVFX_FP_OP_INPUT_SRC_TC(9)) fp->or |= (1 << 13); - else if(fp_input != 0xf) + else if(fp_input >= NVFX_FP_OP_INPUT_SRC_TC(0) && fp_input <= NVFX_FP_OP_INPUT_SRC_TC(7)) fp->or |= (1 << (fp_input - NVFX_FP_OP_INPUT_SRC_TC0 + 14)); } } @@ -1292,7 +1378,7 @@ update_slots: * one bound to this fragment program. * Doing such a check would likely be a pessimization. */ - if (nvfx->dirty & (NVFX_NEW_FRAGCONST | NVFX_NEW_FRAGPROG)) { + if ((nvfx->hw_fragprog != fp) || (nvfx->dirty & (NVFX_NEW_FRAGPROG | NVFX_NEW_FRAGCONST))) { int offset; uint32_t* fpmap; @@ -1365,16 +1451,45 @@ update: */ if(fp->progs_left_with_obsolete_slot_assignments) { unsigned char* fpbo_slots = &fp->fpbo->slots[fp->bo_prog_idx * 8]; - for(unsigned i = 0; i < fp->num_slots; ++i) { + /* also relocate sprite coord slot, if any */ + for(unsigned i = 0; i <= fp->num_slots; ++i) { unsigned value = fp->slot_to_fp_input[i];; if(value != fpbo_slots[i]) { - unsigned* p = (unsigned*)fp->slot_relocations[i].data; - unsigned* pend = (unsigned*)((char*)fp->slot_relocations[i].data + fp->slot_relocations[i].size); - for(; p != pend; ++p) { - unsigned off = *p; - unsigned dw = fp->insn[off]; - dw = (dw & ~NVFX_FP_OP_INPUT_SRC_MASK) | (value << NVFX_FP_OP_INPUT_SRC_SHIFT); - nvfx_fp_memcpy(&fpmap[*p], &dw, sizeof(dw)); + unsigned* p; + unsigned* begin = (unsigned*)fp->slot_relocations[i].data; + unsigned* end = (unsigned*)((char*)fp->slot_relocations[i].data + fp->slot_relocations[i].size); + //printf("fp %p reloc slot %u/%u: %u -> %u\n", fp, i, fp->num_slots, fpbo_slots[i], value); + if(value == 0) + { + /* was relocated to an input, switch type to temporary */ + for(p = begin; p != end; ++p) { + unsigned off = *p; + unsigned dw = fp->insn[off]; + dw &=~ NVFX_FP_REG_TYPE_MASK; + //printf("reloc_tmp at %x\n", off); + nvfx_fp_memcpy(&fpmap[off], &dw, sizeof(dw)); + } + } else { + if(!fpbo_slots[i]) + { + /* was relocated to a temporary, switch type to input */ + for(p= begin; p != end; ++p) { + unsigned off = *p; + unsigned dw = fp->insn[off]; + //printf("reloc_in at %x\n", off); + dw |= NVFX_FP_REG_TYPE_INPUT << NVFX_FP_REG_TYPE_SHIFT; + nvfx_fp_memcpy(&fpmap[off], &dw, sizeof(dw)); + } + } + + /* set the correct input index */ + for(p = begin; p != end; ++p) { + unsigned off = *p & ~3; + unsigned dw = fp->insn[off]; + //printf("reloc&~3 at %x\n", off); + dw = (dw & ~NVFX_FP_OP_INPUT_SRC_MASK) | (value << NVFX_FP_OP_INPUT_SRC_SHIFT); + nvfx_fp_memcpy(&fpmap[off], &dw, sizeof(dw)); + } } fpbo_slots[i] = value; } @@ -1382,18 +1497,20 @@ update: --fp->progs_left_with_obsolete_slot_assignments; } + nvfx->hw_fragprog = fp; + MARK_RING(chan, 8, 1); - OUT_RING(chan, RING_3D(NV34TCL_FP_ACTIVE_PROGRAM, 1)); + BEGIN_RING(chan, eng3d, NV30_3D_FP_ACTIVE_PROGRAM, 1); OUT_RELOC(chan, fp->fpbo->bo, offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD | NOUVEAU_BO_LOW | - NOUVEAU_BO_OR, NV34TCL_FP_ACTIVE_PROGRAM_DMA0, - NV34TCL_FP_ACTIVE_PROGRAM_DMA1); - OUT_RING(chan, RING_3D(NV34TCL_FP_CONTROL, 1)); + NOUVEAU_BO_OR, NV30_3D_FP_ACTIVE_PROGRAM_DMA0, + NV30_3D_FP_ACTIVE_PROGRAM_DMA1); + BEGIN_RING(chan, eng3d, NV30_3D_FP_CONTROL, 1); OUT_RING(chan, fp->fp_control); if(!nvfx->is_nv4x) { - OUT_RING(chan, RING_3D(NV34TCL_FP_REG_CONTROL, 1)); + BEGIN_RING(chan, eng3d, NV30_3D_FP_REG_CONTROL, 1); OUT_RING(chan, (1<<16)|0x4); - OUT_RING(chan, RING_3D(NV34TCL_TX_UNITS_ENABLE, 1)); + BEGIN_RING(chan, eng3d, NV30_3D_TEX_UNITS_ENABLE, 1); OUT_RING(chan, fp->samplers); } } @@ -1402,25 +1519,13 @@ update: unsigned pointsprite_control = fp->point_sprite_control | nvfx->rasterizer->pipe.point_quad_rasterization; if(pointsprite_control != nvfx->hw_pointsprite_control) { - WAIT_RING(chan, 2); - OUT_RING(chan, RING_3D(NV34TCL_POINT_SPRITE, 1)); + BEGIN_RING(chan, eng3d, NV30_3D_POINT_SPRITE, 1); OUT_RING(chan, pointsprite_control); nvfx->hw_pointsprite_control = pointsprite_control; } } - if(nvfx->is_nv4x) - { - unsigned vp_output = vp->or | fp->or; - - if(vp_output != nvfx->hw_vp_output) - { - WAIT_RING(chan, 2); - OUT_RING(chan, RING_3D(NV40TCL_VP_RESULT_EN, 1)); - OUT_RING(chan, vp_output); - nvfx->hw_vp_output = vp_output; - } - } + nvfx->relocs_needed &=~ NVFX_RELOCATE_FRAGPROG; } void @@ -1433,10 +1538,11 @@ nvfx_fragprog_relocate(struct nvfx_context *nvfx) unsigned fp_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_RD; // TODO: GART? fp_flags |= NOUVEAU_BO_DUMMY; MARK_RING(chan, 2, 2); - OUT_RELOC(chan, bo, RING_3D(NV34TCL_FP_ACTIVE_PROGRAM, 1), fp_flags, 0, 0); + OUT_RELOC(chan, bo, RING_3D(NV30_3D_FP_ACTIVE_PROGRAM, 1), fp_flags, 0, 0); OUT_RELOC(chan, bo, offset, fp_flags | NOUVEAU_BO_LOW | - NOUVEAU_BO_OR, NV34TCL_FP_ACTIVE_PROGRAM_DMA0, - NV34TCL_FP_ACTIVE_PROGRAM_DMA1); + NOUVEAU_BO_OR, NV30_3D_FP_ACTIVE_PROGRAM_DMA0, + NV30_3D_FP_ACTIVE_PROGRAM_DMA1); + nvfx->relocs_needed &=~ NVFX_RELOCATE_FRAGPROG; } void @@ -1452,13 +1558,13 @@ nvfx_fragprog_destroy(struct nvfx_context *nvfx, struct nvfx_fragment_program_bo* next = fpbo->next; nouveau_bo_unmap(fpbo->bo); nouveau_bo_ref(0, &fpbo->bo); - free(fpbo); + os_free_aligned(fpbo); fpbo = next; } while(fpbo != fp->fpbo); } - for(i = 0; i < 8; ++i) + for(i = 0; i < Elements(fp->slot_relocations); ++i) util_dynarray_fini(&fp->slot_relocations[i]); if (fp->insn_len) @@ -1491,15 +1597,18 @@ nvfx_fp_state_bind(struct pipe_context *pipe, void *hwcso) static void nvfx_fp_state_delete(struct pipe_context *pipe, void *hwcso) { - struct nvfx_context *nvfx = nvfx_context(pipe); - struct nvfx_pipe_fragment_program *pfp = hwcso; - unsigned i; - - for(i = 0; i < Elements(pfp->fps); ++i) - { - nvfx_fragprog_destroy(nvfx, pfp->fps[i]); - FREE(pfp->fps[i]); - } + struct nvfx_context *nvfx = nvfx_context(pipe); + struct nvfx_pipe_fragment_program *pfp = hwcso; + unsigned i; + + for(i = 0; i < Elements(pfp->fps); ++i) + { + if(pfp->fps[i]) + { + nvfx_fragprog_destroy(nvfx, pfp->fps[i]); + FREE(pfp->fps[i]); + } + } FREE((void*)pfp->pipe.tokens); FREE(pfp);