From: Roland Scheidegger Date: Fri, 2 Jun 2006 22:47:31 +0000 (+0000) Subject: implement arb_vertex_program in hw for r200. Code contains still some hacks, generic... X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=98c791b543c4ba86b8bb54488bd872b33b10b1aa;p=mesa.git implement arb_vertex_program in hw for r200. Code contains still some hacks, generic attribs cause a fallback, but otherwise it seems to work quite well. Passes all glean vertProg1 tests with the exception of the degnerated LIT case (which is a hw limitation), as well as runs the r200 render path of doom3/quake4 (1.1 patch needed for quake4). The code is heavily borrowed from the r300 driver as vertex programs encoding is almost identical. arb_vertex_program is not yet announced by default and still needs to be enabled via driconf. --- diff --git a/src/mesa/drivers/dri/r200/Makefile b/src/mesa/drivers/dri/r200/Makefile index 2084d52132e..75c09ff867e 100644 --- a/src/mesa/drivers/dri/r200/Makefile +++ b/src/mesa/drivers/dri/r200/Makefile @@ -31,6 +31,7 @@ DRIVER_SOURCES = r200_context.c \ r200_vtxfmt_sse.c \ r200_vtxfmt_x86.c \ r200_fragshader.c \ + r200_vertprog.c \ radeon_screen.c \ $(EGL_SOURCES) diff --git a/src/mesa/drivers/dri/r200/r200_cmdbuf.c b/src/mesa/drivers/dri/r200/r200_cmdbuf.c index 6ce68708940..91737d2d33c 100644 --- a/src/mesa/drivers/dri/r200/r200_cmdbuf.c +++ b/src/mesa/drivers/dri/r200/r200_cmdbuf.c @@ -110,6 +110,11 @@ void r200SetUpAtomList( r200ContextPtr rmesa ) /* FIXME: is this a good place to insert that atom ? */ insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.spr ); insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.prf ); + insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.pvs ); + insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.vpp[0] ); + insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.vpp[1] ); + insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.vpi[0] ); + insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.vpi[1] ); } static void r200SaveHwState( r200ContextPtr rmesa ) diff --git a/src/mesa/drivers/dri/r200/r200_context.c b/src/mesa/drivers/dri/r200/r200_context.c index 24f0ea55102..3e41d863148 100644 --- a/src/mesa/drivers/dri/r200/r200_context.c +++ b/src/mesa/drivers/dri/r200/r200_context.c @@ -62,6 +62,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #include "r200_tcl.h" #include "r200_vtxfmt.h" #include "r200_maos.h" +#include "r200_vertprog.h" #define need_GL_ARB_multisample #define need_GL_ARB_texture_compression @@ -76,7 +77,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define need_GL_NV_vertex_program #include "extension_helper.h" -#define DRIVER_DATE "20060327" +#define DRIVER_DATE "20060602" #include "vblank.h" #include "utils.h" @@ -310,6 +311,7 @@ GLboolean r200CreateContext( const __GLcontextModes *glVisual, r200InitIoctlFuncs(&functions); r200InitStateFuncs(&functions); r200InitTextureFuncs(&functions); + r200InitShaderFuncs(&functions); /* Allocate and initialize the Mesa context */ if (sharedContextPrivate) @@ -417,6 +419,12 @@ GLboolean r200CreateContext( const __GLcontextModes *glVisual, ctx->Const.MaxLineWidthAA = 10.0; ctx->Const.LineWidthGranularity = 0.0625; + ctx->Const.VertexProgram.MaxNativeInstructions = R200_VSF_MAX_INST; + ctx->Const.VertexProgram.MaxNativeAttribs = 12; + ctx->Const.VertexProgram.MaxNativeTemps = R200_VSF_MAX_TEMPS; + ctx->Const.VertexProgram.MaxNativeParameters = R200_VSF_MAX_PARAM; + ctx->Const.VertexProgram.MaxNativeAddressRegs = 1; + /* Initialize the software rasterizer and helper modules. */ _swrast_CreateContext( ctx ); diff --git a/src/mesa/drivers/dri/r200/r200_context.h b/src/mesa/drivers/dri/r200/r200_context.h index 0de131adc1d..2ff9b9f32e6 100644 --- a/src/mesa/drivers/dri/r200/r200_context.h +++ b/src/mesa/drivers/dri/r200/r200_context.h @@ -46,6 +46,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #include "mtypes.h" #include "colormac.h" #include "r200_reg.h" +#include "r200_vertprog.h" #define ENABLE_HW_3D_TEXTURE 1 /* XXX this is temporary! */ @@ -94,6 +95,15 @@ typedef void (*r200_point_func)( r200ContextPtr, r200Vertex * ); +struct r200_vertex_program { + struct vertex_program mesa_program; /* Must be first */ + int translated; + VERTEX_SHADER_INSTRUCTION instr[R200_VSF_MAX_INST + 2]; + int pos_end; + int inputs[VERT_ATTRIB_MAX]; + int native; +}; + struct r200_colorbuffer_state { GLuint clear; #if 000 @@ -336,6 +346,34 @@ struct r200_state_atom { #define AFS_IA1 4 /* 2f0c */ #define AFS_STATE_SIZE 33 +#define PVS_CMD_0 0 +#define PVS_CNTL_1 1 +#define PVS_CNTL_2 2 +#define PVS_STATE_SIZE 3 + +/* those are quite big... */ +#define VPI_CMD_0 0 +#define VPI_OPDST_0 1 +#define VPI_SRC0_0 2 +#define VPI_SRC1_0 3 +#define VPI_SRC2_0 4 +#define VPI_OPDST_63 253 +#define VPI_SRC0_63 254 +#define VPI_SRC1_63 255 +#define VPI_SRC2_63 256 +#define VPI_STATE_SIZE 257 + +#define VPP_CMD_0 0 +#define VPP_PARAM0_0 1 +#define VPP_PARAM1_0 2 +#define VPP_PARAM2_0 3 +#define VPP_PARAM3_0 4 +#define VPP_PARAM0_95 381 +#define VPP_PARAM1_95 382 +#define VPP_PARAM2_95 383 +#define VPP_PARAM3_95 384 +#define VPP_STATE_SIZE 385 + #define TCL_CMD_0 0 #define TCL_LIGHT_MODEL_CTL_0 1 #define TCL_LIGHT_MODEL_CTL_1 2 @@ -567,6 +605,9 @@ struct r200_hw_state { struct r200_state_atom glt; struct r200_state_atom prf; struct r200_state_atom afs[2]; + struct r200_state_atom pvs; + struct r200_state_atom vpi[2]; + struct r200_state_atom vpp[2]; struct r200_state_atom atf; struct r200_state_atom spr; @@ -883,6 +924,7 @@ struct r200_context { */ struct r200_hw_state hw; struct r200_state state; + struct r200_vertex_program *curr_vp_hw; /* Texture object bookkeeping */ diff --git a/src/mesa/drivers/dri/r200/r200_state.c b/src/mesa/drivers/dri/r200/r200_state.c index 41c7607b61d..d266e78910e 100644 --- a/src/mesa/drivers/dri/r200/r200_state.c +++ b/src/mesa/drivers/dri/r200/r200_state.c @@ -54,6 +54,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #include "r200_tex.h" #include "r200_swtcl.h" #include "r200_vtxfmt.h" +#include "r200_vertprog.h" #include "drirenderbuffer.h" @@ -2100,7 +2101,71 @@ static void r200Enable( GLcontext *ctx, GLenum cap, GLboolean state ) break; case GL_VERTEX_PROGRAM_ARB: - TCL_FALLBACK(rmesa->glCtx, R200_TCL_FALLBACK_VERTEX_PROGRAM, state); + if (!state) { + GLuint i; + R200_STATECHANGE( rmesa, vap ); + rmesa->hw.vap.cmd[VAP_SE_VAP_CNTL] &= ~R200_VAP_PROG_VTX_SHADER_ENABLE; + /* mark all tcl atoms (tcl vector state got overwritten) dirty + not sure about tcl scalar state - we need at least grd + with vert progs too. + ucp looks like it doesn't get overwritten (may even work + with vp for pos-invariant progs if we're lucky) */ + R200_STATECHANGE( rmesa, mtl[0] ); + R200_STATECHANGE( rmesa, mtl[1] ); + R200_STATECHANGE( rmesa, fog ); + R200_STATECHANGE( rmesa, glt ); + R200_STATECHANGE( rmesa, eye ); + for (i = R200_MTX_MV; i <= R200_MTX_TEX5; i++) { + R200_STATECHANGE( rmesa, mat[i] ); + } + for (i = 0 ; i < 8; i++) { + R200_STATECHANGE( rmesa, lit[i] ); + } + R200_STATECHANGE( rmesa, tcl ); + for (i = 0; i <= ctx->Const.MaxClipPlanes; i++) { + if (ctx->Transform.ClipPlanesEnabled & (1 << i)) { + rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] |= (R200_UCP_ENABLE_0 << i); + } +/* else { + rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] &= ~(R200_UCP_ENABLE_0 << i); + }*/ + } + /* FIXME: ugly as hell. need to call everything which might change tcl_output_vtxfmt0/1 and compsel */ + r200UpdateSpecular( ctx ); + r200Fogfv( ctx, GL_FOG_COORD_SRC, NULL ); + /* shouldn't be necessary, as it's picked up anyway in r200ValidateState (_NEW_PROGRAM), + but without it doom3 locks up at always the same places. Why? */ + r200UpdateTextureState( ctx ); + /* if we call r200UpdateTextureState we need the code below because we are calling it with + non-current derived enabled values which may revert the state atoms for frag progs even when + they already got disabled... ugh + Should really figure out why we need to call r200UpdateTextureState in the first place */ + GLuint unit; + for (unit = 0; unit < R200_MAX_TEXTURE_UNITS; unit++) { + R200_STATECHANGE( rmesa, pix[unit] ); + R200_STATECHANGE( rmesa, tex[unit] ); + rmesa->hw.tex[unit].cmd[TEX_PP_TXFORMAT] &= + ~(R200_TXFORMAT_ST_ROUTE_MASK | R200_TXFORMAT_LOOKUP_DISABLE); + rmesa->hw.tex[unit].cmd[TEX_PP_TXFORMAT] |= unit << R200_TXFORMAT_ST_ROUTE_SHIFT; + /* need to guard this with drmSupportsFragmentShader? Should never get here if + we don't announce ATI_fs, right? */ + rmesa->hw.tex[unit].cmd[TEX_PP_TXMULTI_CTL] = 0; + } + R200_STATECHANGE( rmesa, cst ); + R200_STATECHANGE( rmesa, tf ); + rmesa->hw.cst.cmd[CST_PP_CNTL_X] = 0; + } + else { + R200_STATECHANGE( rmesa, vap ); + if (!rmesa->TclFallback) { + /* FIXME: fglrx sets R200_VAP_SINGLE_BUF_STATE_ENABLE too. Do we need it? */ + rmesa->hw.vap.cmd[VAP_SE_VAP_CNTL] |= R200_VAP_PROG_VTX_SHADER_ENABLE /*| R200_VAP_SINGLE_BUF_STATE_ENABLE*/; + } + R200_STATECHANGE( rmesa, vpi[0] ); + R200_STATECHANGE( rmesa, vpi[1] ); + R200_STATECHANGE( rmesa, vpp[0] ); + R200_STATECHANGE( rmesa, vpp[1] ); + } break; case GL_FRAGMENT_SHADER_ATI: @@ -2310,6 +2375,8 @@ void r200ValidateState( GLcontext *ctx ) r200UpdateLocalViewer( ctx ); } +/* FIXME: don't really need most of these when vertex progs are enabled */ + /* Need an event driven matrix update? */ if (new_state & (_NEW_MODELVIEW|_NEW_PROJECTION)) @@ -2340,6 +2407,16 @@ void r200ValidateState( GLcontext *ctx ) r200UpdateClipPlanes( ctx ); } + if (new_state & (_NEW_PROGRAM| + /* need to test for pretty much anything due to possible parameter bindings */ + _NEW_MODELVIEW|_NEW_PROJECTION|_NEW_TRANSFORM| + _NEW_LIGHT|_NEW_TEXTURE|_NEW_TEXTURE_MATRIX| + _NEW_FOG|_NEW_POINT|_NEW_TRACK_MATRIX)) { + if (ctx->VertexProgram._Enabled) { + r200SetupVertexProg( ctx ); + } + else TCL_FALLBACK(ctx, R200_TCL_FALLBACK_VERTEX_PROGRAM, 0); + } rmesa->NewGLState = 0; } diff --git a/src/mesa/drivers/dri/r200/r200_state_init.c b/src/mesa/drivers/dri/r200/r200_state_init.c index 14616b09f41..266beb61b6a 100644 --- a/src/mesa/drivers/dri/r200/r200_state_init.c +++ b/src/mesa/drivers/dri/r200/r200_state_init.c @@ -93,6 +93,19 @@ static int cmdvec( int offset, int stride, int count ) return h.i; } +/* warning: the count here is divided by 4 compared to other cmds + (so it doesn't exceed the char size)! */ +static int cmdveclinear( int offset, int count ) +{ + drm_radeon_cmd_header_t h; + h.i = 0; + h.veclinear.cmd_type = RADEON_CMD_VECLINEAR; + h.veclinear.addr_lo = offset & 0xff; + h.veclinear.addr_hi = (offset & 0xff00) >> 8; + h.veclinear.count = count; + return h.i; +} + static int cmdscl( int offset, int stride, int count ) { drm_radeon_cmd_header_t h; @@ -126,12 +139,27 @@ static GLboolean check_##NM( GLcontext *ctx, int idx ) \ #define TCL_CHECK( NM, FLAG ) \ static GLboolean check_##NM( GLcontext *ctx, int idx ) \ +{ \ + r200ContextPtr rmesa = R200_CONTEXT(ctx); \ + (void) idx; \ + return !rmesa->TclFallback && !ctx->VertexProgram._Enabled && (FLAG); \ +} + +#define TCL_OR_VP_CHECK( NM, FLAG ) \ +static GLboolean check_##NM( GLcontext *ctx, int idx ) \ { \ r200ContextPtr rmesa = R200_CONTEXT(ctx); \ (void) idx; \ return !rmesa->TclFallback && (FLAG); \ } +#define VP_CHECK( NM, FLAG ) \ +static GLboolean check_##NM( GLcontext *ctx, int idx ) \ +{ \ + r200ContextPtr rmesa = R200_CONTEXT(ctx); \ + (void) idx; \ + return !rmesa->TclFallback && ctx->VertexProgram._Enabled && (FLAG); \ +} CHECK( always, GL_TRUE ) @@ -150,7 +178,11 @@ TCL_CHECK( tcl, GL_TRUE ) TCL_CHECK( tcl_tex, rmesa->state.texture.unit[idx].unitneeded ) TCL_CHECK( tcl_lighting, ctx->Light.Enabled ) TCL_CHECK( tcl_light, ctx->Light.Enabled && ctx->Light.Light[idx].Enabled ) -TCL_CHECK( tcl_ucp, (ctx->Transform.ClipPlanesEnabled & (1 << idx)) ) +TCL_OR_VP_CHECK( tcl_ucp, (ctx->Transform.ClipPlanesEnabled & (1 << idx)) ) +TCL_OR_VP_CHECK( tcl_or_vp, GL_TRUE ) +VP_CHECK( tcl_vp, GL_TRUE ) +VP_CHECK( tcl_vp_size, ctx->VertexProgram.Current->Base.NumNativeInstructions > 64 ) +VP_CHECK( tcl_vpp_size, ctx->VertexProgram.Current->Base.NumNativeParameters > 96 ) /* Initialize the context's hardware state. @@ -307,13 +339,27 @@ void r200InitState( r200ContextPtr rmesa ) ALLOC_STATE( cube[4], never, CUBE_STATE_SIZE, "CUBE/tex-4", 4 ); ALLOC_STATE( cube[5], never, CUBE_STATE_SIZE, "CUBE/tex-5", 5 ); } - - ALLOC_STATE( tcl, tcl, TCL_STATE_SIZE, "TCL/tcl", 0 ); + if (rmesa->r200Screen->drmSupportsVertexProgram) { + ALLOC_STATE( pvs, tcl_vp, PVS_STATE_SIZE, "PVS/pvscntl", 0 ); + ALLOC_STATE( vpi[0], tcl_vp, VPI_STATE_SIZE, "VP/vertexprog-0", 0 ); + ALLOC_STATE( vpi[1], tcl_vp_size, VPI_STATE_SIZE, "VP/vertexprog-1", 1 ); + ALLOC_STATE( vpp[0], tcl_vp, VPP_STATE_SIZE, "VPP/vertexparam-0", 0 ); + ALLOC_STATE( vpp[1], tcl_vpp_size, VPP_STATE_SIZE, "VPP/vertexparam-1", 1 ); + } + else { + ALLOC_STATE( pvs, never, PVS_STATE_SIZE, "PVS/pvscntl", 0 ); + ALLOC_STATE( vpi[0], never, VPI_STATE_SIZE, "VP/vertexprog-0", 0 ); + ALLOC_STATE( vpi[1], never, VPI_STATE_SIZE, "VP/vertexprog-1", 1 ); + ALLOC_STATE( vpp[0], never, VPP_STATE_SIZE, "VPP/vertexparam-0", 0 ); + ALLOC_STATE( vpp[1], never, VPP_STATE_SIZE, "VPP/vertexparam-1", 1 ); + } + /* FIXME: this atom has two commands, we need only one (ucp_vert_blend) for vp */ + ALLOC_STATE( tcl, tcl_or_vp, TCL_STATE_SIZE, "TCL/tcl", 0 ); ALLOC_STATE( msl, tcl, MSL_STATE_SIZE, "MSL/matrix-select", 0 ); ALLOC_STATE( tcg, tcl, TCG_STATE_SIZE, "TCG/texcoordgen", 0 ); ALLOC_STATE( mtl[0], tcl_lighting, MTL_STATE_SIZE, "MTL0/material0", 0 ); ALLOC_STATE( mtl[1], tcl_lighting, MTL_STATE_SIZE, "MTL1/material1", 1 ); - ALLOC_STATE( grd, tcl, GRD_STATE_SIZE, "GRD/guard-band", 0 ); + ALLOC_STATE( grd, tcl_or_vp, GRD_STATE_SIZE, "GRD/guard-band", 0 ); ALLOC_STATE( fog, fog, FOG_STATE_SIZE, "FOG/fog", 0 ); ALLOC_STATE( glt, tcl_lighting, GLT_STATE_SIZE, "GLT/light-global", 0 ); ALLOC_STATE( eye, tcl_lighting, EYE_STATE_SIZE, "EYE/eye-vector", 0 ); @@ -411,6 +457,7 @@ void r200InitState( r200ContextPtr rmesa ) } rmesa->hw.afs[0].cmd[AFS_CMD_0] = cmdpkt(R200_EMIT_PP_AFS_0); rmesa->hw.afs[1].cmd[AFS_CMD_0] = cmdpkt(R200_EMIT_PP_AFS_1); + rmesa->hw.pvs.cmd[PVS_CMD_0] = cmdpkt(R200_EMIT_VAP_PVS_CNTL); rmesa->hw.cube[0].cmd[CUBE_CMD_0] = cmdpkt(R200_EMIT_PP_CUBIC_FACES_0); rmesa->hw.cube[0].cmd[CUBE_CMD_1] = cmdpkt(R200_EMIT_PP_CUBIC_OFFSETS_0); rmesa->hw.cube[1].cmd[CUBE_CMD_0] = cmdpkt(R200_EMIT_PP_CUBIC_FACES_1); @@ -450,6 +497,15 @@ void r200InitState( r200ContextPtr rmesa ) rmesa->hw.mtl[1].cmd[MTL_CMD_1] = cmdscl2( R200_SS_MAT_1_SHININESS, 1, 1 ); + rmesa->hw.vpi[0].cmd[VPI_CMD_0] = + cmdveclinear( R200_PVS_PROG0, 64 ); + rmesa->hw.vpi[1].cmd[VPI_CMD_0] = + cmdveclinear( R200_PVS_PROG1, 64 ); + rmesa->hw.vpp[0].cmd[VPP_CMD_0] = + cmdveclinear( R200_PVS_PARAM0, 96 ); + rmesa->hw.vpp[1].cmd[VPP_CMD_0] = + cmdveclinear( R200_PVS_PARAM1, 96 ); + rmesa->hw.grd.cmd[GRD_CMD_0] = cmdscl( R200_SS_VERT_GUARD_CLIP_ADJ_ADDR, 1, 4 ); rmesa->hw.fog.cmd[FOG_CMD_0] = diff --git a/src/mesa/drivers/dri/r200/r200_tcl.c b/src/mesa/drivers/dri/r200/r200_tcl.c index c41622debe5..2530e1b1046 100644 --- a/src/mesa/drivers/dri/r200/r200_tcl.c +++ b/src/mesa/drivers/dri/r200/r200_tcl.c @@ -390,27 +390,86 @@ static GLboolean r200_run_tcl_render( GLcontext *ctx, if (rmesa->NewGLState) r200ValidateState( ctx ); + if (!ctx->VertexProgram._Enabled) { /* NOTE: inputs != tnl->render_inputs - these are the untransformed * inputs. */ - if (ctx->Light.Enabled) { - inputs |= VERT_BIT_NORMAL; - } + if (ctx->Light.Enabled) { + inputs |= VERT_BIT_NORMAL; + } - if (ctx->_TriangleCaps & DD_SEPARATE_SPECULAR) { - inputs |= VERT_BIT_COLOR1; - } + if (ctx->_TriangleCaps & DD_SEPARATE_SPECULAR) { + inputs |= VERT_BIT_COLOR1; + } - if ( (ctx->Fog.FogCoordinateSource == GL_FOG_COORD) && ctx->Fog.Enabled ) { - inputs |= VERT_BIT_FOG; - } + if ( (ctx->Fog.FogCoordinateSource == GL_FOG_COORD) && ctx->Fog.Enabled ) { + inputs |= VERT_BIT_FOG; + } - for (i = 0 ; i < ctx->Const.MaxTextureUnits; i++) { - if (ctx->Texture.Unit[i]._ReallyEnabled) { - if (rmesa->TexGenNeedNormals[i]) { - inputs |= VERT_BIT_NORMAL; + for (i = 0 ; i < ctx->Const.MaxTextureUnits; i++) { + if (ctx->Texture.Unit[i]._ReallyEnabled) { + if (rmesa->TexGenNeedNormals[i]) { + inputs |= VERT_BIT_NORMAL; + } + inputs |= VERT_BIT_TEX(i); } - inputs |= VERT_BIT_TEX(i); + } + } + else { + GLuint out_vtxfmt0 = 0; + GLuint out_vtxfmt1 = 0; + GLuint out_compsel = 0; + GLuint vp_out = rmesa->curr_vp_hw->mesa_program.Base.OutputsWritten; + /* can't handle other inputs, generic attribs etc. currently - should never arrive here */ + assert ((rmesa->curr_vp_hw->mesa_program.Base.InputsRead & + ~(VERT_BIT_POS | VERT_BIT_NORMAL | VERT_BIT_COLOR0 | VERT_BIT_COLOR1 | + VERT_BIT_FOG | VERT_BIT_TEX0 | VERT_BIT_TEX1 | VERT_BIT_TEX2 | + VERT_BIT_TEX3 | VERT_BIT_TEX4 | VERT_BIT_TEX5)) == 0); + inputs |= rmesa->curr_vp_hw->mesa_program.Base.InputsRead; + /* FIXME: this is a mess. Not really sure how to set up TCL_OUTPUT_VTXFMT + in "undefined" cases (e.g. output needed later but not written by vertex program or vice versa) + - however misconfiguration here will almost certainly lock up the chip. + I think at the very least we need to enable tcl outputs which we write to. Maybe even need to + fix up a vertex program so an output needed later always gets written? + For now just set the compsel and output_vtxfmt to the outputs written. + However, for simplicity we assume always all 4 values are written which may not be correct + (but I don't know if it could lead to lockups). */ + assert(vp_out & (1 << VERT_RESULT_HPOS)); + out_vtxfmt0 = R200_VTX_XY | R200_VTX_Z0 | R200_VTX_W0; + /* FIXME: need to always enable color_0 otherwise doom3's shadow vp (?) will lock up (?) */ + out_vtxfmt0 |= R200_VTX_FP_RGBA << R200_VTX_COLOR_0_SHIFT; + out_compsel = R200_OUTPUT_XYZW; + if (vp_out & (1 << VERT_RESULT_COL0)) { + out_vtxfmt0 |= R200_VTX_FP_RGBA << R200_VTX_COLOR_0_SHIFT; + out_compsel |= R200_OUTPUT_COLOR_0; + } + if (vp_out & (1 << VERT_RESULT_COL1)) { + out_vtxfmt0 |= R200_VTX_FP_RGBA << R200_VTX_COLOR_1_SHIFT; + out_compsel |= R200_OUTPUT_COLOR_1; + } + /* FIXME: probably not everything is set up for fogc and psiz to work correctly */ + if (vp_out & (1 << VERT_RESULT_FOGC)) { + out_vtxfmt0 |= R200_VTX_DISCRETE_FOG; + out_compsel |= R200_OUTPUT_DISCRETE_FOG; + } + if (vp_out & (1 << VERT_RESULT_PSIZ)) { + out_vtxfmt0 |= R200_VTX_POINT_SIZE; + out_compsel |= R200_OUTPUT_PT_SIZE; + } + for (i = VERT_RESULT_TEX0; i < VERT_RESULT_TEX6; i++) { + if (vp_out & (1 << i)) { + out_vtxfmt1 |= 4 << ((i - VERT_RESULT_TEX0) * 3); + out_compsel |= R200_OUTPUT_TEX_0 << (i - VERT_RESULT_TEX0); + } + } + if ((rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0] != out_vtxfmt0) || + (rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_1] != out_vtxfmt1) || + (rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_COMPSEL] != out_compsel)) { + R200_STATECHANGE( rmesa, vtx ); + rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0] = out_vtxfmt0; + rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_1] = out_vtxfmt1; + rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_COMPSEL] = out_compsel; + /* FIXME: should restore this when disabling vertex programs maybe? */ } } @@ -486,7 +545,8 @@ static void transition_to_swtnl( GLcontext *ctx ) * need to put the card into D3D mode to make it work: */ R200_STATECHANGE( rmesa, vap ); - rmesa->hw.vap.cmd[VAP_SE_VAP_CNTL] &= ~R200_VAP_TCL_ENABLE; + /* not sure if it's strictly necessary to disable VAP_PROG_VTX_SHADER_ENABLE in addition to VAP_TCL_ENABLE) */ + rmesa->hw.vap.cmd[VAP_SE_VAP_CNTL] &= ~(R200_VAP_TCL_ENABLE|R200_VAP_PROG_VTX_SHADER_ENABLE); } static void transition_to_hwtnl( GLcontext *ctx ) @@ -513,6 +573,10 @@ static void transition_to_hwtnl( GLcontext *ctx ) rmesa->hw.vap.cmd[VAP_SE_VAP_CNTL] |= R200_VAP_TCL_ENABLE; rmesa->hw.vap.cmd[VAP_SE_VAP_CNTL] &= ~R200_VAP_FORCE_W_TO_ONE; + if (ctx->VertexProgram._Enabled) { + rmesa->hw.vap.cmd[VAP_SE_VAP_CNTL] |= R200_VAP_PROG_VTX_SHADER_ENABLE; + } + if ( ((rmesa->hw.ctx.cmd[CTX_PP_FOG_COLOR] & R200_FOG_USE_MASK) == R200_FOG_USE_SPEC_ALPHA) && (ctx->Fog.FogCoordinateSource == GL_FOG_COORD )) { diff --git a/src/mesa/drivers/dri/r200/r200_vertprog.c b/src/mesa/drivers/dri/r200/r200_vertprog.c new file mode 100644 index 00000000000..cd008df7aab --- /dev/null +++ b/src/mesa/drivers/dri/r200/r200_vertprog.c @@ -0,0 +1,1180 @@ +/************************************************************************** + +Copyright (C) 2005 Aapo Tahkola. + +All Rights Reserved. + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the "Software"), +to deal in the Software without restriction, including without limitation +on the rights to use, copy, modify, merge, publish, distribute, sub +license, and/or sell copies of the Software, and to permit persons to whom +the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice (including the next +paragraph) shall be included in all copies or substantial portions of the +Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL +THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, +DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR +OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE +USE OR OTHER DEALINGS IN THE SOFTWARE. + +**************************************************************************/ + +/* + * Authors: + * Aapo Tahkola + */ +#include "glheader.h" +#include "macros.h" +#include "enums.h" +#include "program.h" + +#include "r200_context.h" +#include "r200_vertprog.h" +#include "r200_ioctl.h" +#include "r200_tcl.h" +#include "program_instruction.h" +#include "tnl/tnl.h" + +#define SCALAR_FLAG (1<<31) +#define FLAG_MASK (1<<31) +#define OP_MASK (0xf) /* we are unlikely to have more than 15 */ +#define OPN(operator, ip, op) {#operator, OPCODE_##operator, ip, op} + +static struct{ + char *name; + int opcode; + unsigned long ip; /* number of input operands and flags */ + unsigned long op; +}op_names[]={ + OPN(ABS, 1, 1), + OPN(ADD, 2, 1), + OPN(ARL, 1, 1|SCALAR_FLAG), + OPN(DP3, 2, 3|SCALAR_FLAG), + OPN(DP4, 2, 3|SCALAR_FLAG), + OPN(DPH, 2, 3|SCALAR_FLAG), + OPN(DST, 2, 1), + OPN(EX2, 1|SCALAR_FLAG, 4|SCALAR_FLAG), + OPN(EXP, 1|SCALAR_FLAG, 1), + OPN(FLR, 1, 1), + OPN(FRC, 1, 1), + OPN(LG2, 1|SCALAR_FLAG, 4|SCALAR_FLAG), + OPN(LIT, 1, 1), + OPN(LOG, 1|SCALAR_FLAG, 1), + OPN(MAD, 3, 1), + OPN(MAX, 2, 1), + OPN(MIN, 2, 1), + OPN(MOV, 1, 1), + OPN(MUL, 2, 1), + OPN(POW, 2|SCALAR_FLAG, 4|SCALAR_FLAG), + OPN(RCP, 1|SCALAR_FLAG, 4|SCALAR_FLAG), + OPN(RSQ, 1|SCALAR_FLAG, 4|SCALAR_FLAG), + OPN(SGE, 2, 1), + OPN(SLT, 2, 1), + OPN(SUB, 2, 1), + OPN(SWZ, 1, 1), + OPN(XPD, 2, 1), + OPN(RCC, 0, 0), //extra + OPN(PRINT, 0, 0), + OPN(END, 0, 0), +}; +#undef OPN + +static GLboolean r200VertexProgUpdateParams(GLcontext *ctx, struct r200_vertex_program *vp) +{ + r200ContextPtr rmesa = R200_CONTEXT( ctx ); + GLfloat *fcmd = (GLfloat *)&rmesa->hw.vpp[0].cmd[VPP_CMD_0 + 1]; + int pi; + struct vertex_program *mesa_vp = (void *)vp; + struct program_parameter_list *paramList; + drm_radeon_cmd_header_t tmp; + + R200_STATECHANGE( rmesa, vpp[0] ); + R200_STATECHANGE( rmesa, vpp[1] ); + assert(mesa_vp->Base.Parameters); + _mesa_load_state_parameters(ctx, mesa_vp->Base.Parameters); + paramList = mesa_vp->Base.Parameters; + + if(paramList->NumParameters > R200_VSF_MAX_PARAM){ + fprintf(stderr, "%s:Params exhausted\n", __FUNCTION__); + return GL_FALSE; + } + + for(pi = 0; pi < paramList->NumParameters; pi++) { + switch(paramList->Parameters[pi].Type) { + case PROGRAM_STATE_VAR: + case PROGRAM_NAMED_PARAM: + //fprintf(stderr, "%s", vp->Parameters->Parameters[pi].Name); + case PROGRAM_CONSTANT: + *fcmd++ = paramList->ParameterValues[pi][0]; + *fcmd++ = paramList->ParameterValues[pi][1]; + *fcmd++ = paramList->ParameterValues[pi][2]; + *fcmd++ = paramList->ParameterValues[pi][3]; + break; + default: + _mesa_problem(NULL, "Bad param type in %s", __FUNCTION__); + break; + } + if (pi == 95) { + fcmd = (GLfloat *)rmesa->hw.vpp[1].cmd[VPP_CMD_0 + 1]; + } + } + /* hack up the cmd_size so not the whole state atom is emitted always. */ + rmesa->hw.vpp[0].cmd_size = + 1 + 4 * ((paramList->NumParameters > 96) ? 96 : paramList->NumParameters); + tmp.i = rmesa->hw.vpp[0].cmd[VPP_CMD_0]; + tmp.veclinear.count = (paramList->NumParameters > 96) ? 96 : paramList->NumParameters; + rmesa->hw.vpp[0].cmd[VPP_CMD_0] = tmp.i; + if (paramList->NumParameters > 96) { + rmesa->hw.vpp[1].cmd_size = 1 + 4 * (paramList->NumParameters - 96); + tmp.i = rmesa->hw.vpp[1].cmd[VPP_CMD_0]; + tmp.veclinear.count = paramList->NumParameters - 96; + rmesa->hw.vpp[1].cmd[VPP_CMD_0] = tmp.i; + } + return GL_TRUE; +} + +static unsigned long t_dst_mask(GLuint mask) +{ + unsigned long flags = 0; + + if(mask & WRITEMASK_X) flags |= VSF_FLAG_X; + if(mask & WRITEMASK_Y) flags |= VSF_FLAG_Y; + if(mask & WRITEMASK_Z) flags |= VSF_FLAG_Z; + if(mask & WRITEMASK_W) flags |= VSF_FLAG_W; + + return flags; +} + +static unsigned long t_dst(struct prog_dst_register *dst) +{ + switch(dst->File) { + case PROGRAM_TEMPORARY: + return ((dst->Index << R200_VPI_OUT_REG_INDEX_SHIFT) + | R200_VSF_OUT_CLASS_TMP); + case PROGRAM_OUTPUT: + switch (dst->Index) { + case VERT_RESULT_HPOS: + return R200_VSF_OUT_CLASS_RESULT_POS; + case VERT_RESULT_COL0: + return R200_VSF_OUT_CLASS_RESULT_COLOR; + case VERT_RESULT_COL1: + return ((1 << R200_VPI_OUT_REG_INDEX_SHIFT) + | R200_VSF_OUT_CLASS_RESULT_COLOR); + case VERT_RESULT_FOGC: + return R200_VSF_OUT_CLASS_RESULT_FOGC; + case VERT_RESULT_TEX0: + case VERT_RESULT_TEX1: + case VERT_RESULT_TEX2: + case VERT_RESULT_TEX3: + case VERT_RESULT_TEX4: + case VERT_RESULT_TEX5: + return (((dst->Index - VERT_RESULT_TEX0) << R200_VPI_OUT_REG_INDEX_SHIFT) + | R200_VSF_OUT_CLASS_RESULT_TEXC); + case VERT_RESULT_PSIZ: + return R200_VSF_OUT_CLASS_RESULT_POINTSIZE; + default: + fprintf(stderr, "problem in %s, unknown dst output reg %d\n", __FUNCTION__, dst->Index); + exit(0); + return 0; + } + case PROGRAM_ADDRESS: + assert (dst->Index == 0); + return R200_VSF_OUT_CLASS_ADDR; + default: + fprintf(stderr, "problem in %s, unknown register type %d\n", __FUNCTION__, dst->File); + exit(0); + return 0; + } +} + +static unsigned long t_src_class(enum register_file file) +{ + + switch(file){ + case PROGRAM_TEMPORARY: + return VSF_IN_CLASS_TMP; + + case PROGRAM_INPUT: + return VSF_IN_CLASS_ATTR; + + case PROGRAM_LOCAL_PARAM: + case PROGRAM_ENV_PARAM: + case PROGRAM_NAMED_PARAM: + case PROGRAM_STATE_VAR: + return VSF_IN_CLASS_PARAM; + /* + case PROGRAM_OUTPUT: + case PROGRAM_WRITE_ONLY: + case PROGRAM_ADDRESS: + */ + default: + fprintf(stderr, "problem in %s", __FUNCTION__); + exit(0); + } +} + +static __inline unsigned long t_swizzle(GLubyte swizzle) +{ +/* this is in fact a NOP as the Mesa SWIZZLE_* are all identical to VSF_IN_COMPONENT_* */ +/* + switch(swizzle){ + case SWIZZLE_X: return VSF_IN_COMPONENT_X; + case SWIZZLE_Y: return VSF_IN_COMPONENT_Y; + case SWIZZLE_Z: return VSF_IN_COMPONENT_Z; + case SWIZZLE_W: return VSF_IN_COMPONENT_W; + case SWIZZLE_ZERO: return VSF_IN_COMPONENT_ZERO; + case SWIZZLE_ONE: return VSF_IN_COMPONENT_ONE; + default: + fprintf(stderr, "problem in %s", __FUNCTION__); + exit(0); + } +*/ + return swizzle; +} + +#if 0 +static void vp_dump_inputs(struct r200_vertex_program *vp, char *caller) +{ + int i; + + if(vp == NULL){ + fprintf(stderr, "vp null in call to %s from %s\n", __FUNCTION__, caller); + return ; + } + + fprintf(stderr, "%s:<", caller); + for(i=0; i < VERT_ATTRIB_MAX; i++) + fprintf(stderr, "%d ", vp->inputs[i]); + fprintf(stderr, ">\n"); + +} +#endif + +static unsigned long t_src_index(struct r200_vertex_program *vp, struct prog_src_register *src) +{ +/* + int i; + int max_reg = -1; +*/ + if(src->File == PROGRAM_INPUT){ +/* if(vp->inputs[src->Index] != -1) + return vp->inputs[src->Index]; + + for(i=0; i < VERT_ATTRIB_MAX; i++) + if(vp->inputs[i] > max_reg) + max_reg = vp->inputs[i]; + + vp->inputs[src->Index] = max_reg+1;*/ + + //vp_dump_inputs(vp, __FUNCTION__); + assert(vp->inputs[src->Index] != -1); + return vp->inputs[src->Index]; + } else { + if (src->Index < 0) { + fprintf(stderr, "WARNING negative offsets for indirect addressing do not work\n"); + return 0; + } + return src->Index; + } +} + +static unsigned long t_src(struct r200_vertex_program *vp, struct prog_src_register *src) +{ + + return MAKE_VSF_SOURCE(t_src_index(vp, src), + t_swizzle(GET_SWZ(src->Swizzle, 0)), + t_swizzle(GET_SWZ(src->Swizzle, 1)), + t_swizzle(GET_SWZ(src->Swizzle, 2)), + t_swizzle(GET_SWZ(src->Swizzle, 3)), + t_src_class(src->File), + src->NegateBase) | (src->RelAddr << 4); +} + +static unsigned long t_src_scalar(struct r200_vertex_program *vp, struct prog_src_register *src) +{ + + return MAKE_VSF_SOURCE(t_src_index(vp, src), + t_swizzle(GET_SWZ(src->Swizzle, 0)), + t_swizzle(GET_SWZ(src->Swizzle, 0)), + t_swizzle(GET_SWZ(src->Swizzle, 0)), + t_swizzle(GET_SWZ(src->Swizzle, 0)), + t_src_class(src->File), + src->NegateBase ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src->RelAddr << 4); +} + +static unsigned long t_opcode(enum prog_opcode opcode) +{ + + switch(opcode){ + case OPCODE_DST: return R200_VPI_OUT_OP_DST; + case OPCODE_EX2: return R200_VPI_OUT_OP_EX2; + case OPCODE_EXP: return R200_VPI_OUT_OP_EXP; + case OPCODE_FRC: return R200_VPI_OUT_OP_FRC; + case OPCODE_LG2: return R200_VPI_OUT_OP_LG2; + case OPCODE_LOG: return R200_VPI_OUT_OP_LOG; + case OPCODE_MAX: return R200_VPI_OUT_OP_MAX; + case OPCODE_MIN: return R200_VPI_OUT_OP_MIN; + case OPCODE_MUL: return R200_VPI_OUT_OP_MUL; + case OPCODE_RCP: return R200_VPI_OUT_OP_RCP; + case OPCODE_RSQ: return R200_VPI_OUT_OP_RSQ; + case OPCODE_SGE: return R200_VPI_OUT_OP_SGE; + case OPCODE_SLT: return R200_VPI_OUT_OP_SLT; + case OPCODE_DP4: return R200_VPI_OUT_OP_DOT; + case OPCODE_ADD: return R200_VPI_OUT_OP_ADD; + + default: + fprintf(stderr, "%s: Should not be called with opcode %d!", __FUNCTION__, opcode); + } + exit(-1); + return 0; +} + +static unsigned long op_operands(enum prog_opcode opcode) +{ + int i; + + /* Can we trust mesas opcodes to be in order ? */ + for(i=0; i < sizeof(op_names) / sizeof(*op_names); i++) + if(op_names[i].opcode == opcode) + return op_names[i].ip; + + fprintf(stderr, "op %d not found in op_names\n", opcode); + exit(-1); + return 0; +} + +/* TODO: Get rid of t_src_class call */ +#define CMP_SRCS(a, b) ((a.RelAddr != b.RelAddr) || (a.Index != b.Index && \ + ((t_src_class(a.File) == VSF_IN_CLASS_PARAM && \ + t_src_class(b.File) == VSF_IN_CLASS_PARAM) || \ + (t_src_class(a.File) == VSF_IN_CLASS_ATTR && \ + t_src_class(b.File) == VSF_IN_CLASS_ATTR)))) \ + +/* fglrx on rv250 codes up unused sources as follows: + unused but necessary sources are same as previous source, zero-ed out. + unnecessary sources are same as previous source but with VSF_IN_CLASS_NONE set. + i.e. an add (2 args) has its 2nd arg (if you use it as mov) zero-ed out, and 3rd arg + set to VSF_IN_CLASS_NONE. Not sure if strictly necessary. */ + +/* use these simpler definitions. Must obviously not be used with not yet set up regs. + Those are NOT semantically equivalent to the r300 ones, requires code changes */ +#define ZERO_SRC_0 (((o_inst->src0 & ~(0xfff << R200_VPI_IN_X_SHIFT)) \ + | ((R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_X_SHIFT) \ + | (R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_Y_SHIFT) \ + | (R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_Z_SHIFT) \ + | (R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_W_SHIFT)))) + +#define ZERO_SRC_1 (((o_inst->src1 & ~(0xfff << R200_VPI_IN_X_SHIFT)) \ + | ((R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_X_SHIFT) \ + | (R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_Y_SHIFT) \ + | (R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_Z_SHIFT) \ + | (R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_W_SHIFT)))) + +#define ZERO_SRC_2 (((o_inst->src2 & ~(0xfff << R200_VPI_IN_X_SHIFT)) \ + | ((R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_X_SHIFT) \ + | (R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_Y_SHIFT) \ + | (R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_Z_SHIFT) \ + | (R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_W_SHIFT)))) + +#define UNUSED_SRC_0 ((o_inst->src0 & ~15) | 9) + +#define UNUSED_SRC_1 ((o_inst->src1 & ~15) | 9) + +#define UNUSED_SRC_2 ((o_inst->src2 & ~15) | 9) + + +/* DP4 version seems to trigger some hw peculiarity - fglrx does this on r200 however */ +#define PREFER_DP4 + +static GLboolean r200_translate_vertex_program(struct r200_vertex_program *vp) +{ + struct vertex_program *mesa_vp = (void *)vp; + struct prog_instruction *vpi; + int i; + VERTEX_SHADER_INSTRUCTION *o_inst; + unsigned long operands; + int are_srcs_scalar; + unsigned long hw_op; + + vp->native = GL_FALSE; + + if ((mesa_vp->Base.InputsRead & + ~(VERT_BIT_POS | VERT_BIT_NORMAL | VERT_BIT_COLOR0 | VERT_BIT_COLOR1 | + VERT_BIT_FOG | VERT_BIT_TEX0 | VERT_BIT_TEX1 | VERT_BIT_TEX2 | + VERT_BIT_TEX3 | VERT_BIT_TEX4 | VERT_BIT_TEX5)) != 0) { + if (R200_DEBUG & DEBUG_FALLBACKS) { + fprintf(stderr, "can't handle vert prog inputs 0x%x\n", + mesa_vp->Base.InputsRead); + } + return GL_FALSE; + } + + /* Initial value should be last tmp reg that hw supports. + Strangely enough r300 doesnt mind even though these would be out of range. + Smart enough to realize that it doesnt need it? */ + int u_temp_i = R200_VSF_MAX_TEMPS - 1; + struct prog_src_register src[3]; + +/* if (getenv("R300_VP_SAFETY")) { + WARN_ONCE("R300_VP_SAFETY enabled.\n"); + + vpi = malloc((mesa_vp->Base.NumInstructions + VSF_MAX_FRAGMENT_TEMPS) * sizeof(struct prog_instruction)); + memset(vpi, 0, VSF_MAX_FRAGMENT_TEMPS * sizeof(struct prog_instruction)); + + for (i=0; i < VSF_MAX_FRAGMENT_TEMPS; i++) { + vpi[i].Opcode = OPCODE_MOV; + vpi[i].StringPos = 0; + vpi[i].Data = 0; + + vpi[i].DstReg.File = PROGRAM_TEMPORARY; + vpi[i].DstReg.Index = i; + vpi[i].DstReg.WriteMask = WRITEMASK_XYZW; + vpi[i].DstReg.CondMask = COND_TR; + + vpi[i].SrcReg[0].File = PROGRAM_STATE_VAR; + vpi[i].SrcReg[0].Index = 0; + vpi[i].SrcReg[0].Swizzle = MAKE_SWIZZLE4(SWIZZLE_ONE, SWIZZLE_ONE, SWIZZLE_ONE, SWIZZLE_ONE); + } + + memcpy(&vpi[i], mesa_vp->Base.Instructions, mesa_vp->Base.NumInstructions * sizeof(struct prog_instruction)); + + free(mesa_vp->Base.Instructions); + + mesa_vp->Base.Instructions = vpi; + + mesa_vp->Base.NumInstructions += VSF_MAX_FRAGMENT_TEMPS; + vpi = &mesa_vp->Base.Instructions[mesa_vp->Base.NumInstructions-1]; + + assert(vpi->Opcode == OPCODE_END); + }*/ +/* FIXME: is changing the prog safe to do here? */ + if (mesa_vp->IsPositionInvariant) { + struct program_parameter_list *paramList; + GLint tokens[6] = { STATE_MATRIX, STATE_MVP, 0, 0, 0, STATE_MATRIX }; + +#ifdef PREFER_DP4 + tokens[5] = STATE_MATRIX; +#else + tokens[5] = STATE_MATRIX_TRANSPOSE; +#endif + paramList = mesa_vp->Base.Parameters; + + vpi = malloc((mesa_vp->Base.NumInstructions + 4) * sizeof(struct prog_instruction)); + memset(vpi, 0, 4 * sizeof(struct prog_instruction)); + + for (i=0; i < 4; i++) { + GLint idx; + tokens[3] = tokens[4] = i; + idx = _mesa_add_state_reference(paramList, tokens); +#ifdef PREFER_DP4 + vpi[i].Opcode = OPCODE_DP4; + vpi[i].StringPos = 0; + vpi[i].Data = 0; + + vpi[i].DstReg.File = PROGRAM_OUTPUT; + vpi[i].DstReg.Index = VERT_RESULT_HPOS; + vpi[i].DstReg.WriteMask = 1 << i; + vpi[i].DstReg.CondMask = COND_TR; + + vpi[i].SrcReg[0].File = PROGRAM_STATE_VAR; + vpi[i].SrcReg[0].Index = idx; + vpi[i].SrcReg[0].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W); + + vpi[i].SrcReg[1].File = PROGRAM_INPUT; + vpi[i].SrcReg[1].Index = VERT_ATTRIB_POS; + vpi[i].SrcReg[1].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W); +#else + if (i == 0) + vpi[i].Opcode = OPCODE_MUL; + else + vpi[i].Opcode = OPCODE_MAD; + + vpi[i].StringPos = 0; + vpi[i].Data = 0; + + if (i == 3) + vpi[i].DstReg.File = PROGRAM_OUTPUT; + else + vpi[i].DstReg.File = PROGRAM_TEMPORARY; + vpi[i].DstReg.Index = 0; + vpi[i].DstReg.WriteMask = 0xf; + vpi[i].DstReg.CondMask = COND_TR; + + vpi[i].SrcReg[0].File = PROGRAM_STATE_VAR; + vpi[i].SrcReg[0].Index = idx; + vpi[i].SrcReg[0].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W); + + vpi[i].SrcReg[1].File = PROGRAM_INPUT; + vpi[i].SrcReg[1].Index = VERT_ATTRIB_POS; + vpi[i].SrcReg[1].Swizzle = MAKE_SWIZZLE4(i, i, i, i); + + if (i > 0) { + vpi[i].SrcReg[2].File = PROGRAM_TEMPORARY; + vpi[i].SrcReg[2].Index = 0; + vpi[i].SrcReg[2].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W); + } +#endif + } + + memcpy(&vpi[i], mesa_vp->Base.Instructions, mesa_vp->Base.NumInstructions * sizeof(struct prog_instruction)); + + free(mesa_vp->Base.Instructions); + + mesa_vp->Base.Instructions = vpi; + + mesa_vp->Base.NumInstructions += 4; + vpi = &mesa_vp->Base.Instructions[mesa_vp->Base.NumInstructions-1]; + + assert(vpi->Opcode == OPCODE_END); + + mesa_vp->Base.InputsRead |= (1 << VERT_ATTRIB_POS); + mesa_vp->Base.OutputsWritten |= (1 << VERT_RESULT_HPOS); + + //fprintf(stderr, "IsPositionInvariant is set!\n"); + //_mesa_print_program(&mesa_vp->Base); + } + + vp->pos_end = 0; + mesa_vp->Base.NumNativeInstructions = 0; + mesa_vp->Base.NumNativeParameters = mesa_vp->Base.Parameters->NumParameters; + + for(i=0; i < VERT_ATTRIB_MAX; i++) + vp->inputs[i] = -1; +/* fglrx uses fixed inputs as follows for conventional attribs. + generic attribs use non-fixed assignment, fglrx will always use the lowest attrib values available. + There are 12 generic attribs possible, corresponding to attrib 0, 2-11 and 13 in a hw vertex prog. + attr 1 and 12 are not available for generic attribs as those cannot be made vec4 (correspond to + vertex normal/weight) + attr 0 is pos, R200_VTX_XY1|R200_VTX_Z1|R200_VTX_W1 in R200_SE_VTX_FMT_0 + attr 2-5 use colors 0-3 (R200_VTX_FP_RGBA << R200_VTX_COLOR_0/1/2/3_SHIFT in R200_SE_VTX_FMT_0) + attr 6-11 use tex 0-5 (4 << R200_VTX_TEX0/1/2/3/4/5_COMP_CNT_SHIFT in R200_SE_VTX_FMT_1) + attr 13 uses vtx1 pos (R200_VTX_XY1|R200_VTX_Z1|R200_VTX_W1 in R200_SE_VTX_FMT_0) + generic attribs would require some more work (dma regions, renaming). */ + +/* may look different when using idx buf / input_route instead of se_vtx_fmt? */ + vp->inputs[VERT_ATTRIB_POS] = 0; + vp->inputs[VERT_ATTRIB_WEIGHT] = 12; + vp->inputs[VERT_ATTRIB_NORMAL] = 1; + vp->inputs[VERT_ATTRIB_COLOR0] = 2; + vp->inputs[VERT_ATTRIB_COLOR1] = 3; + vp->inputs[VERT_ATTRIB_FOG] = 15; + vp->inputs[VERT_ATTRIB_TEX0] = 6; + vp->inputs[VERT_ATTRIB_TEX1] = 7; + vp->inputs[VERT_ATTRIB_TEX2] = 8; + vp->inputs[VERT_ATTRIB_TEX3] = 9; + vp->inputs[VERT_ATTRIB_TEX4] = 10; + vp->inputs[VERT_ATTRIB_TEX5] = 11; +/* attr 4,5 and 13 are only used with generic attribs. + Haven't seen attr 14 used, maybe that's for the hw pointsize vec1 (which is + not possibe to use with vertex progs as it is lacking in vert prog specification) */ + + assert(mesa_vp->Base.OutputsWritten & (1 << VERT_RESULT_HPOS)); + + vp->translated = GL_TRUE; + + o_inst = vp->instr; + for(vpi = mesa_vp->Base.Instructions; vpi->Opcode != OPCODE_END; vpi++, o_inst++){ + if (u_temp_i < mesa_vp->Base.NumTemporaries) { + if (R200_DEBUG & DEBUG_FALLBACKS) { + fprintf(stderr, "Ran out of temps, num temps %d, us %d\n", mesa_vp->Base.NumTemporaries, u_temp_i); + } + return GL_FALSE; + } + u_temp_i = R200_VSF_MAX_TEMPS - 1; + if(o_inst - vp->instr >= R200_VSF_MAX_INST) { + mesa_vp->Base.NumNativeInstructions = 129; + if (R200_DEBUG & DEBUG_FALLBACKS) { + fprintf(stderr, "more than 128 native instructions\n"); + } + return GL_FALSE; + } + + operands = op_operands(vpi->Opcode); + are_srcs_scalar = operands & SCALAR_FLAG; + operands &= OP_MASK; + + for(i=0; i < operands; i++) + src[i] = vpi->SrcReg[i]; + + if(operands == 3){ /* TODO: scalars */ + if( CMP_SRCS(src[1], src[2]) || CMP_SRCS(src[0], src[2]) ){ + o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD, + (u_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP, + VSF_FLAG_ALL); + + o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[2]), + SWIZZLE_X, SWIZZLE_Y, + SWIZZLE_Z, SWIZZLE_W, + t_src_class(src[2].File), VSF_FLAG_NONE) | (src[2].RelAddr << 4); + + o_inst->src1 = ZERO_SRC_0; + o_inst->src2 = UNUSED_SRC_1; + o_inst++; + + src[2].File = PROGRAM_TEMPORARY; + src[2].Index = u_temp_i; + src[2].RelAddr = 0; + u_temp_i--; + } + } + + if(operands >= 2){ + if( CMP_SRCS(src[1], src[0]) ){ + o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD, + (u_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP, + VSF_FLAG_ALL); + + o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]), + SWIZZLE_X, SWIZZLE_Y, + SWIZZLE_Z, SWIZZLE_W, + t_src_class(src[0].File), VSF_FLAG_NONE) | (src[0].RelAddr << 4); + + o_inst->src1 = ZERO_SRC_0; + o_inst->src2 = UNUSED_SRC_1; + o_inst++; + + src[0].File = PROGRAM_TEMPORARY; + src[0].Index = u_temp_i; + src[0].RelAddr = 0; + u_temp_i--; + } + } + + /* These ops need special handling. */ + switch(vpi->Opcode){ + /* FIXME: ARL works fine, but negative offsets won't work - fglrx just sems to ignore neg offsets + which isn't quite correct... */ + case OPCODE_ARL: + o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ARL, t_dst(&vpi->DstReg), + t_dst_mask(vpi->DstReg.WriteMask)); + o_inst->src0 = t_src_scalar(vp, &src[0]); + o_inst->src1 = UNUSED_SRC_0; + o_inst->src2 = UNUSED_SRC_1; + goto next; + + case OPCODE_POW: +/* pow takes only one argument, first scalar is in slot x, 2nd in slot z (other slots don't matter). + So may need to insert additional instruction */ +/* this appears to be different to r300 */ + if ((src[0].File == src[1].File) && + (src[0].Index == src[1].Index)) { + o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_POW, t_dst(&vpi->DstReg), + t_dst_mask(vpi->DstReg.WriteMask)); + o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]), + t_swizzle(GET_SWZ(src[0].Swizzle, 0)), + SWIZZLE_ZERO, + t_swizzle(GET_SWZ(src[1].Swizzle, 0)), + SWIZZLE_ZERO, + t_src_class(src[0].File), + src[0].NegateBase) | (src[0].RelAddr << 4); + o_inst->src1 = UNUSED_SRC_0; + o_inst->src2 = UNUSED_SRC_0; + } + else { + o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD, + (u_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP, + VSF_FLAG_ALL); + o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]), + t_swizzle(GET_SWZ(src[0].Swizzle, 0)), + SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ZERO, + t_src_class(src[0].File), + src[0].NegateBase ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[0].RelAddr << 4); + o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]), + SWIZZLE_ZERO, SWIZZLE_ZERO, + t_swizzle(GET_SWZ(src[1].Swizzle, 0)), SWIZZLE_ZERO, + t_src_class(src[1].File), + src[1].NegateBase ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[1].RelAddr << 4); + o_inst->src2 = UNUSED_SRC_1; + o_inst++; + + o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_POW, t_dst(&vpi->DstReg), + t_dst_mask(vpi->DstReg.WriteMask)); + o_inst->src0 = MAKE_VSF_SOURCE(u_temp_i, + VSF_IN_COMPONENT_X, + VSF_IN_COMPONENT_Y, + VSF_IN_COMPONENT_Z, + VSF_IN_COMPONENT_W, + VSF_IN_CLASS_TMP, + VSF_FLAG_NONE); + o_inst->src1 = UNUSED_SRC_0; + o_inst->src2 = UNUSED_SRC_0; + u_temp_i--; + } + goto next; + + case OPCODE_MOV://ADD RESULT 1.X Y Z W PARAM 0{} {X Y Z W} PARAM 0{} {ZERO ZERO ZERO ZERO} + o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD, t_dst(&vpi->DstReg), + t_dst_mask(vpi->DstReg.WriteMask)); + o_inst->src0 = t_src(vp, &src[0]); + o_inst->src1 = ZERO_SRC_0; + o_inst->src2 = UNUSED_SRC_1; + goto next; + case OPCODE_MAD: + hw_op=(src[0].File == PROGRAM_TEMPORARY && + src[1].File == PROGRAM_TEMPORARY && + src[2].File == PROGRAM_TEMPORARY) ? R200_VPI_OUT_OP_MAD_2 : R200_VPI_OUT_OP_MAD; + + o_inst->op = MAKE_VSF_OP(hw_op, t_dst(&vpi->DstReg), + t_dst_mask(vpi->DstReg.WriteMask)); + o_inst->src0 = t_src(vp, &src[0]); +#if 0 +if ((o_inst - vp->instr) == 31) { +/* fix up the broken vertex program of quake4 demo... */ +o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]), + SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, + t_src_class(src[1].File), + src[1].NegateBase) | (src[1].RelAddr << 4); +o_inst->src2 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]), + SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y, + t_src_class(src[1].File), + src[1].NegateBase) | (src[1].RelAddr << 4); +} +else { + o_inst->src1 = t_src(vp, &src[1]); + o_inst->src2 = t_src(vp, &src[2]); +} +#else + o_inst->src1 = t_src(vp, &src[1]); + o_inst->src2 = t_src(vp, &src[2]); +#endif + goto next; + + case OPCODE_DP3://DOT RESULT 1.X Y Z W PARAM 0{} {X Y Z ZERO} PARAM 0{} {X Y Z ZERO} + o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_DOT, t_dst(&vpi->DstReg), + t_dst_mask(vpi->DstReg.WriteMask)); + + o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]), + t_swizzle(GET_SWZ(src[0].Swizzle, 0)), + t_swizzle(GET_SWZ(src[0].Swizzle, 1)), + t_swizzle(GET_SWZ(src[0].Swizzle, 2)), + SWIZZLE_ZERO, + t_src_class(src[0].File), + src[0].NegateBase) | (src[0].RelAddr << 4); + + o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]), + t_swizzle(GET_SWZ(src[1].Swizzle, 0)), + t_swizzle(GET_SWZ(src[1].Swizzle, 1)), + t_swizzle(GET_SWZ(src[1].Swizzle, 2)), + SWIZZLE_ZERO, + t_src_class(src[1].File), + src[1].NegateBase) | (src[1].RelAddr << 4); + + o_inst->src2 = UNUSED_SRC_1; + goto next; + + case OPCODE_SUB://ADD RESULT 1.X Y Z W TMP 0{} {X Y Z W} PARAM 1{X Y Z W } {X Y Z W} neg Xneg Yneg Zneg W + o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD, t_dst(&vpi->DstReg), + t_dst_mask(vpi->DstReg.WriteMask)); + + o_inst->src0 = t_src(vp, &src[0]); + o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]), + t_swizzle(GET_SWZ(src[1].Swizzle, 0)), + t_swizzle(GET_SWZ(src[1].Swizzle, 1)), + t_swizzle(GET_SWZ(src[1].Swizzle, 2)), + t_swizzle(GET_SWZ(src[1].Swizzle, 3)), + t_src_class(src[1].File), + (!src[1].NegateBase) ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[1].RelAddr << 4); + o_inst->src2 = UNUSED_SRC_1; + goto next; + + case OPCODE_ABS://MAX RESULT 1.X Y Z W PARAM 0{} {X Y Z W} PARAM 0{X Y Z W } {X Y Z W} neg Xneg Yneg Zneg W + o_inst->op=MAKE_VSF_OP(R200_VPI_OUT_OP_MAX, t_dst(&vpi->DstReg), + t_dst_mask(vpi->DstReg.WriteMask)); + + o_inst->src0=t_src(vp, &src[0]); + o_inst->src1=MAKE_VSF_SOURCE(t_src_index(vp, &src[0]), + t_swizzle(GET_SWZ(src[0].Swizzle, 0)), + t_swizzle(GET_SWZ(src[0].Swizzle, 1)), + t_swizzle(GET_SWZ(src[0].Swizzle, 2)), + t_swizzle(GET_SWZ(src[0].Swizzle, 3)), + t_src_class(src[0].File), + (!src[0].NegateBase) ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[0].RelAddr << 4); + o_inst->src2 = UNUSED_SRC_1; + goto next; + + case OPCODE_FLR: + /* FRC TMP 0.X Y Z W PARAM 0{} {X Y Z W} + ADD RESULT 1.X Y Z W PARAM 0{} {X Y Z W} TMP 0{X Y Z W } {X Y Z W} neg Xneg Yneg Zneg W */ + + o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_FRC, + (u_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP, + t_dst_mask(vpi->DstReg.WriteMask)); + + o_inst->src0 = t_src(vp, &src[0]); + o_inst->src1 = UNUSED_SRC_0; + o_inst->src2 = UNUSED_SRC_1; + o_inst++; + + o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD, t_dst(&vpi->DstReg), + t_dst_mask(vpi->DstReg.WriteMask)); + + o_inst->src0 = t_src(vp, &src[0]); + o_inst->src1 = MAKE_VSF_SOURCE(u_temp_i, + VSF_IN_COMPONENT_X, + VSF_IN_COMPONENT_Y, + VSF_IN_COMPONENT_Z, + VSF_IN_COMPONENT_W, + VSF_IN_CLASS_TMP, + /* Not 100% sure about this */ + (!src[0].NegateBase) ? VSF_FLAG_ALL : VSF_FLAG_NONE/*VSF_FLAG_ALL*/); + + o_inst->src2 = UNUSED_SRC_0; + u_temp_i--; + goto next; + + case OPCODE_LG2:// LG2 RESULT 1.X Y Z W PARAM 0{} {X X X X} + o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_LG2, t_dst(&vpi->DstReg), + t_dst_mask(vpi->DstReg.WriteMask)); + + o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]), + t_swizzle(GET_SWZ(src[0].Swizzle, 0)), + t_swizzle(GET_SWZ(src[0].Swizzle, 0)), + t_swizzle(GET_SWZ(src[0].Swizzle, 0)), + t_swizzle(GET_SWZ(src[0].Swizzle, 0)), + t_src_class(src[0].File), + src[0].NegateBase ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[0].RelAddr << 4); + o_inst->src1 = UNUSED_SRC_0; + o_inst->src2 = UNUSED_SRC_0; + goto next; + + case OPCODE_LIT://LIT TMP 1.Y Z TMP 1{} {X W Z Y} TMP 1{} {Y W Z X} TMP 1{} {Y X Z W} + o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_LIT, t_dst(&vpi->DstReg), + t_dst_mask(vpi->DstReg.WriteMask)); +/* r200 in contrast to r300 does not seem to need any complicated setup, + its LIT instruction is "more native" */ + o_inst->src0 = t_src(vp, &src[0]); + o_inst->src1 = UNUSED_SRC_0; + o_inst->src2 = UNUSED_SRC_0; + goto next; + + case OPCODE_DPH://DOT RESULT 1.X Y Z W PARAM 0{} {X Y Z ONE} PARAM 0{} {X Y Z W} + o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_DOT, t_dst(&vpi->DstReg), + t_dst_mask(vpi->DstReg.WriteMask)); + + o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]), + t_swizzle(GET_SWZ(src[0].Swizzle, 0)), + t_swizzle(GET_SWZ(src[0].Swizzle, 1)), + t_swizzle(GET_SWZ(src[0].Swizzle, 2)), + VSF_IN_COMPONENT_ONE, + t_src_class(src[0].File), + src[0].NegateBase) | (src[0].RelAddr << 4); + o_inst->src1 = t_src(vp, &src[1]); + o_inst->src2 = UNUSED_SRC_1; + goto next; + + case OPCODE_XPD: + /* mul r0, r1.yzxw, r2.zxyw + mad r0, -r2.yzxw, r1.zxyw, r0 + NOTE: might need MAD_2 + */ + + o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_MUL, + (u_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP, + t_dst_mask(vpi->DstReg.WriteMask)); + + o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]), + t_swizzle(GET_SWZ(src[0].Swizzle, 1)), // y + t_swizzle(GET_SWZ(src[0].Swizzle, 2)), // z + t_swizzle(GET_SWZ(src[0].Swizzle, 0)), // x + t_swizzle(GET_SWZ(src[0].Swizzle, 3)), // w + t_src_class(src[0].File), + src[0].NegateBase) | (src[0].RelAddr << 4); + + o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]), + t_swizzle(GET_SWZ(src[1].Swizzle, 2)), // z + t_swizzle(GET_SWZ(src[1].Swizzle, 0)), // x + t_swizzle(GET_SWZ(src[1].Swizzle, 1)), // y + t_swizzle(GET_SWZ(src[1].Swizzle, 3)), // w + t_src_class(src[1].File), + src[1].NegateBase) | (src[1].RelAddr << 4); + + o_inst->src2 = UNUSED_SRC_1; + o_inst++; + u_temp_i--; + + o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_MAD, t_dst(&vpi->DstReg), + t_dst_mask(vpi->DstReg.WriteMask)); + + o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]), + t_swizzle(GET_SWZ(src[1].Swizzle, 1)), // y + t_swizzle(GET_SWZ(src[1].Swizzle, 2)), // z + t_swizzle(GET_SWZ(src[1].Swizzle, 0)), // x + t_swizzle(GET_SWZ(src[1].Swizzle, 3)), // w + t_src_class(src[1].File), + (!src[1].NegateBase) ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[1].RelAddr << 4); + + o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]), + t_swizzle(GET_SWZ(src[0].Swizzle, 2)), // z + t_swizzle(GET_SWZ(src[0].Swizzle, 0)), // x + t_swizzle(GET_SWZ(src[0].Swizzle, 1)), // y + t_swizzle(GET_SWZ(src[0].Swizzle, 3)), // w + t_src_class(src[0].File), + src[0].NegateBase) | (src[0].RelAddr << 4); + + o_inst->src2 = MAKE_VSF_SOURCE(u_temp_i+1, + VSF_IN_COMPONENT_X, + VSF_IN_COMPONENT_Y, + VSF_IN_COMPONENT_Z, + VSF_IN_COMPONENT_W, + VSF_IN_CLASS_TMP, + VSF_FLAG_NONE); + + goto next; + + case OPCODE_SWZ: + o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD, t_dst(&vpi->DstReg), + t_dst_mask(vpi->DstReg.WriteMask)); + o_inst->src0 = t_src(vp, &src[0]); + o_inst->src1 = ZERO_SRC_0; + o_inst->src2 = UNUSED_SRC_1; + goto next; + + case OPCODE_RCC: + if (R200_DEBUG & DEBUG_FALLBACKS) { + fprintf(stderr, "Don't know how to handle op %d yet\n", vpi->Opcode); + } + return GL_FALSE; + break; + case OPCODE_END: + break; + default: + break; + } + + o_inst->op = MAKE_VSF_OP(t_opcode(vpi->Opcode), t_dst(&vpi->DstReg), + t_dst_mask(vpi->DstReg.WriteMask)); + + if(are_srcs_scalar){ + switch(operands){ + case 1: + o_inst->src0 = t_src_scalar(vp, &src[0]); + o_inst->src1 = UNUSED_SRC_0; + o_inst->src2 = UNUSED_SRC_1; + break; + + case 2: + o_inst->src0 = t_src_scalar(vp, &src[0]); + o_inst->src1 = t_src_scalar(vp, &src[1]); + o_inst->src2 = UNUSED_SRC_1; + break; + + case 3: + o_inst->src0 = t_src_scalar(vp, &src[0]); + o_inst->src1 = t_src_scalar(vp, &src[1]); + o_inst->src2 = t_src_scalar(vp, &src[2]); + break; + + default: + fprintf(stderr, "illegal number of operands %lu\n", operands); + exit(-1); + break; + } + } else { + switch(operands){ + case 1: + o_inst->src0 = t_src(vp, &src[0]); + o_inst->src1 = UNUSED_SRC_0; + o_inst->src2 = UNUSED_SRC_1; + break; + + case 2: + o_inst->src0 = t_src(vp, &src[0]); + o_inst->src1 = t_src(vp, &src[1]); + o_inst->src2 = UNUSED_SRC_1; + break; + + case 3: + o_inst->src0 = t_src(vp, &src[0]); + o_inst->src1 = t_src(vp, &src[1]); + o_inst->src2 = t_src(vp, &src[2]); + break; + + default: + fprintf(stderr, "illegal number of operands %lu\n", operands); + exit(-1); + break; + } + } + next: + if ((o_inst->op & R200_VSF_OUT_CLASS_MASK) == R200_VSF_OUT_CLASS_RESULT_POS) { + vp->pos_end = (o_inst - vp->instr); + } + } + + /* need to test again since some instructions require more than one (up to 3) native inst */ + if(o_inst - vp->instr > R200_VSF_MAX_INST) { + mesa_vp->Base.NumNativeInstructions = 129; + if (R200_DEBUG & DEBUG_FALLBACKS) { + fprintf(stderr, "more than 128 native instructions\n"); + } + return GL_FALSE; + } + vp->native = GL_TRUE; + mesa_vp->Base.NumNativeInstructions = (o_inst - vp->instr); +#if 0 + fprintf(stderr, "hw program:\n"); + for(i=0; i < vp->program.length; i++) + fprintf(stderr, "%08x\n", vp->instr[i]); +#endif + return GL_TRUE; +} + +void r200SetupVertexProg( GLcontext *ctx ) { + r200ContextPtr rmesa = R200_CONTEXT(ctx); + struct r200_vertex_program *vp = (struct r200_vertex_program *)ctx->VertexProgram.Current; + GLboolean fallback; + GLint i; + + if (!vp->translated) { + rmesa->curr_vp_hw = NULL; + r200_translate_vertex_program(vp); + } + /* could optimize setting up vertex progs away for non-tcl hw */ + fallback = !(vp->native && r200VertexProgUpdateParams(ctx, vp) && + rmesa->r200Screen->drmSupportsVertexProgram); + TCL_FALLBACK(ctx, R200_TCL_FALLBACK_VERTEX_PROGRAM, fallback); + if (fallback) return; + + R200_STATECHANGE( rmesa, pvs ); + + rmesa->hw.pvs.cmd[PVS_CNTL_1] = (0 << R200_PVS_CNTL_1_PROGRAM_START_SHIFT) | + ((vp->mesa_program.Base.NumNativeInstructions - 1) << R200_PVS_CNTL_1_PROGRAM_END_SHIFT) | + (vp->pos_end << R200_PVS_CNTL_1_POS_END_SHIFT); + rmesa->hw.pvs.cmd[PVS_CNTL_2] = (0 << R200_PVS_CNTL_2_PARAM_OFFSET_SHIFT) | + (vp->mesa_program.Base.NumNativeParameters << R200_PVS_CNTL_2_PARAM_COUNT_SHIFT); + + /* maybe user clip planes just work with vertex progs... untested */ + if (ctx->Transform.ClipPlanesEnabled) { + R200_STATECHANGE( rmesa, tcl ); + if (vp->mesa_program.IsPositionInvariant) { + rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] |= (ctx->Transform.ClipPlanesEnabled << 2); + } + else { + rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] &= ~(0xfc); + } + } + + if (vp != rmesa->curr_vp_hw) { + GLuint count = vp->mesa_program.Base.NumNativeInstructions; + drm_radeon_cmd_header_t tmp; + + R200_STATECHANGE( rmesa, vpi[0] ); + R200_STATECHANGE( rmesa, vpi[1] ); + + /* FIXME: what about using a memcopy... */ + for (i = 0; (i < 64) && i < count; i++) { + rmesa->hw.vpi[0].cmd[VPI_OPDST_0 + 4 * i] = vp->instr[i].op; + rmesa->hw.vpi[0].cmd[VPI_SRC0_0 + 4 * i] = vp->instr[i].src0; + rmesa->hw.vpi[0].cmd[VPI_SRC1_0 + 4 * i] = vp->instr[i].src1; + rmesa->hw.vpi[0].cmd[VPI_SRC2_0 + 4 * i] = vp->instr[i].src2; + } + /* hack up the cmd_size so not the whole state atom is emitted always. + This may require some more thought, we may emit half progs on lost state, but + hopefully it won't matter? + WARNING: must not use R200_DB_STATECHANGE, this will produce bogus (and rejected) + packet emits (due to the mismatched cmd_size and count in cmd/last_cmd) */ + rmesa->hw.vpi[0].cmd_size = 1 + 4 * ((count > 64) ? 64 : count); + tmp.i = rmesa->hw.vpi[0].cmd[VPI_CMD_0]; + tmp.veclinear.count = (count > 64) ? 64 : count; + rmesa->hw.vpi[0].cmd[VPI_CMD_0] = tmp.i; + if (count > 64) { + for (i = 0; i < (count - 64); i++) { + rmesa->hw.vpi[1].cmd[VPI_OPDST_0 + 4 * i] = vp->instr[i + 64].op; + rmesa->hw.vpi[1].cmd[VPI_SRC0_0 + 4 * i] = vp->instr[i + 64].src0; + rmesa->hw.vpi[1].cmd[VPI_SRC1_0 + 4 * i] = vp->instr[i + 64].src1; + rmesa->hw.vpi[1].cmd[VPI_SRC2_0 + 4 * i] = vp->instr[i + 64].src2; + } + rmesa->hw.vpi[1].cmd_size = 1 + 4 * (count - 64); + tmp.i = rmesa->hw.vpi[1].cmd[VPI_CMD_0]; + tmp.veclinear.count = count - 64; + rmesa->hw.vpi[1].cmd[VPI_CMD_0] = tmp.i; + } + rmesa->curr_vp_hw = vp; + } +} + + +static void r200BindProgram(GLcontext *ctx, GLenum target, struct program *prog) +{ + r200ContextPtr rmesa = R200_CONTEXT(ctx); + + switch(target){ + case GL_VERTEX_PROGRAM_ARB: + rmesa->curr_vp_hw = NULL; + break; + default: + _mesa_problem(ctx, "Target not supported yet!"); + break; + } +} + +static struct program *r200NewProgram(GLcontext *ctx, GLenum target, GLuint id) +{ + struct r200_vertex_program *vp; + + switch(target){ + case GL_VERTEX_PROGRAM_ARB: + vp = CALLOC_STRUCT(r200_vertex_program); + return _mesa_init_vertex_program(ctx, &vp->mesa_program, target, id); + case GL_FRAGMENT_PROGRAM_ARB: + case GL_FRAGMENT_PROGRAM_NV: + return _mesa_init_fragment_program( ctx, CALLOC_STRUCT(fragment_program), target, id ); + default: + _mesa_problem(ctx, "Bad target in r200NewProgram"); + } + return NULL; +} + + +static void r200DeleteProgram(GLcontext *ctx, struct program *prog) +{ + _mesa_delete_program(ctx, prog); +} + +static void r200ProgramStringNotify(GLcontext *ctx, GLenum target, struct program *prog) +{ + struct r200_vertex_program *vp = (void *)prog; + + switch(target) { + case GL_VERTEX_PROGRAM_ARB: + vp->translated = GL_FALSE; + memset(&vp->translated, 0, sizeof(struct r200_vertex_program) - sizeof(struct vertex_program)); + /*r200_translate_vertex_shader(vp);*/ + break; + } + /* need this for tcl fallbacks */ + _tnl_program_string(ctx, target, prog); +} + +static GLboolean r200IsProgramNative(GLcontext *ctx, GLenum target, struct program *prog) +{ + struct r200_vertex_program *vp = (void *)prog; + + switch(target){ + case GL_VERTEX_STATE_PROGRAM_NV: + case GL_VERTEX_PROGRAM_ARB: + if (!vp->translated) { + r200_translate_vertex_program(vp); + } + /* does not take parameters etc. into account */ + return vp->native; + default: + _mesa_problem(ctx, "Bad target in r200NewProgram"); + } + return 0; +} + +void r200InitShaderFuncs(struct dd_function_table *functions) +{ + functions->NewProgram = r200NewProgram; + functions->BindProgram = r200BindProgram; + functions->DeleteProgram = r200DeleteProgram; + functions->ProgramStringNotify = r200ProgramStringNotify; + functions->IsProgramNative = r200IsProgramNative; +} diff --git a/src/mesa/drivers/dri/r200/r200_vertprog.h b/src/mesa/drivers/dri/r200/r200_vertprog.h new file mode 100644 index 00000000000..00ad2dd1b30 --- /dev/null +++ b/src/mesa/drivers/dri/r200/r200_vertprog.h @@ -0,0 +1,160 @@ +#ifndef __VERTEX_SHADER_H__ +#define __VERTEX_SHADER_H__ + +#include "r200_reg.h" + +typedef struct { + uint32_t op; + uint32_t src0; + uint32_t src1; + uint32_t src2; +} VERTEX_SHADER_INSTRUCTION; + +extern void r200InitShaderFuncs(struct dd_function_table *functions); +extern void r200SetupVertexProg( GLcontext *ctx ); + +#define VSF_FLAG_X 1 +#define VSF_FLAG_Y 2 +#define VSF_FLAG_Z 4 +#define VSF_FLAG_W 8 +#define VSF_FLAG_XYZ (VSF_FLAG_X | VSF_FLAG_Y | VSF_FLAG_Z) +#define VSF_FLAG_ALL 0xf +#define VSF_FLAG_NONE 0 + +#define R200_VSF_MAX_INST 128 +#define R200_VSF_MAX_PARAM 192 +#define R200_VSF_MAX_TEMPS 12 + +#define R200_VPI_OUT_REG_INDEX_SHIFT 13 +#define R200_VPI_OUT_REG_INDEX_MASK (31 << 13) /* GUESS based on fglrx native limits */ + +#define R200_VPI_OUT_WRITE_X (1 << 20) +#define R200_VPI_OUT_WRITE_Y (1 << 21) +#define R200_VPI_OUT_WRITE_Z (1 << 22) +#define R200_VPI_OUT_WRITE_W (1 << 23) + +#define R200_VPI_IN_REG_CLASS_TEMPORARY (0 << 0) +#define R200_VPI_IN_REG_CLASS_ATTRIBUTE (1 << 0) +#define R200_VPI_IN_REG_CLASS_PARAMETER (2 << 0) +#define R200_VPI_IN_REG_CLASS_NONE (9 << 0) +#define R200_VPI_IN_REG_CLASS_MASK (31 << 0) /* GUESS */ + +#define R200_VPI_IN_REG_INDEX_SHIFT 5 +#define R200_VPI_IN_REG_INDEX_MASK (255 << 5) /* GUESS based on fglrx native limits */ + +/* The R200 can select components from the input register arbitrarily. +// Use the following constants, shifted by the component shift you +// want to select */ +#define R200_VPI_IN_SELECT_X 0 +#define R200_VPI_IN_SELECT_Y 1 +#define R200_VPI_IN_SELECT_Z 2 +#define R200_VPI_IN_SELECT_W 3 +#define R200_VPI_IN_SELECT_ZERO 4 +#define R200_VPI_IN_SELECT_ONE 5 +#define R200_VPI_IN_SELECT_MASK 7 + +#define R200_VPI_IN_X_SHIFT 13 +#define R200_VPI_IN_Y_SHIFT 16 +#define R200_VPI_IN_Z_SHIFT 19 +#define R200_VPI_IN_W_SHIFT 22 + +#define R200_VPI_IN_NEG_X (1 << 25) +#define R200_VPI_IN_NEG_Y (1 << 26) +#define R200_VPI_IN_NEG_Z (1 << 27) +#define R200_VPI_IN_NEG_W (1 << 28) + +#define R200_VSF_OUT_CLASS_TMP (0 << 8) +#define R200_VSF_OUT_CLASS_ADDR (3 << 8) +#define R200_VSF_OUT_CLASS_RESULT_POS (4 << 8) +#define R200_VSF_OUT_CLASS_RESULT_COLOR (5 << 8) +#define R200_VSF_OUT_CLASS_RESULT_TEXC (6 << 8) +#define R200_VSF_OUT_CLASS_RESULT_FOGC (7 << 8) +#define R200_VSF_OUT_CLASS_RESULT_POINTSIZE (8 << 8) +#define R200_VSF_OUT_CLASS_MASK (31 << 8) + +/* opcodes - they all are the same as on r300 it seems */ +#define R200_VPI_OUT_OP_DOT (1 << 0) +#define R200_VPI_OUT_OP_MUL (2 << 0) +#define R200_VPI_OUT_OP_ADD (3 << 0) +#define R200_VPI_OUT_OP_MAD (4 << 0) +#define R200_VPI_OUT_OP_DST (5 << 0) +#define R200_VPI_OUT_OP_FRC (6 << 0) +#define R200_VPI_OUT_OP_MAX (7 << 0) +#define R200_VPI_OUT_OP_MIN (8 << 0) +#define R200_VPI_OUT_OP_SGE (9 << 0) +#define R200_VPI_OUT_OP_SLT (10 << 0) + +#define R200_VPI_OUT_OP_ARL (13 << 0) + +#define R200_VPI_OUT_OP_EXP (65 << 0) +#define R200_VPI_OUT_OP_LOG (66 << 0) + +#define R200_VPI_OUT_OP_LIT (68 << 0) +#define R200_VPI_OUT_OP_POW (69 << 0) +#define R200_VPI_OUT_OP_RCP (70 << 0) +#define R200_VPI_OUT_OP_RSQ (72 << 0) + +#define R200_VPI_OUT_OP_EX2 (75 << 0) +#define R200_VPI_OUT_OP_LG2 (76 << 0) + +#define R200_VPI_OUT_OP_MAD_2 (128 << 0) + +/* first CARD32 of an instruction */ + +/* possible operations: + DOT, MUL, ADD, MAD, FRC, MAX, MIN, SGE, SLT, EXP, LOG, LIT, POW, RCP, RSQ, EX2, + LG2, MAD_2, ARL */ + +#define MAKE_VSF_OP(op, out_reg, out_reg_fields) \ + ((op) | (out_reg) | ((out_reg_fields) << 20) ) + +#define VSF_IN_CLASS_TMP 0 +#define VSF_IN_CLASS_ATTR 1 +#define VSF_IN_CLASS_PARAM 2 +#define VSF_IN_CLASS_NONE 9 + +#define VSF_IN_COMPONENT_X 0 +#define VSF_IN_COMPONENT_Y 1 +#define VSF_IN_COMPONENT_Z 2 +#define VSF_IN_COMPONENT_W 3 +#define VSF_IN_COMPONENT_ZERO 4 +#define VSF_IN_COMPONENT_ONE 5 + +#define MAKE_VSF_SOURCE(in_reg_index, comp_x, comp_y, comp_z, comp_w, class, negate) \ + ( ((in_reg_index)<TclFallback || rmesa->vb.fell_back || ctx->CompileFlag || - (ctx->Fog.Enabled && (ctx->Fog.FogCoordinateSource == GL_FOG_COORD))) + (ctx->Fog.Enabled && (ctx->Fog.FogCoordinateSource == GL_FOG_COORD)) || + /* TODO: set tcl out fmt/compsel and reenable vtxfmt code */ + ctx->VertexProgram._Enabled) return GL_FALSE; if (ctx->Driver.NeedFlush & FLUSH_UPDATE_CURRENT)