r300: finish blit support for r300
[mesa.git] / src / mesa / drivers / dri / r300 / r300_swtcl.c
index 78fa031479ceaf9dfc733b081ef2f7c162a912ed..383c8a274b5345cbc5428c3fefeb6e67bd13aca8 100644 (file)
@@ -38,12 +38,14 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r300_swtcl.h"
 #include "r300_emit.h"
 #include "r300_tex.h"
+#include "r300_render.h"
+#include "main/simple_list.h"
 
 #define EMIT_ATTR( ATTR, STYLE )                                       \
 do {                                                                   \
-   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].attrib = (ATTR);    \
-   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].format = (STYLE);   \
-   rmesa->radeon.swtcl.vertex_attr_count++;                                    \
+       rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].attrib = (ATTR);        /* fill the next free vertex_attrs[] slot: attribute id */\
+       rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].format = (STYLE);       /* ...and its emit style (callers pass EMIT_* values) */\
+       rmesa->radeon.swtcl.vertex_attr_count++;                                        /* claim the slot; expects a local `rmesa` in scope */\
 } while (0)
 
 #define EMIT_PAD( N )                                                  \
@@ -54,167 +56,93 @@ do {                                                                       \
    rmesa->radeon.swtcl.vertex_attr_count++;                                    \
 } while (0)
 
-#define ADD_ATTR(_attr, _format, _dst_loc, _swizzle, _write_mask) \
+#define ADD_ATTR(_attr, _format, _dst_loc, _swizzle, _write_mask, _normalize) /* append one entry to the local attrs[] array and bump the local num_attrs */\
 do { \
-       attrs[num_attrs].attr = (_attr); \
-       attrs[num_attrs].format = (_format); \
+       attrs[num_attrs].element = (_attr); /* callers pass VERT_ATTRIB_* ids */\
+       attrs[num_attrs].data_type = (_format); /* callers now pass R300_DATA_TYPE_* values instead of EMIT_* styles */\
        attrs[num_attrs].dst_loc = (_dst_loc); \
        attrs[num_attrs].swizzle = (_swizzle); \
        attrs[num_attrs].write_mask = (_write_mask); \
+       attrs[num_attrs]._signed = 0; /* always cleared: this macro never marks an attribute as signed */\
+       attrs[num_attrs].normalize = (_normalize); /* callers pass 1 for the byte colour attributes, 0 for float data */\
        ++num_attrs; \
 } while (0)
 
-static void r300SwtclVAPSetup(GLcontext *ctx, GLuint InputsRead, GLuint OutputsWritten, GLuint vap_out_fmt_1)
-{
-       r300ContextPtr rmesa = R300_CONTEXT( ctx );
-       struct vertex_attribute *attrs = rmesa->swtcl.vert_attrs;
-       int i, j, reg_count;
-       uint32_t *vir0 = &rmesa->hw.vir[0].cmd[1];
-       uint32_t *vir1 = &rmesa->hw.vir[1].cmd[1];
-
-       for (i = 0; i < R300_VIR_CMDSIZE-1; ++i)
-               vir0[i] = vir1[i] = 0;
-
-       for (i = 0, j = 0; i < rmesa->radeon.swtcl.vertex_attr_count; ++i) {
-               int tmp, data_format;
-               switch (attrs[i].format) {
-                       case EMIT_1F:
-                               data_format = R300_DATA_TYPE_FLOAT_1;
-                               break;
-                       case EMIT_2F:
-                               data_format = R300_DATA_TYPE_FLOAT_2;
-                               break;
-                       case EMIT_3F:
-                               data_format = R300_DATA_TYPE_FLOAT_3;
-                               break;
-                       case EMIT_4F:
-                               data_format = R300_DATA_TYPE_FLOAT_4;
-                               break;
-                       case EMIT_4UB_4F_RGBA:
-                       case EMIT_4UB_4F_ABGR:
-                               data_format = R300_DATA_TYPE_BYTE | R300_NORMALIZE;
-                               break;
-                       default:
-                               fprintf(stderr, "%s: Invalid data format type", __FUNCTION__);
-                               _mesa_exit(-1);
-                               break;
-               }
-
-               tmp = data_format | (attrs[i].dst_loc << R300_DST_VEC_LOC_SHIFT);
-               if (i % 2 == 0) {
-                       vir0[j] = tmp << R300_DATA_TYPE_0_SHIFT;
-                       vir1[j] = attrs[i].swizzle | (attrs[i].write_mask << R300_WRITE_ENA_SHIFT);
-               } else {
-                       vir0[j] |= tmp << R300_DATA_TYPE_1_SHIFT;
-                       vir1[j] |= (attrs[i].swizzle | (attrs[i].write_mask << R300_WRITE_ENA_SHIFT)) << R300_SWIZZLE1_SHIFT;
-                       ++j;
-               }
-       }
-
-       reg_count = (rmesa->radeon.swtcl.vertex_attr_count + 1) >> 1;
-       if (rmesa->radeon.swtcl.vertex_attr_count % 2 != 0) {
-               vir0[reg_count-1] |= R300_LAST_VEC << R300_DATA_TYPE_0_SHIFT;
-       } else {
-               vir0[reg_count-1] |= R300_LAST_VEC << R300_DATA_TYPE_1_SHIFT;
-       }
-
-       R300_STATECHANGE(rmesa, vir[0]);
-       R300_STATECHANGE(rmesa, vir[1]);
-       R300_STATECHANGE(rmesa, vof);
-       R300_STATECHANGE(rmesa, vic);
-
-       if (rmesa->radeon.radeonScreen->kernel_mm) {
-               rmesa->hw.vir[0].cmd[0] &= 0xC000FFFF;
-               rmesa->hw.vir[1].cmd[0] &= 0xC000FFFF;
-               rmesa->hw.vir[0].cmd[0] |= (reg_count & 0x3FFF) << 16;
-               rmesa->hw.vir[1].cmd[0] |= (reg_count & 0x3FFF) << 16;
-       } else {
-               ((drm_r300_cmd_header_t *) rmesa->hw.vir[0].cmd)->packet0.count = reg_count;
-               ((drm_r300_cmd_header_t *) rmesa->hw.vir[1].cmd)->packet0.count = reg_count;
-       }
-
-       rmesa->hw.vic.cmd[R300_VIC_CNTL_0] = r300VAPInputCntl0(ctx, InputsRead);
-       rmesa->hw.vic.cmd[R300_VIC_CNTL_1] = r300VAPInputCntl1(ctx, InputsRead);
-       rmesa->hw.vof.cmd[R300_VOF_CNTL_0] = r300VAPOutputCntl0(ctx, OutputsWritten);
-       /**
-         * Can't use r300VAPOutputCntl1 function because it assumes
-         * that all texture coords have 4 components and that's the case
-         * for HW TCL path, but not for SW TCL.
-         */
-       rmesa->hw.vof.cmd[R300_VOF_CNTL_1] = vap_out_fmt_1;
-}
-
-
-static void r300SetVertexFormat( GLcontext *ctx )
+void r300ChooseSwtclVertexFormat(GLcontext *ctx, GLuint *_InputsRead,  GLuint *_OutputsWritten)
 {
        r300ContextPtr rmesa = R300_CONTEXT( ctx );
        TNLcontext *tnl = TNL_CONTEXT(ctx);
        struct vertex_buffer *VB = &tnl->vb;
-       int first_free_tex = 0, vap_out_fmt_1 = 0;
+       int first_free_tex = 0;
        GLuint InputsRead = 0;
        GLuint OutputsWritten = 0;
        int num_attrs = 0;
-       struct vertex_attribute *attrs = rmesa->swtcl.vert_attrs;
+       GLuint fp_reads = rmesa->selected_fp->InputsRead;
+       struct vertex_attribute *attrs = rmesa->vbuf.attribs;
 
+       radeon_print(RADEON_SWRENDER, RADEON_VERBOSE, "%s\n", __func__);
        rmesa->swtcl.coloroffset = rmesa->swtcl.specoffset = 0;
        rmesa->radeon.swtcl.vertex_attr_count = 0;
 
+       if (RADEON_DEBUG & RADEON_VERTS)
+               fprintf(stderr, "%s\n", __func__);
+
        /* We always want non Ndc coords format */
        VB->AttribPtr[VERT_ATTRIB_POS] = VB->ClipPtr;
 
-       if (RENDERINPUTS_TEST(tnl->render_inputs_bitset, _TNL_ATTRIB_POS)) {
-               InputsRead |= 1 << VERT_ATTRIB_POS;
-               OutputsWritten |= 1 << VERT_RESULT_HPOS;
-               EMIT_ATTR( _TNL_ATTRIB_POS, EMIT_4F );
-               ADD_ATTR(VERT_ATTRIB_POS, EMIT_4F, SWTCL_OVM_POS, SWIZZLE_XYZW, MASK_XYZW);
-               rmesa->swtcl.coloroffset = 4;
-       }
+       /* Always write position vector */
+       InputsRead |= 1 << VERT_ATTRIB_POS;
+       OutputsWritten |= 1 << VERT_RESULT_HPOS;
+       EMIT_ATTR( _TNL_ATTRIB_POS, EMIT_4F );
+       ADD_ATTR(VERT_ATTRIB_POS, R300_DATA_TYPE_FLOAT_4, SWTCL_OVM_POS, SWIZZLE_XYZW, MASK_XYZW, 0);
+       rmesa->swtcl.coloroffset = 4;
 
-       if (RENDERINPUTS_TEST(tnl->render_inputs_bitset, _TNL_ATTRIB_COLOR0)) {
+       if (fp_reads & FRAG_BIT_COL0) {
                InputsRead |= 1 << VERT_ATTRIB_COLOR0;
                OutputsWritten |= 1 << VERT_RESULT_COL0;
 #if MESA_LITTLE_ENDIAN
                EMIT_ATTR( _TNL_ATTRIB_COLOR0, EMIT_4UB_4F_RGBA );
-               ADD_ATTR(VERT_ATTRIB_COLOR0, EMIT_4UB_4F_RGBA, SWTCL_OVM_COLOR0, SWIZZLE_XYZW, MASK_XYZW);
+               ADD_ATTR(VERT_ATTRIB_COLOR0, R300_DATA_TYPE_BYTE, SWTCL_OVM_COLOR0, SWIZZLE_XYZW, MASK_XYZW, 1);
 #else
                EMIT_ATTR( _TNL_ATTRIB_COLOR0, EMIT_4UB_4F_ABGR );
-               ADD_ATTR(VERT_ATTRIB_COLOR0, EMIT_4UB_4F_ABGR, SWTCL_OVM_COLOR0, SWIZZLE_XYZW, MASK_XYZW);
+               ADD_ATTR(VERT_ATTRIB_COLOR0, R300_DATA_TYPE_BYTE, SWTCL_OVM_COLOR0, SWIZZLE_XYZW, MASK_XYZW, 1);
 #endif
        }
 
-       if (RENDERINPUTS_TEST(tnl->render_inputs_bitset, _TNL_ATTRIB_COLOR1 )) {
+       if (fp_reads & FRAG_BIT_COL1) {
                GLuint swiz = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ONE);
                InputsRead |= 1 << VERT_ATTRIB_COLOR1;
                OutputsWritten |= 1 << VERT_RESULT_COL1;
 #if MESA_LITTLE_ENDIAN
                EMIT_ATTR( _TNL_ATTRIB_COLOR1, EMIT_4UB_4F_RGBA );
-               ADD_ATTR(VERT_ATTRIB_COLOR1, EMIT_4UB_4F_RGBA, SWTCL_OVM_COLOR1, swiz, MASK_XYZW);
+               ADD_ATTR(VERT_ATTRIB_COLOR1, R300_DATA_TYPE_BYTE, SWTCL_OVM_COLOR1, swiz, MASK_XYZW, 1);
 #else
                EMIT_ATTR( _TNL_ATTRIB_COLOR1, EMIT_4UB_4F_ABGR );
-               ADD_ATTR(VERT_ATTRIB_COLOR1, EMIT_4UB_4F_ABGR, SWTCL_OVM_COLOR1, swiz, MASK_XYZW);
+               ADD_ATTR(VERT_ATTRIB_COLOR1, R300_DATA_TYPE_BYTE, SWTCL_OVM_COLOR1, swiz, MASK_XYZW, 1);
 #endif
                rmesa->swtcl.specoffset = rmesa->swtcl.coloroffset + 1;
        }
 
        if (ctx->Light.Enabled && ctx->Light.Model.TwoSide) {
-               VB->AttribPtr[VERT_ATTRIB_GENERIC0] = VB->ColorPtr[1];
+               VB->AttribPtr[VERT_ATTRIB_GENERIC0] = VB->BackfaceColorPtr;
                OutputsWritten |= 1 << VERT_RESULT_BFC0;
 #if MESA_LITTLE_ENDIAN
                EMIT_ATTR( _TNL_ATTRIB_GENERIC0, EMIT_4UB_4F_RGBA );
-               ADD_ATTR(VERT_ATTRIB_GENERIC0, EMIT_4UB_4F_RGBA, SWTCL_OVM_COLOR2, SWIZZLE_XYZW, MASK_XYZW);
+               ADD_ATTR(VERT_ATTRIB_GENERIC0, R300_DATA_TYPE_BYTE, SWTCL_OVM_COLOR2, SWIZZLE_XYZW, MASK_XYZW, 1);
 #else
                EMIT_ATTR( _TNL_ATTRIB_GENERIC0, EMIT_4UB_4F_ABGR );
-               ADD_ATTR(VERT_ATTRIB_GENERIC0, EMIT_4UB_4F_ABGR, SWTCL_OVM_COLOR2, SWIZZLE_XYZW, MASK_XYZW);
+               ADD_ATTR(VERT_ATTRIB_GENERIC0, R300_DATA_TYPE_BYTE, SWTCL_OVM_COLOR2, SWIZZLE_XYZW, MASK_XYZW, 1);
 #endif
-               if (RENDERINPUTS_TEST(tnl->render_inputs_bitset, _TNL_ATTRIB_COLOR1 )) {
+               if (fp_reads & FRAG_BIT_COL1) {
+                       VB->AttribPtr[VERT_ATTRIB_GENERIC1] = VB->BackfaceSecondaryColorPtr;
                        GLuint swiz = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ONE);
                        OutputsWritten |= 1 << VERT_RESULT_BFC1;
 #if MESA_LITTLE_ENDIAN
                        EMIT_ATTR( _TNL_ATTRIB_GENERIC1, EMIT_4UB_4F_RGBA );
-                       ADD_ATTR(VERT_ATTRIB_GENERIC1, EMIT_4UB_4F_RGBA, SWTCL_OVM_COLOR3, swiz, MASK_XYZW);
+                       ADD_ATTR(VERT_ATTRIB_GENERIC1, R300_DATA_TYPE_BYTE, SWTCL_OVM_COLOR3, swiz, MASK_XYZW, 1);
 #else
                        EMIT_ATTR( _TNL_ATTRIB_GENERIC1, EMIT_4UB_4F_ABGR );
-                       ADD_ATTR(VERT_ATTRIB_GENERIC1, EMIT_4UB_4F_ABGR, SWTCL_OVM_COLOR3, swiz, MASK_XYZW);
+                       ADD_ATTR(VERT_ATTRIB_GENERIC1, R300_DATA_TYPE_BYTE, SWTCL_OVM_COLOR3, swiz, MASK_XYZW, 1);
 #endif
                }
        }
@@ -224,33 +152,53 @@ static void r300SetVertexFormat( GLcontext *ctx )
                InputsRead |= 1 << VERT_ATTRIB_POINT_SIZE;
                OutputsWritten |= 1 << VERT_RESULT_PSIZ;
                EMIT_ATTR( _TNL_ATTRIB_POINTSIZE, EMIT_1F );
-               ADD_ATTR(VERT_ATTRIB_POINT_SIZE, EMIT_1F, SWTCL_OVM_POINT_SIZE, swiz, MASK_X);
+               ADD_ATTR(VERT_ATTRIB_POINT_SIZE, R300_DATA_TYPE_FLOAT_1, SWTCL_OVM_POINT_SIZE, swiz, MASK_X, 0);
+       }
+
+       if (rmesa->selected_fp->wpos_attr != FRAG_ATTRIB_MAX) {
+               int tex_id = rmesa->selected_fp->wpos_attr - FRAG_ATTRIB_TEX0;
+
+               VB->AttribPtr[VERT_ATTRIB_TEX0 + tex_id] = VB->AttribPtr[VERT_ATTRIB_POS];
+               VB->AttribPtr[_TNL_ATTRIB_TEX0 + tex_id] = VB->AttribPtr[VERT_ATTRIB_POS];
+               RENDERINPUTS_SET(tnl->render_inputs_bitset, _TNL_ATTRIB_TEX0 + tex_id);
+       }
+
+       if (rmesa->selected_fp->fog_attr != FRAG_ATTRIB_MAX) {
+               int tex_id = rmesa->selected_fp->fog_attr - FRAG_ATTRIB_TEX0;
+
+               VB->AttribPtr[VERT_ATTRIB_TEX0 + tex_id] = VB->AttribPtr[VERT_ATTRIB_FOG];
+               VB->AttribPtr[_TNL_ATTRIB_TEX0 + tex_id] = VB->AttribPtr[VERT_ATTRIB_FOG];
+               RENDERINPUTS_SET(tnl->render_inputs_bitset, _TNL_ATTRIB_TEX0 + tex_id);
        }
 
        /**
         *  Sending only one texcoord component may lead to lock up,
         *  so for all textures always output 4 texcoord components to RS.
         */
-       if (RENDERINPUTS_TEST_RANGE(tnl->render_inputs_bitset, _TNL_FIRST_TEX, _TNL_LAST_TEX )) {
+       {
                int i;
-               GLuint swiz, format;
+               GLuint swiz, format, hw_format;
                for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
-                       if (RENDERINPUTS_TEST(tnl->render_inputs_bitset, _TNL_ATTRIB_TEX(i) )) {
-                               switch (VB->TexCoordPtr[i]->size) {
+                       if (fp_reads & FRAG_BIT_TEX(i)) {
+                               switch (VB->AttribPtr[_TNL_ATTRIB_TEX0 + i]->size) {
                                        case 1:
                                                format = EMIT_1F;
+                                               hw_format = R300_DATA_TYPE_FLOAT_1;
                                                swiz = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ONE);
                                                break;
                                        case 2:
                                                format = EMIT_2F;
+                                               hw_format = R300_DATA_TYPE_FLOAT_2;
                                                swiz = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_ZERO, SWIZZLE_ONE);
                                                break;
                                        case 3:
                                                format = EMIT_3F;
+                                               hw_format = R300_DATA_TYPE_FLOAT_3;
                                                swiz = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ONE);
                                                break;
                                        case 4:
                                                format = EMIT_4F;
+                                               hw_format = R300_DATA_TYPE_FLOAT_4;
                                                swiz = SWIZZLE_XYZW;
                                                break;
                                        default:
@@ -259,44 +207,33 @@ static void r300SetVertexFormat( GLcontext *ctx )
                                InputsRead |= 1 << (VERT_ATTRIB_TEX0 + i);
                                OutputsWritten |= 1 << (VERT_RESULT_TEX0 + i);
                                EMIT_ATTR(_TNL_ATTRIB_TEX(i), format);
-                               ADD_ATTR(VERT_ATTRIB_TEX0 + i, format, SWTCL_OVM_TEX(i), swiz, MASK_XYZW);
-                               vap_out_fmt_1 |= 4 << (i * 3);
+                               ADD_ATTR(VERT_ATTRIB_TEX0 + i, hw_format, SWTCL_OVM_TEX(first_free_tex), swiz, MASK_XYZW, 0);
                                ++first_free_tex;
                        }
                }
        }
 
-       /* RS can't put fragment position on the pixel stack, so stuff it in texcoord if needed */
-       if (RENDERINPUTS_TEST(tnl->render_inputs_bitset, _TNL_ATTRIB_POS) && (ctx->FragmentProgram._Current->Base.InputsRead & FRAG_BIT_WPOS)) {
-               if (first_free_tex >= ctx->Const.MaxTextureUnits) {
-                       fprintf(stderr, "\tout of free texcoords to write w pos\n");
-                       _mesa_exit(-1);
-               }
-
-               InputsRead |= 1 << (VERT_ATTRIB_TEX0 + first_free_tex);
-               OutputsWritten |= 1 << (VERT_RESULT_TEX0 + first_free_tex);
-               EMIT_ATTR( _TNL_ATTRIB_POS, EMIT_4F );
-               ADD_ATTR(VERT_ATTRIB_POS, EMIT_4F, SWTCL_OVM_TEX(first_free_tex), SWIZZLE_XYZW, MASK_XYZW);
-               vap_out_fmt_1 |= 4 << (first_free_tex * 3);
-               ++first_free_tex;
+       if (first_free_tex >= ctx->Const.MaxTextureUnits) {
+               fprintf(stderr, "\tout of free texcoords to write fog coordinate\n");
+               _mesa_exit(-1);
        }
 
-       if (RENDERINPUTS_TEST(tnl->render_inputs_bitset, _TNL_ATTRIB_FOG)) {
-               if (first_free_tex >= ctx->Const.MaxTextureUnits) {
-                       fprintf(stderr, "\tout of free texcoords to write fog coordinate\n");
-                       _mesa_exit(-1);
-               }
+       R300_NEWPRIM(rmesa);
+       rmesa->vbuf.num_attribs = num_attrs;
+       *_InputsRead = InputsRead;
+       *_OutputsWritten = OutputsWritten;
 
-               InputsRead |= 1 << VERT_ATTRIB_FOG;
-               OutputsWritten |= 1 << VERT_RESULT_FOGC;
-               GLuint swiz = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ZERO);
-               EMIT_ATTR( _TNL_ATTRIB_FOG, EMIT_1F );
-               ADD_ATTR(VERT_ATTRIB_FOG, EMIT_1F, SWTCL_OVM_TEX(first_free_tex), swiz, MASK_X);
-               vap_out_fmt_1 |=  1 << (first_free_tex * 3);
-       }
+       RENDERINPUTS_COPY(rmesa->render_inputs_bitset, tnl->render_inputs_bitset);
+}
 
-       R300_NEWPRIM(rmesa);
-       r300SwtclVAPSetup(ctx, InputsRead, OutputsWritten, vap_out_fmt_1);
+static void r300PrepareVertices(GLcontext *ctx)
+{
+       r300ContextPtr rmesa = R300_CONTEXT(ctx);
+       GLuint InputsRead, OutputsWritten;
+       radeon_print(RADEON_SWRENDER, RADEON_TRACE, "%s\n", __func__);
+
+       r300ChooseSwtclVertexFormat(ctx, &InputsRead, &OutputsWritten);
+       r300SetupVAP(ctx, InputsRead, OutputsWritten);
 
        rmesa->radeon.swtcl.vertex_size =
                _tnl_install_attrs( ctx,
@@ -305,8 +242,33 @@ static void r300SetVertexFormat( GLcontext *ctx )
                                    NULL, 0 );
 
        rmesa->radeon.swtcl.vertex_size /= 4;
+}
 
-       RENDERINPUTS_COPY(rmesa->render_inputs_bitset, tnl->render_inputs_bitset);
+static void r300_predict_emit_size( r300ContextPtr rmesa ) /* compute swtcl.emit_prediction; does nothing while it is already non-zero */
+{
+       if (!rmesa->radeon.swtcl.emit_prediction) {
+               const int vertex_size = 7; /* same count as BEGIN_BATCH(7) in r300EmitVertexAOS */
+               const int prim_size = 3; /* presumably the batch size of r300EmitVbufPrim -- TODO confirm */
+               const int cache_flush_size = 4; /* counted twice below: r300_swtcl_flush calls r300EmitCacheFlush before and after drawing */
+               const int pre_emit_state = 4; /* NOTE(review): origin of this value is not visible here -- confirm */
+               const int scissor_size = 3; /* presumably the size emitted by r300_emit_scissor -- TODO confirm */
+               const int state_size = radeonCountStateEmitSize(&rmesa->radeon);
+
+               if (rcommonEnsureCmdBufSpace(&rmesa->radeon, /* non-zero result: state size is counted again (presumably the buffer was flushed -- confirm) */
+                                       state_size + pre_emit_state + scissor_size
+                                       + vertex_size + prim_size + cache_flush_size * 2,
+                                       __FUNCTION__))
+                       rmesa->radeon.swtcl.emit_prediction = radeonCountStateEmitSize(&rmesa->radeon);
+               else
+                       rmesa->radeon.swtcl.emit_prediction = state_size;
+
+               rmesa->radeon.swtcl.emit_prediction += rmesa->radeon.cmdbuf.cs->cdw /* add the current cs->cdw: r300_swtcl_flush compares the prediction against cs->cdw directly */
+                       + vertex_size + scissor_size + prim_size + cache_flush_size * 2 + pre_emit_state;
+               radeon_print(RADEON_SWRENDER, RADEON_VERBOSE, /* logs the predicted total without the state-size term */
+                               "%s, size %d\n",
+                               __func__, rmesa->radeon.cmdbuf.cs->cdw
+                               + vertex_size + scissor_size + prim_size + cache_flush_size * 2 + pre_emit_state);
+       }
 }
 
 
@@ -342,11 +304,21 @@ static void r300RasterPrimitive( GLcontext *ctx, GLuint prim );
 #define HAVE_POLYGONS    1
 #define HAVE_ELTS        1
 
+static void* r300_alloc_verts(r300ContextPtr rmesa, GLuint n, GLuint size) /* allocate room for n vertices via rcommonAllocDmaLowVerts; never returns NULL */
+{
+       void *rv;
+       do { /* retries for as long as the allocator returns NULL; there is no retry limit */
+               r300_predict_emit_size( rmesa ); /* refreshed before every attempt; a no-op while a prediction is already set */
+               rv = rcommonAllocDmaLowVerts( &rmesa->radeon, n, size * 4 ); /* presumably size is in dwords and *4 converts to bytes -- TODO confirm */
+       } while (!rv);
+       return rv;
+}
+
 #undef LOCAL_VARS
 #undef ALLOC_VERTS
 #define CTX_ARG r300ContextPtr rmesa
 #define GET_VERTEX_DWORDS() rmesa->radeon.swtcl.vertex_size
-#define ALLOC_VERTS( n, size ) rcommonAllocDmaLowVerts( &rmesa->radeon, n, size * 4 )
+#define ALLOC_VERTS( n, size ) r300_alloc_verts(rmesa, n, size) /* expression macro: no trailing ';' -- the use site terminates the statement */
 #define LOCAL_VARS                                             \
    r300ContextPtr rmesa = R300_CONTEXT(ctx);           \
    const char *r300verts = (char *)rmesa->radeon.swtcl.verts;
@@ -532,6 +504,7 @@ static void r300ChooseRenderState( GLcontext *ctx )
        r300ContextPtr rmesa = R300_CONTEXT(ctx);
        GLuint index = 0;
        GLuint flags = ctx->_TriangleCaps;
+       radeon_print(RADEON_SWRENDER, RADEON_VERBOSE, "%s\n", __func__);
 
        if (flags & DD_TRI_UNFILLED)      index |= R300_UNFILLED_BIT;
 
@@ -556,20 +529,21 @@ static void r300ChooseRenderState( GLcontext *ctx )
        }
 }
 
-
 void r300RenderStart(GLcontext *ctx)
 {
+       radeon_print(RADEON_SWRENDER, RADEON_VERBOSE, "%s\n", __func__);
        r300ContextPtr rmesa = R300_CONTEXT( ctx );
 
        r300ChooseRenderState(ctx);
-       r300SetVertexFormat(ctx);
+
+       r300UpdateShaders(rmesa);
+
+       r300PrepareVertices(ctx);
 
        r300ValidateBuffers(ctx);
 
-       r300UpdateShaders(rmesa);
        r300UpdateShaderStates(rmesa);
 
-       r300EmitCacheFlush(rmesa);
 
        /* investigate if we can put back flush optimisation if needed */
        if (rmesa->radeon.dma.flush != NULL) {
@@ -584,6 +558,7 @@ void r300RenderFinish(GLcontext *ctx)
 static void r300RasterPrimitive( GLcontext *ctx, GLuint hwprim )
 {
        r300ContextPtr rmesa = R300_CONTEXT(ctx);
+       radeon_print(RADEON_SWRENDER, RADEON_TRACE, "%s\n", __func__);
 
        if (rmesa->radeon.swtcl.hw_primitive != hwprim) {
                R300_NEWPRIM( rmesa );
@@ -596,6 +571,7 @@ void r300RenderPrimitive(GLcontext *ctx, GLenum prim)
 
        r300ContextPtr rmesa = R300_CONTEXT(ctx);
        rmesa->radeon.swtcl.render_primitive = prim;
+       radeon_print(RADEON_SWRENDER, RADEON_TRACE, "%s\n", __func__);
 
        if ((prim == GL_TRIANGLES) && (ctx->_TriangleCaps & DD_TRI_UNFILLED))
                return;
@@ -605,6 +581,8 @@ void r300RenderPrimitive(GLcontext *ctx, GLenum prim)
 
 void r300ResetLineStipple(GLcontext *ctx)
 {
+       if (RADEON_DEBUG & RADEON_VERTS) /* the body only traces the call when RADEON_VERTS debugging is on; ctx is unused */
+               fprintf(stderr, "%s\n", __func__);
 }
 
 void r300InitSwtcl(GLcontext *ctx)
@@ -612,11 +590,13 @@ void r300InitSwtcl(GLcontext *ctx)
        TNLcontext *tnl = TNL_CONTEXT(ctx);
        r300ContextPtr rmesa = R300_CONTEXT(ctx);
        static int firsttime = 1;
+       radeon_print(RADEON_SWRENDER, RADEON_NORMAL, "%s\n", __func__);
 
        if (firsttime) {
                init_rast_tab();
                firsttime = 0;
        }
+       rmesa->radeon.swtcl.emit_prediction = 0;
 
        tnl->Driver.Render.Start = r300RenderStart;
        tnl->Driver.Render.Finish = r300RenderFinish;
@@ -649,8 +629,8 @@ static void r300EmitVertexAOS(r300ContextPtr rmesa, GLuint vertex_size, struct r
 {
        BATCH_LOCALS(&rmesa->radeon);
 
-       if (RADEON_DEBUG & DEBUG_VERTS)
-               fprintf(stderr, "%s:  vertex_size %d, offset 0x%x \n",
+       radeon_print(RADEON_SWRENDER, RADEON_TRACE,
+               "%s:  vertex_size %d, offset 0x%x \n",
                        __FUNCTION__, vertex_size, offset);
 
        BEGIN_BATCH(7);
@@ -665,6 +645,8 @@ static void r300EmitVbufPrim(r300ContextPtr rmesa, GLuint primitive, GLuint vert
 {
        BATCH_LOCALS(&rmesa->radeon);
        int type, num_verts;
+       if (RADEON_DEBUG & RADEON_VERTS)
+               fprintf(stderr, "%s\n", __func__);
 
        type = r300PrimitiveType(rmesa, primitive);
        num_verts = r300NumVerts(rmesa, vertex_nr, primitive);
@@ -677,20 +659,26 @@ static void r300EmitVbufPrim(r300ContextPtr rmesa, GLuint primitive, GLuint vert
 
 void r300_swtcl_flush(GLcontext *ctx, uint32_t current_offset)
 {
+       radeon_print(RADEON_SWRENDER, RADEON_TRACE, "%s\n", __func__);
        r300ContextPtr rmesa = R300_CONTEXT(ctx);
 
-       rcommonEnsureCmdBufSpace(&rmesa->radeon,
-                          rmesa->radeon.hw.max_state_size + (12*sizeof(int)),
-                          __FUNCTION__);
+       r300EmitCacheFlush(rmesa);
+
        radeonEmitState(&rmesa->radeon);
+       r300_emit_scissor(ctx);
        r300EmitVertexAOS(rmesa,
-                       rmesa->radeon.swtcl.vertex_size,
-                       rmesa->radeon.dma.current,
-                       current_offset);
+                         rmesa->radeon.swtcl.vertex_size,
+                         rmesa->radeon.swtcl.bo,
+                         current_offset);
 
        r300EmitVbufPrim(rmesa,
                   rmesa->radeon.swtcl.hw_primitive,
                   rmesa->radeon.swtcl.numverts);
        r300EmitCacheFlush(rmesa);
+       if ( rmesa->radeon.swtcl.emit_prediction < rmesa->radeon.cmdbuf.cs->cdw )
+               WARN_ONCE("Rendering was %d commands larger than predicted size."
+                       " We might overflow  command buffer.\n",
+                       rmesa->radeon.cmdbuf.cs->cdw - rmesa->radeon.swtcl.emit_prediction );
+       rmesa->radeon.swtcl.emit_prediction = 0;
        COMMIT_BATCH();
 }