From: Keith Whitwell <keith@tungstengraphics.com>
Date: Thu, 17 Apr 2008 22:44:32 +0000 (+0100)
Subject: draw: split off all the extra functionality in the vertex shader
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=a773f06e969a3992451dd7fe6fd55ea96b2774fa;p=mesa.git

draw: split off all the extra functionality in the vertex shader

This will at least allow us to make the initial gains to get decent
vertex performance much more quickly & with higher confidence of getting
it right.

At some later point we can look again at code-generating all the
fetch/cliptest/viewport extras in the same block as the vertex shader.
For now, we just need to get some decent baseline performance.
---

diff --git a/src/gallium/auxiliary/draw/Makefile b/src/gallium/auxiliary/draw/Makefile
index 836e98f086a..154c8a99b57 100644
--- a/src/gallium/auxiliary/draw/Makefile
+++ b/src/gallium/auxiliary/draw/Makefile
@@ -20,8 +20,10 @@ C_SOURCES = \
 	draw_pt_fetch_emit.c \
 	draw_pt_fetch_pipeline.c \
 	draw_pt_fetch_shade_pipeline.c \
-	draw_pt_pipeline.c \
+	draw_pt_fetch.c \
+	draw_pt_post_vs.c \
 	draw_pt_emit.c \
+	draw_pt_pipeline.c \
 	draw_pt_elts.c \
 	draw_prim.c \
 	draw_pstipple.c \
diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c
index 9b2dcc0b572..4838b68ed17 100644
--- a/src/gallium/auxiliary/draw/draw_context.c
+++ b/src/gallium/auxiliary/draw/draw_context.c
@@ -110,6 +110,12 @@ struct draw_context *draw_create( void )
 
    tgsi_exec_machine_init(&draw->machine);
 
+   /* FIXME: give this machine thing a proper constructor.  Until then the
+    * input/output vectors are allocated here; bail out if either fails. */
+   draw->machine.Inputs = align_malloc(PIPE_MAX_ATTRIBS * sizeof(struct tgsi_exec_vector), 16);
+   draw->machine.Outputs = align_malloc(PIPE_MAX_ATTRIBS * sizeof(struct tgsi_exec_vector), 16);
+   if (!draw->machine.Inputs || !draw->machine.Outputs)
+      goto fail;
    if (!draw_pt_init( draw ))
       goto fail;
 
@@ -155,8 +161,13 @@ void draw_destroy( struct draw_context *draw )
    if (draw->pipeline.rasterize)
       draw->pipeline.rasterize->destroy( draw->pipeline.rasterize );
 
+   if (draw->machine.Inputs)
+      align_free(draw->machine.Inputs);
+   if (draw->machine.Outputs)
+      align_free(draw->machine.Outputs);
    tgsi_exec_machine_free_data(&draw->machine);
-   
+
+
    if (draw->vs.vertex_cache)
       align_free( draw->vs.vertex_cache ); /* Frees all the vertices. */
 
@@ -265,6 +276,7 @@ draw_set_vertex_elements(struct draw_context *draw,
    draw_do_flush( draw, DRAW_FLUSH_VERTEX_CACHE/*STATE_CHANGE*/ );
 
    memcpy(draw->vertex_element, elements, count * sizeof(elements[0]));
+   draw->nr_vertex_elements = count;
 }
 
 
@@ -463,15 +475,3 @@ boolean draw_get_edgeflag( struct draw_context *draw,
       return 1;
 }
 
-
-#if 0
-/* Crufty init function.  Fix me.
- */
-boolean draw_init_machine( struct draw_context *draw )
-{
-   ALIGN16_DECL(struct tgsi_exec_vector, inputs, PIPE_MAX_ATTRIBS);
-   ALIGN16_DECL(struct tgsi_exec_vector, outputs, PIPE_MAX_ATTRIBS);
-   machine->Inputs = ALIGN16_ASSIGN(inputs);
-   machine->Outputs = ALIGN16_ASSIGN(outputs);
-}
-#endif
diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h
index 9407217fd33..da94e697811 100644
--- a/src/gallium/auxiliary/draw/draw_private.h
+++ b/src/gallium/auxiliary/draw/draw_private.h
@@ -224,6 +224,8 @@ struct draw_context
    unsigned nr_vertex_buffers;
 
    struct pipe_vertex_element vertex_element[PIPE_MAX_ATTRIBS];
+   unsigned nr_vertex_elements;
+
    struct draw_vertex_shader *vertex_shader;
 
    boolean identity_viewport;
diff --git a/src/gallium/auxiliary/draw/draw_pt.h b/src/gallium/auxiliary/draw/draw_pt.h
index 31d18ec62be..316289969bf 100644
--- a/src/gallium/auxiliary/draw/draw_pt.h
+++ b/src/gallium/auxiliary/draw/draw_pt.h
@@ -112,6 +112,7 @@ struct draw_pt_middle_end {
  * mode...  
  */
 struct vbuf_render;
+struct vertex_header;
 
 
 /* Helper functions.
@@ -132,25 +133,25 @@ struct draw_pt_middle_end *draw_pt_fetch_pipeline_or_emit(struct draw_context *d
  */
 void draw_pt_run_pipeline( struct draw_context *draw,
                            unsigned prim,
-                           char *verts,
-                           unsigned vertex_stride,
+                           struct vertex_header *verts,
                            unsigned vertex_count,
+                           unsigned vertex_stride,
                            const ushort *elts,
                            unsigned count );
 
 
-/* HW vertex emit:
+/*******************************************************************************
+ * HW vertex emit:
  */
 struct pt_emit;
 
 void draw_pt_emit_prepare( struct pt_emit *emit,
-			   unsigned prim,
-			   unsigned opt );
+			   unsigned prim );
 
 void draw_pt_emit( struct pt_emit *emit,
-		   char *verts,
-		   unsigned stride,
+		   const float (*vertex_data)[4],
 		   unsigned vertex_count,
+		   unsigned stride,
 		   const ushort *elts,
 		   unsigned count );
 
@@ -159,6 +160,42 @@ void draw_pt_emit_destroy( struct pt_emit *emit );
 struct pt_emit *draw_pt_emit_create( struct draw_context *draw );
 
 
+/*******************************************************************************
+ * API vertex fetch:
+ */
+
+struct pt_fetch;
+void draw_pt_fetch_prepare( struct pt_fetch *fetch,
+			    boolean emit_header,
+			    unsigned vertex_size );
+
+void draw_pt_fetch_run( struct pt_fetch *fetch,
+			const unsigned *elts,
+			unsigned count,
+			char *verts );
+
+void draw_pt_fetch_destroy( struct pt_fetch *fetch );
+
+struct pt_fetch *draw_pt_fetch_create( struct draw_context *draw );
+
+/*******************************************************************************
+ * Post-VS: cliptest, rhw, viewport
+ */
+struct pt_post_vs;
+
+boolean draw_pt_post_vs_run( struct pt_post_vs *pvs,
+			     struct vertex_header *pipeline_verts,
+			     unsigned count,     /* number of vertices to process */
+			     unsigned stride );  /* bytes from one vertex to the next */
+
+void draw_pt_post_vs_prepare( struct pt_post_vs *pvs,
+			      boolean bypass_clipping,
+			      boolean identity_viewport,
+			      boolean opengl );
+
+struct pt_post_vs *draw_pt_post_vs_create( struct draw_context *draw );
+
+void draw_pt_post_vs_destroy( struct pt_post_vs *pvs );
 
 
 #endif
diff --git a/src/gallium/auxiliary/draw/draw_pt_emit.c b/src/gallium/auxiliary/draw/draw_pt_emit.c
index e9ed29450af..ef9db70a027 100644
--- a/src/gallium/auxiliary/draw/draw_pt_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_emit.c
@@ -38,16 +38,11 @@ struct pt_emit {
    struct draw_context *draw;
 
    struct translate *translate;
-
-   unsigned pipeline_vertex_size;
-   unsigned prim;
-   unsigned opt;
 };
 
 
 void draw_pt_emit_prepare( struct pt_emit *emit,
-			   unsigned prim,
-			   unsigned opt )
+			   unsigned prim )
 {
    struct draw_context *draw = emit->draw;
    const struct vertex_info *vinfo;
@@ -75,8 +70,7 @@ void draw_pt_emit_prepare( struct pt_emit *emit,
       unsigned emit_sz = 0;
       unsigned src_buffer = 0;
       unsigned output_format;
-      unsigned src_offset = (sizeof(struct vertex_header) + 
-			     vinfo->src_index[i] * 4 * sizeof(float) );
+      unsigned src_offset = (vinfo->src_index[i] * 4 * sizeof(float) );
 
 
          
@@ -139,9 +133,9 @@ void draw_pt_emit_prepare( struct pt_emit *emit,
 
 
 void draw_pt_emit( struct pt_emit *emit,
-		   char *verts,
-		   unsigned stride,
+		   const float (*vertex_data)[4],
 		   unsigned vertex_count,
+		   unsigned stride,
 		   const ushort *elts,
 		   unsigned count )
 {
@@ -164,7 +158,7 @@ void draw_pt_emit( struct pt_emit *emit,
 
    translate->set_buffer(translate, 
 			 0, 
-			 verts,
+			 vertex_data,
 			 stride );
 
    translate->set_buffer(translate, 
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch.c b/src/gallium/auxiliary/draw/draw_pt_fetch.c
new file mode 100644
index 00000000000..a7553023b81
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch.c
@@ -0,0 +1,175 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "pipe/p_util.h"
+#include "draw/draw_context.h"
+#include "draw/draw_private.h"
+#include "draw/draw_vbuf.h"
+#include "draw/draw_vertex.h"
+#include "draw/draw_pt.h"
+#include "translate/translate.h"
+
+
+struct pt_fetch {
+   struct draw_context *draw;     /* set once by draw_pt_fetch_create() */
+
+   struct translate *translate;   /* built by draw_pt_fetch_prepare(), rebuilt when the key changes */
+   
+   unsigned vertex_size;          /* last vertex_size passed to draw_pt_fetch_prepare() */
+};
+
+
+
+/* Build the translate key that fetches API vertex elements & vertex
+ * buffers into a contiguous set of float[4] attributes, as required
+ * for the vertex_shader->run_linear() method.  emit_header reserves a
+ * leading header slot; vertex_size is the output stride in bytes.
+ *
+ * This is used in all cases except pure passthrough
+ * (draw_pt_fetch_emit.c) which has its own version to translate
+ * directly to hw vertices.
+ */
+void draw_pt_fetch_prepare( struct pt_fetch *fetch,
+			    boolean emit_header,
+			    unsigned vertex_size )
+{
+   struct draw_context *draw = fetch->draw;
+   unsigned i, nr = 0;
+   unsigned dst_offset = 0;
+   struct translate_key key;
+
+   fetch->vertex_size = vertex_size;
+
+   memset(&key, 0, sizeof(key));
+
+   /* If emit_header is set, the caller wants room for a vertex header
+    * in front of the attributes, meaning that we need to emit/leave
+    * space for it here before laying out the float[4] data.
+    *
+    * It's worth considering whether the vertex headers should contain
+    * a pointer to the 'data', rather than having it inline.
+    * Something to look at after we've fully switched over to the pt
+    * paths.
+    */
+   if (emit_header)
+   {
+      /* Copy one 32-bit word from an extra source buffer (index
+       * nr_vertex_buffers, bound below to a static header, stride 0). */
+      key.element[nr].input_format = PIPE_FORMAT_R32_FLOAT;
+      key.element[nr].input_buffer = draw->nr_vertex_buffers;
+      key.element[nr].input_offset = 0;
+      key.element[nr].output_format = PIPE_FORMAT_R32_FLOAT;
+      key.element[nr].output_offset = dst_offset;
+      dst_offset += 1 * sizeof(float);
+      nr++;
+
+
+      /* Just leave the clip[] array untouched.
+       */
+      dst_offset += 4 * sizeof(float);
+   }
+      
+
+   for (i = 0; i < draw->nr_vertex_elements; i++) {
+      key.element[nr].input_format = draw->vertex_element[i].src_format;
+      key.element[nr].input_buffer = draw->vertex_element[i].vertex_buffer_index;
+      key.element[nr].input_offset = draw->vertex_element[i].src_offset;
+      key.element[nr].output_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
+      key.element[nr].output_offset = dst_offset;
+
+      dst_offset += 4 * sizeof(float);
+      nr++;
+   }
+
+   assert(dst_offset <= vertex_size);
+
+   key.nr_elements = nr;
+   key.output_stride = vertex_size;
+
+
+   /* Don't bother with caching at this stage; just rebuild on key change.
+    * NOTE(review): translate_generic_create() result is used unchecked. */
+   if (!fetch->translate ||
+       memcmp(&fetch->translate->key, &key, sizeof(key)) != 0) 
+   {
+      if (fetch->translate)
+	 fetch->translate->release(fetch->translate);
+
+      fetch->translate = translate_generic_create( &key );
+
+      if (emit_header) {
+	 static struct vertex_header vh = { 0, 0, 0, 0xffff };
+	 fetch->translate->set_buffer(fetch->translate, 
+				      draw->nr_vertex_buffers, 
+				      &vh,
+				      0);
+      }
+   }
+}
+
+
+
+
+void draw_pt_fetch_run( struct pt_fetch *fetch,
+			const unsigned *elts,
+			unsigned count,
+			char *verts )
+{
+   /* Point the translate object at each API vertex buffer, then gather
+    * the count vertices selected by elts[] into verts.
+    */
+   struct draw_context *ctx = fetch->draw;
+   struct translate *xlate = fetch->translate;
+   unsigned buf;
+
+   for (buf = 0; buf < ctx->nr_vertex_buffers; buf++) {
+      char *base = (char *)ctx->user.vbuffer[buf];
+
+      xlate->set_buffer(xlate,
+			buf,
+			base + ctx->vertex_buffer[buf].buffer_offset,
+			ctx->vertex_buffer[buf].pitch);
+   }
+   xlate->run_elts(xlate, elts, count, verts);
+}
+
+
+struct pt_fetch *draw_pt_fetch_create( struct draw_context *draw )
+{
+   /* Allocate a fetch helper bound to draw; NULL if allocation fails. */
+   struct pt_fetch *pf = CALLOC_STRUCT(pt_fetch);
+
+   if (pf != NULL)
+      pf->draw = draw;
+   return pf;
+}
+
+void draw_pt_fetch_destroy( struct pt_fetch *fetch )
+{
+   /* Release the translate object built by draw_pt_fetch_prepare(), if any. */
+   if (fetch->translate) fetch->translate->release(fetch->translate);
+   FREE(fetch); }
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_pipeline.c b/src/gallium/auxiliary/draw/draw_pt_fetch_pipeline.c
index 79548d41563..26d0b372867 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_pipeline.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_pipeline.c
@@ -286,9 +286,9 @@ static void fetch_pipeline_run( struct draw_pt_middle_end *middle,
     */
    draw_pt_run_pipeline( fpme->draw,
                          fpme->prim,
-                         pipeline_verts,
-                         fpme->pipeline_vertex_size,
+                         (struct vertex_header *)pipeline_verts,
                          fetch_count,
+                         fpme->pipeline_vertex_size,
                          draw_elts,
                          draw_count );
                  
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
index 8db9e31e2dd..0b9e8d15ba9 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
@@ -39,8 +39,11 @@ struct fetch_pipeline_middle_end {
    struct draw_context *draw;
 
    struct pt_emit *emit;
+   struct pt_fetch *fetch;
+   struct pt_post_vs *post_vs;
 
-   unsigned pipeline_vertex_size;
+   unsigned vertex_data_offset;
+   unsigned vertex_size;
    unsigned prim;
    unsigned opt;
 };
@@ -51,15 +54,43 @@ static void fetch_pipeline_prepare( struct draw_pt_middle_end *middle,
 				    unsigned opt )
 {
    struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle;
+   struct draw_context *draw = fpme->draw;
+   struct draw_vertex_shader *vs = draw->vertex_shader;
+   unsigned nr = MAX2( vs->info.num_inputs,
+		       vs->info.num_outputs );
 
    fpme->prim = prim;
    fpme->opt = opt;
 
+   /* Always leave room for the vertex header whether we need it or
+    * not.  It's hard to get rid of it in particular because of the
+    * viewport code in draw_pt_post_vs.c.  
+    */
+   fpme->vertex_size = sizeof(struct vertex_header) + nr * 4 * sizeof(float);
+
+   
+
+   draw_pt_fetch_prepare( fpme->fetch, 
+			  (opt & (PT_CLIPTEST | PT_PIPELINE)) != 0,
+			  fpme->vertex_size );
+
+   /* XXX: it's not really gl rasterization rules we care about here,
+    * but gl vs dx9 clip spaces.
+    */
+   draw_pt_post_vs_prepare( fpme->post_vs,
+			    draw->rasterizer->bypass_clipping,
+			    draw->identity_viewport,
+			    draw->rasterizer->gl_rasterization_rules );
+			    
+
    if (!(opt & PT_PIPELINE)) 
-      draw_pt_emit_prepare( fpme->emit, prim, opt );
+      draw_pt_emit_prepare( fpme->emit, 
+			    prim );
+
+   /* Prepare the shader here, once, rather than on every run.
+    */
+   vs->prepare(vs, draw);
 
-   //fpme->pipeline_vertex_size = sizeof(struct vertex_header) + nr * 4 * sizeof(float);
-   fpme->pipeline_vertex_size = MAX_VERTEX_ALLOCATION;
 }
 
 
@@ -74,44 +105,63 @@ static void fetch_pipeline_run( struct draw_pt_middle_end *middle,
    struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle;
    struct draw_context *draw = fpme->draw;
    struct draw_vertex_shader *shader = draw->vertex_shader;
-   char *pipeline_verts;
-   unsigned pipeline = PT_PIPELINE;
+   unsigned opt = fpme->opt;
 
-   pipeline_verts = MALLOC(fpme->pipeline_vertex_size *
-			   fetch_count);
+   struct vertex_header *pipeline_verts = 
+      (struct vertex_header *)MALLOC(fpme->vertex_size * fetch_count);
 
    if (!pipeline_verts) {
       assert(0);
       return;
    }
 
-
-   /* Shade
+   /* Fetch into our vertex buffer
     */
-   shader->prepare(shader, draw);
-
-   if (shader->run(shader, draw, fetch_elts, fetch_count, pipeline_verts,
-		   fpme->pipeline_vertex_size))
+   draw_pt_fetch_run( fpme->fetch,
+		      fetch_elts, 
+		      fetch_count,
+		      (char *)pipeline_verts );
+
+   /* Run the shader, note that this overwrites the data[] parts of
+    * the pipeline verts.  If there is no shader, ie a bypass shader,
+    * then the inputs == outputs, and are already in the correct
+    * place.
+    */
+   if (opt & PT_SHADE)
    {
-      pipeline |= PT_CLIPTEST;
+      shader->run_linear(shader, 
+			 (const float (*)[4])pipeline_verts->data,
+			 (      float (*)[4])pipeline_verts->data,
+			 (const float (*)[4])draw->user.constants,
+			 fetch_count,
+			 fpme->vertex_size,
+			 fpme->vertex_size);
    }
 
+   if (draw_pt_post_vs_run( fpme->post_vs,
+			    pipeline_verts,
+			    fetch_count,
+			    fpme->vertex_size ))
+   {
+      opt |= PT_PIPELINE;
+   }
 
    /* Do we need to run the pipeline?
     */
-   if (fpme->opt & pipeline) {
+   if (opt & PT_PIPELINE) {
       draw_pt_run_pipeline( fpme->draw,
                             fpme->prim,
                             pipeline_verts,
-                            fpme->pipeline_vertex_size,
                             fetch_count,
+                            fpme->vertex_size,
                             draw_elts,
                             draw_count );
-   } else {
+   } 
+   else {
       draw_pt_emit( fpme->emit,
-		    pipeline_verts,
-		    fpme->pipeline_vertex_size,
+		    (const float (*)[4])pipeline_verts->data,
 		    fetch_count,
+		    fpme->vertex_size,
 		    draw_elts,
 		    draw_count );
    }
@@ -129,6 +179,17 @@ static void fetch_pipeline_finish( struct draw_pt_middle_end *middle )
 
 static void fetch_pipeline_destroy( struct draw_pt_middle_end *middle )
 {
+   struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle;
+
+   if (fpme->fetch)
+      draw_pt_fetch_destroy( fpme->fetch );
+
+   if (fpme->emit)
+      draw_pt_emit_destroy( fpme->emit );
+
+   if (fpme->post_vs)
+      draw_pt_post_vs_destroy( fpme->post_vs );
+
    FREE(middle);
 }
 
@@ -146,6 +207,14 @@ struct draw_pt_middle_end *draw_pt_fetch_pipeline_or_emit( struct draw_context *
 
    fpme->draw = draw;
 
+   fpme->fetch = draw_pt_fetch_create( draw );
+   if (!fpme->fetch)
+      goto fail;
+
+   fpme->post_vs = draw_pt_post_vs_create( draw );
+   if (!fpme->post_vs)
+      goto fail;
+
    fpme->emit = draw_pt_emit_create( draw );
    if (!fpme->emit) 
       goto fail;
diff --git a/src/gallium/auxiliary/draw/draw_pt_pipeline.c b/src/gallium/auxiliary/draw/draw_pt_pipeline.c
index 17ce6febec9..1a9a3adb03d 100644
--- a/src/gallium/auxiliary/draw/draw_pt_pipeline.c
+++ b/src/gallium/auxiliary/draw/draw_pt_pipeline.c
@@ -117,12 +117,13 @@ void draw_pt_reset_vertex_ids( struct draw_context *draw )
  */
 void draw_pt_run_pipeline( struct draw_context *draw,
                            unsigned prim,
-                           char *verts,
-                           unsigned stride,
+                           struct vertex_header *pipeline_verts,
                            unsigned vertex_count,
+                           unsigned stride,
                            const ushort *elts,
                            unsigned count )
 {
+   char *verts = (char *)pipeline_verts;
    unsigned i;
 
    draw->pt.pipeline.verts = verts;
diff --git a/src/gallium/auxiliary/draw/draw_pt_post_vs.c b/src/gallium/auxiliary/draw/draw_pt_post_vs.c
new file mode 100644
index 00000000000..315b02f4ee2
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_pt_post_vs.c
@@ -0,0 +1,202 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "pipe/p_util.h"
+#include "pipe/p_context.h"
+#include "draw/draw_context.h"
+#include "draw/draw_private.h"
+#include "draw/draw_vbuf.h"
+#include "draw/draw_vertex.h"
+#include "draw/draw_pt.h"
+
+struct pt_post_vs {
+   struct draw_context *draw;   /* viewport and clip planes are read through this */
+
+   boolean (*run)( struct pt_post_vs *pvs,   /* handler installed by draw_pt_post_vs_prepare() */
+		struct vertex_header *vertices,
+		unsigned count,
+		unsigned stride );
+};
+
+
+
+static INLINE unsigned
+compute_clipmask_gl(const float *clip, /*const*/ float plane[][4], unsigned nr)
+{
+   const float x = clip[0], y = clip[1], z = clip[2], w = clip[3];
+   unsigned bits = 0x0;
+   unsigned p;
+
+   /* The six hardwired planes come first: */
+   bits |= (-x + w < 0) ? CLIP_RIGHT_BIT  : 0;
+   bits |= ( x + w < 0) ? CLIP_LEFT_BIT   : 0;
+   bits |= (-y + w < 0) ? CLIP_TOP_BIT    : 0;
+   bits |= ( y + w < 0) ? CLIP_BOTTOM_BIT : 0;
+   bits |= (-z + w < 0) ? CLIP_FAR_BIT    : 0;
+   bits |= ( z + w < 0) ? CLIP_NEAR_BIT   : 0;
+
+   /* Any remaining planes map to bit 6 and upwards:
+    */
+   for (p = 6; p < nr; p++) {
+      if (dot4(clip, plane[p]) < 0) 
+         bits |= (1<<p);
+   }
+
+   return bits;
+}
+
+
+/* The normal case - cliptest, rhw divide, viewport transform, done in
+ * place on count vertices spaced stride bytes apart.  Returns TRUE if
+ * any vertex got a non-zero clipmask (those are left untransformed).
+ * Also handles identity viewport, at the expense of a few wasted
+ * instructions. */
+static boolean post_vs_cliptest_viewport_gl( struct pt_post_vs *pvs,
+					  struct vertex_header *vertices,
+					  unsigned count,
+					  unsigned stride )
+{
+   struct vertex_header *out = vertices;
+   const float *scale = pvs->draw->viewport.scale;
+   const float *trans = pvs->draw->viewport.translate;
+   unsigned j;
+   unsigned clipped = 0;
+
+   for (j = 0; j < count; j++) {
+      out->clip[0] = out->data[0][0];
+      out->clip[1] = out->data[0][1];
+      out->clip[2] = out->data[0][2];
+      out->clip[3] = out->data[0][3];
+
+      out->vertex_id = 0xffff;
+      out->edgeflag = 1;
+      out->clipmask = compute_clipmask_gl(out->clip, 
+					  pvs->draw->plane,
+					  pvs->draw->nr_planes);
+      clipped |= out->clipmask;   /* OR, not +: a running sum of masks could wrap to 0 */
+
+      if (out->clipmask == 0)
+      {
+	 /* divide by w */
+	 float w = 1.0f / out->data[0][3];
+
+	 /* Viewport mapping */
+	 out->data[0][0] = out->data[0][0] * w * scale[0] + trans[0];
+	 out->data[0][1] = out->data[0][1] * w * scale[1] + trans[1];
+	 out->data[0][2] = out->data[0][2] * w * scale[2] + trans[2];
+	 out->data[0][3] = w;
+      }
+
+      out = (struct vertex_header *)( (char *)out + stride );
+   }
+
+   return clipped != 0;
+}
+
+
+
+/* Viewport transform only, for when bypass_clipping is set: no cliptest
+ * and no rhw divide.  Always returns FALSE. */
+static boolean post_vs_viewport( struct pt_post_vs *pvs,
+			      struct vertex_header *vertices,
+			      unsigned count,
+			      unsigned stride )
+{
+   const float *scale = pvs->draw->viewport.scale;
+   const float *trans = pvs->draw->viewport.translate;
+   char *cursor = (char *)vertices;
+   unsigned left;
+
+   debug_printf("%s\n", __FUNCTION__);
+
+   for (left = count; left != 0; left--, cursor += stride) {
+      float *pos = ((struct vertex_header *)cursor)->data[0];
+      unsigned c;
+
+      /* Scale and bias x, y and z; w is not touched. */
+      for (c = 0; c < 3; c++)
+         pos[c] = pos[c] * scale[c] + trans[c];
+   }
+
+   return FALSE;
+}
+
+
+/* If bypass_clipping is set and we have an identity viewport, nothing
+ * to do: the vertices are not touched and FALSE is always returned.
+ */
+static boolean post_vs_none( struct pt_post_vs *pvs,
+			     struct vertex_header *vertices,
+			     unsigned count,
+			     unsigned stride )
+{
+   debug_printf("%s\n", __FUNCTION__);
+   return FALSE;
+}
+
+boolean draw_pt_post_vs_run( struct pt_post_vs *pvs,
+			     struct vertex_header *pipeline_verts,
+			     unsigned count,
+			     unsigned stride )
+{
+   return pvs->run( pvs, pipeline_verts, count, stride );   /* handler chosen by draw_pt_post_vs_prepare() */
+}
+
+
+void draw_pt_post_vs_prepare( struct pt_post_vs *pvs,
+			      boolean bypass_clipping,
+			      boolean identity_viewport,
+			      boolean opengl )
+{
+   /* Pick the per-run handler.  opengl is not consulted yet: the GL
+    * cliptest is the only clipping variant selected here.
+    */
+   if (!bypass_clipping) {
+      pvs->run = post_vs_cliptest_viewport_gl;
+      return;
+   }
+
+   /* Clipping bypassed: at most the viewport transform remains. */
+   pvs->run = identity_viewport ? post_vs_none : post_vs_viewport;
+}
+
+
+struct pt_post_vs *draw_pt_post_vs_create( struct draw_context *draw )
+{
+   /* Allocate the post-VS stage and remember its draw context.  The run
+    * handler is not chosen here; draw_pt_post_vs_prepare() installs it.
+    */
+   struct pt_post_vs *stage = CALLOC_STRUCT( pt_post_vs );
+   if (stage != NULL)
+      stage->draw = draw;
+   return stage;
+}
+
+void draw_pt_post_vs_destroy( struct pt_post_vs *pvs )
+{
+   FREE(pvs);   /* draw_pt_post_vs_create() allocated only the struct itself */
+}
diff --git a/src/gallium/auxiliary/draw/draw_vs_exec.c b/src/gallium/auxiliary/draw/draw_vs_exec.c
index 0e05b797159..184151b9b14 100644
--- a/src/gallium/auxiliary/draw/draw_vs_exec.c
+++ b/src/gallium/auxiliary/draw/draw_vs_exec.c
@@ -58,8 +58,10 @@ static void
 vs_exec_prepare( struct draw_vertex_shader *shader,
 		 struct draw_context *draw )
 {
+   struct exec_vertex_shader *evs = exec_vertex_shader(shader);
+
    /* specify the vertex program to interpret/execute */
-   tgsi_exec_machine_bind_shader(&draw->machine,
+   tgsi_exec_machine_bind_shader(evs->machine,
 				 shader->state.tokens,
 				 PIPE_MAX_SAMPLERS,
 				 NULL /*samplers*/ );
@@ -84,31 +86,45 @@ vs_exec_run( struct draw_vertex_shader *shader,
 	     void *vOut,
              unsigned vertex_size)
 {
-   struct tgsi_exec_machine *machine = &draw->machine;
+   struct exec_vertex_shader *evs = exec_vertex_shader(shader);
+   struct tgsi_exec_machine *machine = evs->machine;
    unsigned int i, j;
    unsigned int clipped = 0;
-
-   ALIGN16_DECL(struct tgsi_exec_vector, inputs, PIPE_MAX_ATTRIBS);
-   ALIGN16_DECL(struct tgsi_exec_vector, outputs, PIPE_MAX_ATTRIBS);
+   struct tgsi_exec_vector *outputs = 0;
    const float *scale = draw->viewport.scale;
    const float *trans = draw->viewport.translate;
 
    assert(shader->info.output_semantic_name[0] == TGSI_SEMANTIC_POSITION);
 
    machine->Consts = (const float (*)[4]) draw->user.constants;
-   machine->Inputs = ALIGN16_ASSIGN(inputs);
+
    if (draw->rasterizer->bypass_vs) {
       /* outputs are just the inputs */
-      machine->Outputs = machine->Inputs;
+      outputs = machine->Inputs;
    }
    else {
-      machine->Outputs = ALIGN16_ASSIGN(outputs);
+      outputs = machine->Outputs;
    }
 
    for (i = 0; i < count; i += MAX_TGSI_VERTICES) {
       unsigned int max_vertices = MIN2(MAX_TGSI_VERTICES, count - i);
       draw->vertex_fetch.fetch_func( draw, machine, &elts[i], max_vertices );
 
+#if 0
+      for (j = 0; j < max_vertices; j++) {
+	 unsigned slot;
+	 debug_printf("%d) Input vert:\n", i + j);
+	 for (slot = 0; slot < shader->info.num_inputs; slot++) {
+	    debug_printf("\t%d: %f %f %f %f\n", slot,
+			 machine->Inputs[slot].xyzw[0].f[j],
+			 machine->Inputs[slot].xyzw[1].f[j],
+			 machine->Inputs[slot].xyzw[2].f[j],
+			 machine->Inputs[slot].xyzw[3].f[j]);
+	 }
+      }
+#endif
+
+
       if (!draw->rasterizer->bypass_vs) {
          /* run interpreter */
          tgsi_exec_machine_run( machine );
@@ -127,10 +143,10 @@ vs_exec_run( struct draw_vertex_shader *shader,
           * program as a set of DP4 instructions appended to the
           * user-provided code.
           */
-         x = out->clip[0] = machine->Outputs[0].xyzw[0].f[j];
-         y = out->clip[1] = machine->Outputs[0].xyzw[1].f[j];
-         z = out->clip[2] = machine->Outputs[0].xyzw[2].f[j];
-         w = out->clip[3] = machine->Outputs[0].xyzw[3].f[j];
+         x = out->clip[0] = outputs[0].xyzw[0].f[j];
+         y = out->clip[1] = outputs[0].xyzw[1].f[j];
+         z = out->clip[2] = outputs[0].xyzw[2].f[j];
+         w = out->clip[3] = outputs[0].xyzw[3].f[j];
 
          if (!draw->rasterizer->bypass_clipping) {
             out->clipmask = compute_clipmask(out->clip, draw->plane,
@@ -156,7 +172,8 @@ vs_exec_run( struct draw_vertex_shader *shader,
             out->data[0][2] = z * scale[2] + trans[2];
             out->data[0][3] = w;
          }
-         else {
+         else 
+	 {
             out->data[0][0] = x;
             out->data[0][1] = y;
             out->data[0][2] = z;
@@ -167,10 +184,10 @@ vs_exec_run( struct draw_vertex_shader *shader,
           * vertex attrib slots.
           */
          for (slot = 1; slot < draw->num_vs_outputs; slot++) {
-            out->data[slot][0] = machine->Outputs[slot].xyzw[0].f[j];
-            out->data[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
-            out->data[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
-            out->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
+            out->data[slot][0] = outputs[slot].xyzw[0].f[j];
+            out->data[slot][1] = outputs[slot].xyzw[1].f[j];
+            out->data[slot][2] = outputs[slot].xyzw[2].f[j];
+            out->data[slot][3] = outputs[slot].xyzw[3].f[j];
          }
 
 #if 0 /*DEBUG*/
@@ -216,12 +233,25 @@ vs_exec_run_linear( struct draw_vertex_shader *shader,
       /* Swizzle inputs.  
        */
       for (j = 0; j < max_vertices; j++) {
+#if 0
+         debug_printf("%d) Input vert:\n", i + j);
+         for (slot = 0; slot < shader->info.num_inputs; slot++) {
+            debug_printf("\t%d: %f %f %f %f\n", slot,
+			 input[slot][0],
+			 input[slot][1],
+			 input[slot][2],
+			 input[slot][3]);
+         }
+#endif
+
          for (slot = 0; slot < shader->info.num_inputs; slot++) {
             machine->Inputs[slot].xyzw[0].f[j] = input[slot][0];
             machine->Inputs[slot].xyzw[1].f[j] = input[slot][1];
             machine->Inputs[slot].xyzw[2].f[j] = input[slot][2];
             machine->Inputs[slot].xyzw[3].f[j] = input[slot][3];
          }
+
+	 input = (const float (*)[4])((const char *)input + input_stride);
       } 
 
       /* run interpreter */
@@ -235,13 +265,23 @@ vs_exec_run_linear( struct draw_vertex_shader *shader,
             output[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
             output[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
             output[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
+
          }
+
+#if 0
+	 debug_printf("%d) Post xform vert:\n", i + j);
+	 for (slot = 0; slot < shader->info.num_outputs; slot++) {
+	    debug_printf("\t%d: %f %f %f %f\n", slot,
+			 output[slot][0],
+			 output[slot][1],
+			 output[slot][2],
+			 output[slot][3]);
+         }
+#endif
+
+	 output = (float (*)[4])((char *)output + output_stride);
       } 
 
-      /* Advance input, output pointers: 
-       */
-      input = (const float (*)[4])((const char *)input + input_stride);
-      output = (float (*)[4])((char *)output + output_stride);
    }
 }
 
diff --git a/src/gallium/auxiliary/draw/draw_vs_llvm.c b/src/gallium/auxiliary/draw/draw_vs_llvm.c
index d0baca715e6..4dbfa955a4e 100644
--- a/src/gallium/auxiliary/draw/draw_vs_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_vs_llvm.c
@@ -47,6 +47,7 @@
 struct draw_llvm_vertex_shader {
    struct draw_vertex_shader base;
    struct gallivm_prog *llvm_prog;
+   struct tgsi_exec_machine *machine;
 };
 
 
@@ -77,12 +78,10 @@ vs_llvm_run( struct draw_vertex_shader *base,
    struct draw_llvm_vertex_shader *shader =
       (struct draw_llvm_vertex_shader *)base;
 
-   struct tgsi_exec_machine *machine = &draw->machine;
+   struct tgsi_exec_machine *machine = shader->machine;
    unsigned int j;
    unsigned int clipped = 0;
-
-   ALIGN16_DECL(struct tgsi_exec_vector, inputs, PIPE_MAX_ATTRIBS);
-   ALIGN16_DECL(struct tgsi_exec_vector, outputs, PIPE_MAX_ATTRIBS);
+   struct tgsi_exec_vector *outputs = 0; /* set below: machine->Inputs if bypass_vs, else machine->Outputs */
    const float *scale = draw->viewport.scale;
    const float *trans = draw->viewport.translate;
 
@@ -93,13 +91,12 @@ vs_llvm_run( struct draw_vertex_shader *base,
    /* Consts does not require 16 byte alignment. */
    machine->Consts = (float (*)[4]) draw->user.constants;
 
-   machine->Inputs = ALIGN16_ASSIGN(inputs);
    if (draw->rasterizer->bypass_vs) {
       /* outputs are just the inputs */
-      machine->Outputs = machine->Inputs;
+      outputs = machine->Inputs;
    }
    else {
-      machine->Outputs = ALIGN16_ASSIGN(outputs);
+      outputs = machine->Outputs;
    }
 
 
@@ -119,10 +116,10 @@ vs_llvm_run( struct draw_vertex_shader *base,
       unsigned slot;
       float x, y, z, w;
 
-      x = vOut[j]->clip[0] = machine->Outputs[0].xyzw[0].f[j];
-      y = vOut[j]->clip[1] = machine->Outputs[0].xyzw[1].f[j];
-      z = vOut[j]->clip[2] = machine->Outputs[0].xyzw[2].f[j];
-      w = vOut[j]->clip[3] = machine->Outputs[0].xyzw[3].f[j];
+      x = vOut[j]->clip[0] = outputs[0].xyzw[0].f[j];
+      y = vOut[j]->clip[1] = outputs[0].xyzw[1].f[j];
+      z = vOut[j]->clip[2] = outputs[0].xyzw[2].f[j];
+      w = vOut[j]->clip[3] = outputs[0].xyzw[3].f[j];
 
       if (!draw->rasterizer->bypass_clipping) {
          vOut[j]->clipmask = compute_clipmask(vOut[j]->clip, draw->plane,
@@ -159,10 +156,10 @@ vs_llvm_run( struct draw_vertex_shader *base,
        * vertex attrib slots.
        */
       for (slot = 1; slot < draw->num_vs_outputs; slot++) {
-         vOut[j]->data[slot][0] = machine->Outputs[slot].xyzw[0].f[j];
-         vOut[j]->data[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
-         vOut[j]->data[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
-         vOut[j]->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
+         vOut[j]->data[slot][0] = outputs[slot].xyzw[0].f[j];
+         vOut[j]->data[slot][1] = outputs[slot].xyzw[1].f[j];
+         vOut[j]->data[slot][2] = outputs[slot].xyzw[2].f[j];
+         vOut[j]->data[slot][3] = outputs[slot].xyzw[3].f[j];
       }
    } /* loop over vertices */
    return clipped != 0;
@@ -183,7 +180,7 @@ vs_llvm_run_linear( struct draw_vertex_shader *base,
    struct draw_llvm_vertex_shader *shader =
       (struct draw_llvm_vertex_shader *)base;
 
-   struct tgsi_exec_machine *machine = &draw->machine;
+   struct tgsi_exec_machine *machine = shader->machine;
    unsigned int j;
 
 
@@ -199,6 +196,8 @@ vs_llvm_run_linear( struct draw_vertex_shader *base,
 	    machine->Inputs[slot].xyzw[2].f[j] = input[slot][2];
 	    machine->Inputs[slot].xyzw[3].f[j] = input[slot][3];
 	 }
+
+	 input = (const float (*)[4])((const char *)input + input_stride);
       } 
 
       /* run shader */
@@ -216,12 +215,9 @@ vs_llvm_run_linear( struct draw_vertex_shader *base,
          output[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
          output[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
          output[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
-      }
 
-      /* Advance input, output pointers: 
-       */
-      input = (const float (*)[4])((const char *)input + input_stride);
-      output = (float (*)[4])((char *)output + output_stride);
+	 output = (float (*)[4])((char *)output + output_stride);
+      }
    } 
 }
 
@@ -263,6 +259,7 @@ draw_create_vs_llvm(struct draw_context *draw,
    vs->base.run = vs_llvm_run;
    vs->base.run_linear = vs_llvm_run_linear;
    vs->base.delete = vs_llvm_delete;
+   vs->machine = &draw->machine;
 
    {
       struct gallivm_ir *ir = gallivm_ir_new(GALLIVM_VS);
diff --git a/src/gallium/auxiliary/draw/draw_vs_sse.c b/src/gallium/auxiliary/draw/draw_vs_sse.c
index 873ecfdc5d3..a763f3845c5 100644
--- a/src/gallium/auxiliary/draw/draw_vs_sse.c
+++ b/src/gallium/auxiliary/draw/draw_vs_sse.c
@@ -91,12 +91,10 @@ vs_sse_run( struct draw_vertex_shader *base,
             unsigned vertex_size )
 {
    struct draw_sse_vertex_shader *shader = (struct draw_sse_vertex_shader *)base;
-   struct tgsi_exec_machine *machine = &draw->machine;
+   struct tgsi_exec_machine *machine = shader->machine;
    unsigned int i, j;
    unsigned int clipped = 0;
-
-   ALIGN16_DECL(struct tgsi_exec_vector, inputs, PIPE_MAX_ATTRIBS);
-   ALIGN16_DECL(struct tgsi_exec_vector, outputs, PIPE_MAX_ATTRIBS);
+   struct tgsi_exec_vector *outputs = 0;
    const float *scale = draw->viewport.scale;
    const float *trans = draw->viewport.translate;
 
@@ -104,13 +102,13 @@ vs_sse_run( struct draw_vertex_shader *base,
 
    /* Consts does not require 16 byte alignment. */
    machine->Consts = (const float (*)[4]) draw->user.constants;
-   machine->Inputs = ALIGN16_ASSIGN(inputs);
+
    if (draw->rasterizer->bypass_vs) {
       /* outputs are just the inputs */
-      machine->Outputs = machine->Inputs;
+      outputs = machine->Inputs;
    }
    else {
-      machine->Outputs = ALIGN16_ASSIGN(outputs);
+      outputs = machine->Outputs;
    }
 
    for (i = 0; i < count; i += SSE_MAX_VERTICES) {
@@ -142,10 +140,10 @@ vs_sse_run( struct draw_vertex_shader *base,
          struct vertex_header *out =
             draw_header_from_block(vOut, vertex_size, i + j);
 
-         x = out->clip[0] = machine->Outputs[0].xyzw[0].f[j];
-         y = out->clip[1] = machine->Outputs[0].xyzw[1].f[j];
-         z = out->clip[2] = machine->Outputs[0].xyzw[2].f[j];
-         w = out->clip[3] = machine->Outputs[0].xyzw[3].f[j];
+         x = out->clip[0] = outputs[0].xyzw[0].f[j];
+         y = out->clip[1] = outputs[0].xyzw[1].f[j];
+         z = out->clip[2] = outputs[0].xyzw[2].f[j];
+         w = out->clip[3] = outputs[0].xyzw[3].f[j];
 
          if (!draw->rasterizer->bypass_clipping) {
             out->clipmask = compute_clipmask(out->clip, draw->plane,
@@ -182,10 +180,10 @@ vs_sse_run( struct draw_vertex_shader *base,
           * vertex attrib slots.
           */
          for (slot = 1; slot < draw->num_vs_outputs; slot++) {
-            out->data[slot][0] = machine->Outputs[slot].xyzw[0].f[j];
-            out->data[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
-            out->data[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
-            out->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
+            out->data[slot][0] = outputs[slot].xyzw[0].f[j];
+            out->data[slot][1] = outputs[slot].xyzw[1].f[j];
+            out->data[slot][2] = outputs[slot].xyzw[2].f[j];
+            out->data[slot][3] = outputs[slot].xyzw[3].f[j];
          }
 #if 0 /*DEBUG*/
          printf("%d) Post xform vert:\n", i + j);
@@ -233,6 +231,8 @@ vs_sse_run_linear( struct draw_vertex_shader *base,
             machine->Inputs[slot].xyzw[2].f[j] = input[slot][2];
             machine->Inputs[slot].xyzw[3].f[j] = input[slot][3];
          }
+
+	 input = (const float (*)[4])((const char *)input + input_stride);
       } 
 
       /* run compiled shader
@@ -253,12 +253,9 @@ vs_sse_run_linear( struct draw_vertex_shader *base,
             output[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
             output[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
          }
-      } 
 
-      /* Advance input, output pointers: 
-       */
-      input = (const float (*)[4])((const char *)input + input_stride);
-      output = (float (*)[4])((char *)output + output_stride);
+	 output = (float (*)[4])((char *)output + output_stride);
+      } 
    }
 }
 
@@ -300,6 +297,7 @@ draw_create_vs_sse(struct draw_context *draw,
    vs->base.run = vs_sse_run;
    vs->base.run_linear = vs_sse_run_linear;
    vs->base.delete = vs_sse_delete;
+   vs->machine = &draw->machine;
    
    x86_init_func( &vs->sse2_program );