st/mesa: factor ucp-lowering logic into helper

[mesa.git] / src / mesa / tnl / t_pipeline.c
diff --git a/src/mesa/tnl/t_pipeline.c b/src/mesa/tnl/t_pipeline.c

index d6121d70b6f8545f576c05e1d7bb31618ae4a627..760e69dba9d899d7528dba86e9cecc4fa0d191c0 100644 (file)
--- a/src/mesa/tnl/t_pipeline.c
+++ b/src/mesa/tnl/t_pipeline.c
@@ -1,10 +1,7 @@
-/* $Id: t_pipeline.c,v 1.23 2002/10/16 17:57:52 brianp Exp $ */
-
  /*
   * Mesa 3-D graphics library
- * Version:  3.5
   *
- * Copyright (C) 1999-2001  Brian Paul   All Rights Reserved.
+ * Copyright (C) 1999-2007  Brian Paul   All Rights Reserved.
   *
   * Permission is hereby granted, free of charge, to any person obtaining a
   * copy of this software and associated documentation files (the "Software"),
@@ -19,147 +16,235 @@
   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
   * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
   * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
- * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
   *
   * Authors:
- *    Keith Whitwell <keithw@valinux.com>
+ *    Keith Whitwell <keithw@vmware.com>
   */
  
-#include "glheader.h"
-#include "context.h"
-#include "mem.h"
-#include "mmath.h"
-#include "state.h"
-#include "mtypes.h"
+#include "main/glheader.h"
+#include "main/context.h"
  
-#include "math/m_translate.h"
-#include "math/m_xform.h"
+#include "main/mtypes.h"
  
  #include "t_context.h"
  #include "t_pipeline.h"
+#include "t_vp_build.h"
+#include "t_vertex.h"
  
-
-void _tnl_install_pipeline( GLcontext *ctx,
-                           const struct gl_pipeline_stage **stages )
+void _tnl_install_pipeline( struct gl_context *ctx,
+                           const struct tnl_pipeline_stage **stages )
  {
     TNLcontext *tnl = TNL_CONTEXT(ctx);
-   struct gl_pipeline *pipe = &tnl->pipeline;
     GLuint i;
  
-   ASSERT(pipe->nr_stages == 0);
-
-   pipe->run_state_changes = ~0;
-   pipe->run_input_changes = ~0;
-   pipe->build_state_changes = ~0;
-   pipe->build_state_trigger = 0;
-   pipe->inputs = 0;
+   tnl->pipeline.new_state = ~0;
  
     /* Create a writeable copy of each stage.
      */
     for (i = 0 ; i < MAX_PIPELINE_STAGES && stages[i] ; i++) {
-      MEMCPY( &pipe->stages[i], stages[i], sizeof( **stages ));
-      pipe->build_state_trigger |= pipe->stages[i].check_state;
+      struct tnl_pipeline_stage *s = &tnl->pipeline.stages[i];
+      memcpy(s, stages[i], sizeof(*s));
+      if (s->create)
+        s->create(ctx, s);
     }
  
-   MEMSET( &pipe->stages[i], 0, sizeof( **stages ));
-
-   pipe->nr_stages = i;
+   tnl->pipeline.nr_stages = i;
  }
  
-void _tnl_destroy_pipeline( GLcontext *ctx )
+void _tnl_destroy_pipeline( struct gl_context *ctx )
  {
     TNLcontext *tnl = TNL_CONTEXT(ctx);
     GLuint i;
  
-   for (i = 0 ; i < tnl->pipeline.nr_stages ; i++)
-      tnl->pipeline.stages[i].destroy( &tnl->pipeline.stages[i] );
+   for (i = 0 ; i < tnl->pipeline.nr_stages ; i++) {
+      struct tnl_pipeline_stage *s = &tnl->pipeline.stages[i];
+      if (s->destroy)
+        s->destroy(s);
+   }
  
     tnl->pipeline.nr_stages = 0;
  }
  
-/* TODO: merge validate with run.
- */
-void _tnl_validate_pipeline( GLcontext *ctx )
+
+
+static GLuint check_input_changes( struct gl_context *ctx )
  {
     TNLcontext *tnl = TNL_CONTEXT(ctx);
-   struct gl_pipeline *pipe = &tnl->pipeline;
-   struct gl_pipeline_stage *s = pipe->stages;
-   GLuint newstate = pipe->build_state_changes;
-   GLuint generated = 0;
-   GLuint changed_inputs = 0;
-
-   pipe->inputs = 0;
-   pipe->build_state_changes = 0;
-
-   for ( ; s->check ; s++) {
-
-      s->changed_inputs |= s->inputs & changed_inputs;
-
-      if (s->check_state & newstate) {
-        if (s->active) {
-           GLuint old_outputs = s->outputs;
-           s->check(ctx, s);
-           if (!s->active)
-              changed_inputs |= old_outputs;
-        }
-        else
-           s->check(ctx, s);
+   GLuint i;
+
+   for (i = 0; i <= _TNL_LAST_MAT; i++) {
+      if (tnl->vb.AttribPtr[i]->size != tnl->pipeline.last_attrib_size[i] ||
+         tnl->vb.AttribPtr[i]->stride != tnl->pipeline.last_attrib_stride[i]) {
+        tnl->pipeline.last_attrib_size[i] = tnl->vb.AttribPtr[i]->size;
+        tnl->pipeline.last_attrib_stride[i] = tnl->vb.AttribPtr[i]->stride;
+        tnl->pipeline.input_changes |= 1<<i;
        }
+   }
+
+   return tnl->pipeline.input_changes;
+}
  
-      if (s->active) {
-        pipe->inputs |= s->inputs & ~generated;
-        generated |= s->outputs;
+
+static GLuint check_output_changes( struct gl_context *ctx )
+{
+#if 0
+   TNLcontext *tnl = TNL_CONTEXT(ctx);
+
+   for (i = 0; i < VARYING_SLOT_MAX; i++) {
+      if (tnl->vb.ResultPtr[i]->size != tnl->last_result_size[i] ||
+         tnl->vb.ResultPtr[i]->stride != tnl->last_result_stride[i]) {
+        tnl->last_result_size[i] = tnl->vb.ResultPtr[i]->size;
+        tnl->last_result_stride[i] = tnl->vb.ResultPtr[i]->stride;
+        tnl->pipeline.output_changes |= 1<<i;
        }
     }
+
+   if (tnl->pipeline.output_changes)
+      tnl->Driver.NotifyOutputChanges( ctx, tnl->pipeline.output_changes );
+
+   return tnl->pipeline.output_changes;
+#else
+   return ~0;
+#endif
  }
  
+/**
+ * START/END_FAST_MATH macros:
+ *
+ * START_FAST_MATH: Set x86 FPU to faster, 32-bit precision mode (and save
+ *                  original mode to a temporary).
+ * END_FAST_MATH: Restore x86 FPU to original mode.
+ */
+#if defined(__GNUC__) && defined(__i386__)
+/*
+ * Set the x86 FPU control word to guarantee only 32 bits of precision
+ * are stored in registers.  Allowing the FPU to store more introduces
+ * differences between situations where numbers are pulled out of memory
+ * vs. situations where the compiler is able to optimize register usage.
+ *
+ * In the worst case, we force the compiler to use a memory access to
+ * truncate the float, by specifying the 'volatile' keyword.
+ */
+/* Hardware default: All exceptions masked, extended double precision,
+ * round to nearest (IEEE compliant):
+ */
+#define DEFAULT_X86_FPU                0x037f
+/* All exceptions masked, single precision, round to nearest:
+ */
+#define FAST_X86_FPU           0x003f
+/* The fldcw instruction will cause any pending FP exceptions to be
+ * raised prior to entering the block, and we clear any pending
+ * exceptions before exiting the block.  Hence, asm code has free
+ * rein over the FPU while in the fast math block.
+ */
+#if defined(NO_FAST_MATH)
+#define START_FAST_MATH(x)                                             \
+do {                                                                   \
+   static GLuint mask = DEFAULT_X86_FPU;                               \
+   __asm__ ( "fnstcw %0" : "=m" (*&(x)) );                             \
+   __asm__ ( "fldcw %0" : : "m" (mask) );                              \
+} while (0)
+#else
+#define START_FAST_MATH(x)                                             \
+do {                                                                   \
+   static GLuint mask = FAST_X86_FPU;                                  \
+   __asm__ ( "fnstcw %0" : "=m" (*&(x)) );                             \
+   __asm__ ( "fldcw %0" : : "m" (mask) );                              \
+} while (0)
+#endif
+/* Restore original FPU mode, and clear any exceptions that may have
+ * occurred in the FAST_MATH block.
+ */
+#define END_FAST_MATH(x)                                               \
+do {                                                                   \
+   __asm__ ( "fnclex ; fldcw %0" : : "m" (*&(x)) );                    \
+} while (0)
+
+#elif defined(_MSC_VER) && defined(_M_IX86)
+#define DEFAULT_X86_FPU                0x037f /* See GCC comments above */
+#define FAST_X86_FPU           0x003f /* See GCC comments above */
+#if defined(NO_FAST_MATH)
+#define START_FAST_MATH(x) do {\
+       static GLuint mask = DEFAULT_X86_FPU;\
+       __asm fnstcw word ptr [x]\
+       __asm fldcw word ptr [mask]\
+} while(0)
+#else
+#define START_FAST_MATH(x) do {\
+       static GLuint mask = FAST_X86_FPU;\
+       __asm fnstcw word ptr [x]\
+       __asm fldcw word ptr [mask]\
+} while(0)
+#endif
+#define END_FAST_MATH(x) do {\
+       __asm fnclex\
+       __asm fldcw word ptr [x]\
+} while(0)
+
+#else
+#define START_FAST_MATH(x)  x = 0
+#define END_FAST_MATH(x)  (void)(x)
+#endif
  
  
-void _tnl_run_pipeline( GLcontext *ctx )
+void _tnl_run_pipeline( struct gl_context *ctx )
  {
     TNLcontext *tnl = TNL_CONTEXT(ctx);
-   struct vertex_buffer *VB = &tnl->vb;
-   struct gl_pipeline *pipe = &tnl->pipeline;
-   struct gl_pipeline_stage *s = pipe->stages;
-   GLuint changed_state = pipe->run_state_changes;
-   GLuint changed_inputs = pipe->run_input_changes;
-   GLboolean running = GL_TRUE;
     unsigned short __tmp;
+   GLuint i;
  
-   pipe->run_state_changes = 0;
-   pipe->run_input_changes = 0;
-
-   /* Done elsewhere.
-    */
-   ASSERT(pipe->build_state_changes == 0);
-
-   START_FAST_MATH(__tmp);
+   if (!tnl->vb.Count)
+      return;
  
-   /* If something changes in the pipeline, tag all subsequent stages
-    * using this value for recalculation.  Inactive stages have their
-    * state and inputs examined to try to keep cached data alive over
-    * state-changes.
+   /* Check for changed input sizes or change in stride to/from zero
+    * (ie const or non-const).
      */
-   for ( ; s->run ; s++) {
-      s->changed_inputs |= s->inputs & changed_inputs;
+   if (check_input_changes( ctx ) || tnl->pipeline.new_state) {
+      if (ctx->VertexProgram._MaintainTnlProgram)
+        _tnl_UpdateFixedFunctionProgram( ctx );
+
+      for (i = 0; i < tnl->pipeline.nr_stages ; i++) {
+        struct tnl_pipeline_stage *s = &tnl->pipeline.stages[i];
+        if (s->validate)
+           s->validate( ctx, s );
+      }
  
-      if (s->run_state & changed_state)
-        s->changed_inputs = s->inputs;
+      tnl->pipeline.new_state = 0;
+      tnl->pipeline.input_changes = 0;
  
-      if (s->active && running) {
-        if (s->changed_inputs)
-           changed_inputs |= s->outputs;
+      /* Pipeline can only change its output in response to either a
+       * statechange or an input size/stride change.  No other changes
+       * are allowed.
+       */
+      if (check_output_changes( ctx ))
+        _tnl_notify_pipeline_output_change( ctx );
+   }
  
-        running = s->run( ctx, s );
+#ifndef _OPENMP
+   /* Don't adjust FPU precision mode in case multiple threads are to be used.
+    * This would require that the additional threads also changed the FPU mode
+    * which is quite a mess as this would have to be done in all parallelized sections;
+    * otherwise the master thread and all other threads are running in different
+    * modes, producing inconsistent results.
+    * Note that all x64 implementations don't define/use START_FAST_MATH, so
+    * this "hack" is only used in i386 mode
+    */
+   START_FAST_MATH(__tmp);
+#endif
  
-        s->changed_inputs = 0;
-        VB->importable_data &= ~s->outputs;
-      }
+   for (i = 0; i < tnl->pipeline.nr_stages ; i++) {
+      struct tnl_pipeline_stage *s = &tnl->pipeline.stages[i];
+      if (!s->run( ctx, s ))
+        break;
     }
  
+#ifndef _OPENMP
     END_FAST_MATH(__tmp);
+#endif
  }
  
  
@@ -186,7 +271,7 @@ void _tnl_run_pipeline( GLcontext *ctx )
   *
   * - inserting optimized (but specialized) stages ahead of the
   *   general-purpose fallback implementation.  For example, the old
- *   fastpath mechanism, which only works when the VERT_BIT_ELT input is
+ *   fastpath mechanism, which only works when the VB->Elts input is
   *   available, can be duplicated by placing the fastpath stage at the
   *   head of this pipeline.  Such specialized stages are currently
   *   constrained to have no outputs (ie. they must either finish the *
@@ -195,17 +280,21 @@ void _tnl_run_pipeline( GLcontext *ctx )
   * Some work can be done to lift some of the restrictions in the final
   * case, if it becomes necessary to do so.
   */
-const struct gl_pipeline_stage *_tnl_default_pipeline[] = {
+const struct tnl_pipeline_stage *_tnl_default_pipeline[] = {
     &_tnl_vertex_transform_stage,
     &_tnl_normal_transform_stage,
     &_tnl_lighting_stage,
-   &_tnl_fog_coordinate_stage,
     &_tnl_texgen_stage,
     &_tnl_texture_transform_stage,
     &_tnl_point_attenuation_stage,
-#if FEATURE_NV_vertex_program
     &_tnl_vertex_program_stage,
-#endif
+   &_tnl_fog_coordinate_stage,
+   &_tnl_render_stage,
+   NULL
+};
+
+const struct tnl_pipeline_stage *_tnl_vp_pipeline[] = {
+   &_tnl_vertex_program_stage,
     &_tnl_render_stage,
-   0
+   NULL
  };