Merge remote branch 'origin/lp-binning'

author José Fonseca <jfonseca@vmware.com>

Fri, 5 Feb 2010 13:48:35 +0000 (13:48 +0000)

committer José Fonseca <jfonseca@vmware.com>

Fri, 5 Feb 2010 13:48:35 +0000 (13:48 +0000)
author José Fonseca <jfonseca@vmware.com>
Fri, 5 Feb 2010 13:48:35 +0000 (13:48 +0000)
committer José Fonseca <jfonseca@vmware.com>
Fri, 5 Feb 2010 13:48:35 +0000 (13:48 +0000)
diff --combined src/gallium/auxiliary/os/os_thread.h

index 2da5fd77386901c07cd2b20875492f1c77e1eeb0,0000000000000000000000000000000000000000..24a2309976a8449f3c3ff53ffa3650bba391b800

mode 100644,000000..100644
--- 1/src/gallium/auxiliary/os/os_thread.h
--- /dev/null
+++ b/src/gallium/auxiliary/os/os_thread.h
@@@ -1,298 -1,0 +1,435 @@@
-  * Thread, mutex, condition var and thread-specific data functions.
+ +/**************************************************************************
+ + * 
+ + * Copyright 1999-2006 Brian Paul
+ + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ + * All Rights Reserved.
+ + * 
+ + * Permission is hereby granted, free of charge, to any person obtaining a
+ + * copy of this software and associated documentation files (the "Software"),
+ + * to deal in the Software without restriction, including without limitation
+ + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ + * and/or sell copies of the Software, and to permit persons to whom the
+ + * Software is furnished to do so, subject to the following conditions:
+ + *
+ + * The above copyright notice and this permission notice shall be included
+ + * in all copies or substantial portions of the Software.
+ + *
+ + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ + * 
+ + **************************************************************************/
+ +
+ +
+ +/**
+ + * @file
+ + * 
- typedef pthread_cond_t pipe_condvar;
++ * Thread, mutex, condition variable, barrier, semaphore and
++ * thread-specific data functions.
+ + */
+ +
+ +
+ +#ifndef OS_THREAD_H_
+ +#define OS_THREAD_H_
+ +
+ +
+ +#include "pipe/p_compiler.h"
+ +#include "util/u_debug.h" /* for assert */
+ +
+ +
+ +#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU)
+ +
+ +#include <pthread.h> /* POSIX threads headers */
+ +#include <stdio.h> /* for perror() */
+ +
+ +#define PIPE_THREAD_HAVE_CONDVAR
+ +
++/* pipe_thread
++ */
+ +typedef pthread_t pipe_thread;
+ +
+ +#define PIPE_THREAD_ROUTINE( name, param ) \
+ +   void *name( void *param )
+ +
+ +static INLINE pipe_thread pipe_thread_create( void *(* routine)( void *), void *param )
+ +{
+ +   pipe_thread thread;
+ +   if (pthread_create( &thread, NULL, routine, param ))
+ +      return 0;
+ +   return thread;
+ +}
+ +
+ +static INLINE int pipe_thread_wait( pipe_thread thread )
+ +{
+ +   return pthread_join( thread, NULL );
+ +}
+ +
+ +static INLINE int pipe_thread_destroy( pipe_thread thread )
+ +{
+ +   return pthread_detach( thread );
+ +}
+ +
++
++/* pipe_mutex
++ */
+ +typedef pthread_mutex_t pipe_mutex;
- /* XXX: dummy definitions, make it compile */
+ +
+ +#define pipe_static_mutex(mutex) \
+ +   static pipe_mutex mutex = PTHREAD_MUTEX_INITIALIZER
+ +
+ +#define pipe_mutex_init(mutex) \
+ +   (void) pthread_mutex_init(&(mutex), NULL)
+ +
+ +#define pipe_mutex_destroy(mutex) \
+ +   pthread_mutex_destroy(&(mutex))
+ +
+ +#define pipe_mutex_lock(mutex) \
+ +   (void) pthread_mutex_lock(&(mutex))
+ +
+ +#define pipe_mutex_unlock(mutex) \
+ +   (void) pthread_mutex_unlock(&(mutex))
+ +
++
++/* pipe_condvar
++ */
++typedef pthread_cond_t pipe_condvar;
++
+ +#define pipe_static_condvar(cond) \
+ +   static pipe_condvar cond = PTHREAD_COND_INITIALIZER
+ +
+ +#define pipe_condvar_init(cond)       \
+ +   pthread_cond_init(&(cond), NULL)
+ +
+ +#define pipe_condvar_destroy(cond) \
+ +   pthread_cond_destroy(&(cond))
+ +
+ +#define pipe_condvar_wait(cond, mutex) \
+ +  pthread_cond_wait(&(cond), &(mutex))
+ +
+ +#define pipe_condvar_signal(cond) \
+ +  pthread_cond_signal(&(cond))
+ +
+ +#define pipe_condvar_broadcast(cond) \
+ +  pthread_cond_broadcast(&(cond))
+ +
+ +
++/* pipe_barrier
++ */
++typedef pthread_barrier_t pipe_barrier;
++
++static INLINE void pipe_barrier_init(pipe_barrier *barrier, unsigned count)
++{
++   pthread_barrier_init(barrier, NULL, count);
++}
++
++static INLINE void pipe_barrier_destroy(pipe_barrier *barrier)
++{
++   pthread_barrier_destroy(barrier);
++}
++
++static INLINE void pipe_barrier_wait(pipe_barrier *barrier)
++{
++   pthread_barrier_wait(barrier);
++}
++
++
+ +#elif defined(PIPE_SUBSYSTEM_WINDOWS_USER)
+ +
+ +#include <windows.h>
+ +
++/* pipe_thread
++ */
+ +typedef HANDLE pipe_thread;
+ +
+ +#define PIPE_THREAD_ROUTINE( name, param ) \
+ +   void * WINAPI name( void *param )
+ +
+ +static INLINE pipe_thread pipe_thread_create( void *(WINAPI * routine)( void *), void *param )
+ +{
+ +   DWORD id;
+ +   return CreateThread( NULL, 0, (LPTHREAD_START_ROUTINE) routine, param, 0, &id );
+ +}
+ +
+ +static INLINE int pipe_thread_wait( pipe_thread thread )
+ +{
+ +   if (WaitForSingleObject( thread, INFINITE ) == WAIT_OBJECT_0)
+ +      return 0;
+ +   return -1;
+ +}
+ +
+ +static INLINE int pipe_thread_destroy( pipe_thread thread )
+ +{
+ +   if (CloseHandle( thread ))
+ +      return 0;
+ +   return -1;
+ +}
+ +
++
++/* pipe_mutex
++ */
+ +typedef CRITICAL_SECTION pipe_mutex;
+ +
+ +#define pipe_static_mutex(mutex) \
+ +   /*static*/ pipe_mutex mutex = {0,0,0,0,0,0}
+ +
+ +#define pipe_mutex_init(mutex) \
+ +   InitializeCriticalSection(&(mutex))
+ +
+ +#define pipe_mutex_destroy(mutex) \
+ +   DeleteCriticalSection(&(mutex))
+ +
+ +#define pipe_mutex_lock(mutex) \
+ +   EnterCriticalSection(&(mutex))
+ +
+ +#define pipe_mutex_unlock(mutex) \
+ +   LeaveCriticalSection(&(mutex))
+ +
- #define pipe_condvar_init(condvar) \
-    (void) condvar
+ +
++/* pipe_condvar (XXX FIX THIS)
++ */
+ +typedef unsigned pipe_condvar;
+ +
- #define pipe_condvar_broadcast(condvar) \
-    (void) condvar
++#define pipe_condvar_init(cond) \
++   (void) cond
++
++#define pipe_condvar_destroy(cond) \
++   (void) cond
++/* One expression, so the macro stays a single statement in unbraced loops. */
++#define pipe_condvar_wait(cond, mutex) \
++   ((void) (cond), (void) (mutex))
++
++#define pipe_condvar_signal(cond) \
++   (void) cond
++
++#define pipe_condvar_broadcast(cond) \
++   (void) cond
++
++
++/* pipe_barrier (XXX FIX THIS)
++ */
++typedef unsigned pipe_barrier;
++
++static INLINE void pipe_barrier_init(pipe_barrier *barrier, unsigned count)
++{
++   /* XXX we could implement barriers with a mutex and condition var */
++   assert(0);
++}
++
++static INLINE void pipe_barrier_destroy(pipe_barrier *barrier)
++{
++   assert(0);
++}
++
++static INLINE void pipe_barrier_wait(pipe_barrier *barrier)
++{
++   assert(0);
++}
++
+ +
+ +
+ +#else
+ +
+ +/** Dummy definitions */
+ +
+ +typedef unsigned pipe_thread;
+ +
+ +#define PIPE_THREAD_ROUTINE( name, param ) \
+ +   void * name( void *param )
+ +
+ +static INLINE pipe_thread pipe_thread_create( void *(* routine)( void *), void *param )
+ +{
+ +   return 0;
+ +}
+ +
+ +static INLINE int pipe_thread_wait( pipe_thread thread )
+ +{
+ +   return -1;
+ +}
+ +
+ +static INLINE int pipe_thread_destroy( pipe_thread thread )
+ +{
+ +   return -1;
+ +}
+ +
+ +typedef unsigned pipe_mutex;
+ +typedef unsigned pipe_condvar;
++typedef unsigned pipe_barrier;
+ +
+ +#define pipe_static_mutex(mutex) \
+ +   static pipe_mutex mutex = 0
+ +
+ +#define pipe_mutex_init(mutex) \
+ +   (void) mutex
+ +
+ +#define pipe_mutex_destroy(mutex) \
+ +   (void) mutex
+ +
+ +#define pipe_mutex_lock(mutex) \
+ +   (void) mutex
+ +
+ +#define pipe_mutex_unlock(mutex) \
+ +   (void) mutex
+ +
+ +#define pipe_static_condvar(condvar) \
+ +   static unsigned condvar = 0
+ +
+ +#define pipe_condvar_init(condvar) \
+ +   (void) condvar
+ +
+ +#define pipe_condvar_destroy(condvar) \
+ +   (void) condvar
+ +
+ +#define pipe_condvar_wait(condvar, mutex) \
+ +   (void) condvar
+ +
+ +#define pipe_condvar_signal(condvar) \
+ +   (void) condvar
+ +
+ +#define pipe_condvar_broadcast(condvar) \
+ +   (void) condvar
+ +
+ +
++static INLINE void pipe_barrier_init(pipe_barrier *barrier, unsigned count)
++{
++   /* XXX we could implement barriers with a mutex and condition var */
++   assert(0);
++}
++
++static INLINE void pipe_barrier_destroy(pipe_barrier *barrier)
++{
++   assert(0);
++}
++
++static INLINE void pipe_barrier_wait(pipe_barrier *barrier)
++{
++   assert(0);
++}
++
++
++
+ +#endif  /* PIPE_OS_? */
+ +
+ +
++/*
++ * Semaphores
++ */
++
++typedef struct
++{
++   pipe_mutex mutex;
++   pipe_condvar cond;
++   int counter;
++} pipe_semaphore;
++
++
++static INLINE void
++pipe_semaphore_init(pipe_semaphore *sema, int init_val)
++{
++   pipe_mutex_init(sema->mutex);
++   pipe_condvar_init(sema->cond);
++   sema->counter = init_val;
++}
++
++static INLINE void
++pipe_semaphore_destroy(pipe_semaphore *sema)
++{
++   pipe_mutex_destroy(sema->mutex);
++   pipe_condvar_destroy(sema->cond);
++}
++
++/** Signal/increment semaphore counter */
++static INLINE void
++pipe_semaphore_signal(pipe_semaphore *sema)
++{
++   pipe_mutex_lock(sema->mutex);
++   sema->counter++;
++   pipe_condvar_signal(sema->cond);
++   pipe_mutex_unlock(sema->mutex);
++}
++
++/** Wait for semaphore counter to be greater than zero */
++static INLINE void
++pipe_semaphore_wait(pipe_semaphore *sema)
++{
++   pipe_mutex_lock(sema->mutex);
++   while (sema->counter <= 0) {
++      pipe_condvar_wait(sema->cond, sema->mutex);
++   }
++   sema->counter--;
++   pipe_mutex_unlock(sema->mutex);
++}
++
++
+ +
+ +/*
+ + * Thread-specific data.
+ + */
+ +
+ +typedef struct {
+ +#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU)
+ +   pthread_key_t key;
+ +#elif defined(PIPE_SUBSYSTEM_WINDOWS_USER)
+ +   DWORD key;
+ +#endif
+ +   int initMagic;
+ +} pipe_tsd;
+ +
+ +
+ +#define PIPE_TSD_INIT_MAGIC 0xff8adc98
+ +
+ +
+ +static INLINE void
+ +pipe_tsd_init(pipe_tsd *tsd)
+ +{
+ +#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU)
+ +   if (pthread_key_create(&tsd->key, NULL/*free*/) != 0) {
+ +      perror("pthread_key_create(): failed to allocate key for thread specific data");
+ +      exit(-1);
+ +   }
+ +#elif defined(PIPE_SUBSYSTEM_WINDOWS_USER)
+ +   assert(0);
+ +#endif
+ +   tsd->initMagic = PIPE_TSD_INIT_MAGIC;
+ +}
+ +
+ +static INLINE void *
+ +pipe_tsd_get(pipe_tsd *tsd)
+ +{
+ +   if (tsd->initMagic != (int) PIPE_TSD_INIT_MAGIC) {
+ +      pipe_tsd_init(tsd);
+ +   }
+ +#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU)
+ +   return pthread_getspecific(tsd->key);
+ +#elif defined(PIPE_SUBSYSTEM_WINDOWS_USER)
+ +   assert(0);
+ +   return NULL;
+ +#else
+ +   assert(0);
+ +   return NULL;
+ +#endif
+ +}
+ +/** Store value in tsd for the calling thread, initializing tsd on first use. */
+ +static INLINE void
+ +pipe_tsd_set(pipe_tsd *tsd, void *value)
+ +{
+ +   if (tsd->initMagic != (int) PIPE_TSD_INIT_MAGIC) {
+ +      pipe_tsd_init(tsd);
+ +   }
+ +#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU)
+ +   if (pthread_setspecific(tsd->key, value) != 0) {
+ +      perror("pthread_setspecific() failed");
+ +      exit(-1);
+ +   }
+ +#elif defined(PIPE_SUBSYSTEM_WINDOWS_USER)
+ +   assert(0);
+ +#else
+ +   assert(0);
+ +#endif
+ +}
+ +
+ +
+ +
+ +#endif /* OS_THREAD_H_ */
diff --combined src/gallium/auxiliary/util/u_debug.c

index a8d18333d8999ad140eb8cde7dfc20885dd33097,7ee0864d292d3bc0bd4a4a9930d73d061f969337..4821b8a1434f87db2e3ef7c0a04fb8f715d2ce70
--- 1/src/gallium/auxiliary/util/u_debug.c
--- 2/src/gallium/auxiliary/util/u_debug.c
+++ b/src/gallium/auxiliary/util/u_debug.c
@@@ -29,30 -29,125 +29,30 @@@
   
   #include "pipe/p_config.h" 
   
- -#include <stdarg.h>
- -
- -
- -#ifdef PIPE_SUBSYSTEM_WINDOWS_DISPLAY
- -
- -#include <windows.h>
- -#include <winddi.h>
- -
- -#elif defined(PIPE_SUBSYSTEM_WINDOWS_CE)
- -
- -#include <stdio.h> 
- -#include <stdlib.h> 
- -#include <windows.h> 
- -#include <types.h> 
- -
- -#elif defined(PIPE_SUBSYSTEM_WINDOWS_USER)
- -
- -#ifndef WIN32_LEAN_AND_MEAN
- -#define WIN32_LEAN_AND_MEAN      // Exclude rarely-used stuff from Windows headers
- -#endif
- -#include <windows.h>
- -#include <stdio.h>
- -
- -#else
- -
- -#include <stdio.h>
- -#include <stdlib.h>
- -
- -#endif
- -
- -#include "pipe/p_compiler.h" 
+ +#include "pipe/p_compiler.h"
+ +#include "os/os_stream.h"
   #include "util/u_debug.h" 
   #include "pipe/p_format.h" 
   #include "pipe/p_state.h" 
- -#include "pipe/p_inlines.h" 
+ +#include "util/u_inlines.h" 
   #include "util/u_format.h"
   #include "util/u_memory.h" 
   #include "util/u_string.h" 
- -#include "util/u_stream.h" 
   #include "util/u_math.h" 
   #include "util/u_tile.h" 
   #include "util/u_prim.h" 
   
   
- -#ifdef PIPE_SUBSYSTEM_WINDOWS_DISPLAY
- -static INLINE void 
- -_EngDebugPrint(const char *format, ...)
- -{
- -   va_list ap;
- -   va_start(ap, format);
- -   EngDebugPrint("", (PCHAR)format, ap);
- -   va_end(ap);
- -}
- -#endif
- -
- -
   void _debug_vprintf(const char *format, va_list ap)
   {
- -#if defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY)
- -   /* EngDebugPrint does not handle float point arguments, so we need to use
- -    * our own vsnprintf implementation. It is also very slow, so buffer until
- -    * we find a newline. */
- -   static char buf[512] = {'\0'};
- -   size_t len = strlen(buf);
- -   int ret = util_vsnprintf(buf + len, sizeof(buf) - len, format, ap);
- -   if(ret > (int)(sizeof(buf) - len - 1) || util_strchr(buf + len, '\n')) {
- -      _EngDebugPrint("%s", buf);
- -      buf[0] = '\0';
- -   }
- -#elif defined(PIPE_SUBSYSTEM_WINDOWS_USER)
- -   /* OutputDebugStringA can be very slow, so buffer until we find a newline. */
+ +   /* We buffer until we find a newline. */
      static char buf[4096] = {'\0'};
      size_t len = strlen(buf);
      int ret = util_vsnprintf(buf + len, sizeof(buf) - len, format, ap);
      if(ret > (int)(sizeof(buf) - len - 1) || util_strchr(buf + len, '\n')) {
- -      OutputDebugStringA(buf);
+ +      os_log_message(buf);
         buf[0] = '\0';
      }
- -   
- -   if(GetConsoleWindow() && !IsDebuggerPresent()) {
- -      fflush(stdout);
- -      vfprintf(stderr, format, ap);
- -      fflush(stderr);
- -   }
- -   
- -#elif defined(PIPE_SUBSYSTEM_WINDOWS_CE)
- -   wchar_t *wide_format;
- -   long wide_str_len;   
- -   char buf[512];   
- -   int ret;   
- -#if (_WIN32_WCE < 600)
- -   ret = vsprintf(buf, format, ap);   
- -   if(ret < 0){   
- -       sprintf(buf, "Cant handle debug print!");   
- -       ret = 25;
- -   }
- -#else
- -   ret = vsprintf_s(buf, 512, format, ap);   
- -   if(ret < 0){   
- -       sprintf_s(buf, 512, "Cant handle debug print!");   
- -       ret = 25;
- -   }
- -#endif
- -   buf[ret] = '\0';   
- -   /* Format is ascii - needs to be converted to wchar_t for printing */   
- -   wide_str_len = MultiByteToWideChar(CP_ACP, 0, (const char *) buf, -1, NULL, 0);   
- -   wide_format = (wchar_t *) malloc((wide_str_len+1) * sizeof(wchar_t));   
- -   if (wide_format) {   
- -      MultiByteToWideChar(CP_ACP, 0, (const char *) buf, -1,   
- -            wide_format, wide_str_len);   
- -      NKDbgPrintfW(wide_format, wide_format);   
- -      free(wide_format);   
- -   } 
- -#elif defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT)
- -   /* TODO */
- -#else /* !PIPE_SUBSYSTEM_WINDOWS */
- -   fflush(stdout);
- -   vfprintf(stderr, format, ap);
- -#endif
   }
   
   
@@@ -74,12 -169,108 +74,12 @@@ void debug_print_blob( const char *name
   #endif
   
   
- -#ifndef debug_break
- -void debug_break(void) 
- -{
- -#if defined(PIPE_SUBSYSTEM_WINDOWS_USER)
- -   DebugBreak();
- -#elif defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY)
- -   EngDebugBreak();
- -#else
- -   abort();
- -#endif
- -}
- -#endif
- -
- -
- -#ifdef PIPE_SUBSYSTEM_WINDOWS_DISPLAY
- -static const char *
- -find(const char *start, const char *end, char c) 
- -{
- -   const char *p;
- -   for(p = start; !end || p != end; ++p) {
- -      if(*p == c)
- -       return p;
- -      if(*p < 32)
- -       break;
- -   }
- -   return NULL;
- -}
- -
- -static int 
- -compare(const char *start, const char *end, const char *s)
- -{
- -   const char *p, *q;
- -   for(p = start, q = s; p != end && *q != '\0'; ++p, ++q) {
- -      if(*p != *q)
- -       return 0;
- -   }
- -   return p == end && *q == '\0';
- -}
- -
- -static void 
- -copy(char *dst, const char *start, const char *end, size_t n) 
- -{
- -   const char *p;
- -   char *q;
- -   for(p = start, q = dst, n = n - 1; p != end && n; ++p, ++q, --n)
- -      *q = *p;
- -   *q = '\0';
- -}
- -#endif
- -
- -
- -static INLINE const char *
- -_debug_get_option(const char *name)
- -{
- -#if defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY)
- -   /* EngMapFile creates the file if it does not exists, so it must either be
- -    * disabled on release versions (or put in a less conspicuous place). */
- -#ifdef DEBUG
- -   const char *result = NULL;
- -   ULONG_PTR iFile = 0;
- -   const void *pMap = NULL;
- -   const char *sol, *eol, *sep;
- -   static char output[1024];
- -   
- -   pMap = EngMapFile(L"\\??\\c:\\gallium.cfg", 0, &iFile);
- -   if(pMap) {
- -      sol = (const char *)pMap;
- -      while(1) {
- -       /* TODO: handle LF line endings */
- -       eol = find(sol, NULL, '\r');
- -       if(!eol || eol == sol)
- -          break;
- -       sep = find(sol, eol, '=');
- -       if(!sep)
- -          break;
- -       if(compare(sol, sep, name)) {
- -          copy(output, sep + 1, eol, sizeof(output));
- -          result = output;
- -          break;
- -       }
- -       sol = eol + 2;
- -      }
- -      EngUnmapFile(iFile);
- -   }
- -   return result;
- -#else
- -   return NULL;
- -#endif
- -#elif defined(PIPE_SUBSYSTEM_WINDOWS_CE) || defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT) 
- -   /* TODO: implement */
- -   return NULL;
- -#else
- -   return getenv(name);
- -#endif
- -}
- -
   const char *
   debug_get_option(const char *name, const char *dfault)
   {
      const char *result;
   
- -   result = _debug_get_option(name);
+ +   result = os_get_option(name);
      if(!result)
         result = dfault;
         
@@@ -91,7 -282,7 +91,7 @@@
   boolean
   debug_get_bool_option(const char *name, boolean dfault)
   {
- -   const char *str = _debug_get_option(name);
+ +   const char *str = os_get_option(name);
      boolean result;
      
      if(str == NULL)
@@@ -121,7 -312,7 +121,7 @@@ debug_get_num_option(const char *name, 
      long result;
      const char *str;
      
- -   str = _debug_get_option(name);
+ +   str = os_get_option(name);
      if(!str)
         result = dfault;
      else {
@@@ -157,7 -348,7 +157,7 @@@ debug_get_flags_option(const char *name
      unsigned long result;
      const char *str;
      
- -   str = _debug_get_option(name);
+ +   str = os_get_option(name);
      if(!str)
         result = dfault;
      else if (!util_strcmp(str, "help")) {
@@@ -198,7 -389,7 +198,7 @@@ void _debug_assert_fail(const char *exp
   #else
      if (debug_get_bool_option("GALLIUM_ABORT_ON_ASSERT", TRUE))
   #endif
- -      debug_break();
+ +      os_abort();
      else
         _debug_printf("continuing...\n");
   }
@@@ -440,6 -631,14 +440,14 @@@ const char *u_prim_name( unsigned prim 
   
   
   #ifdef DEBUG
+ /**
+  * Dump an image to a .raw or .ppm file (depends on OS).
+  * \param format  PIPE_FORMAT_x
+  * \param cpp  bytes per pixel
+  * \param width  width in pixels
+  * \param height height in pixels
+  * \param stride  row stride in bytes
+  */
   void debug_dump_image(const char *prefix,
                         unsigned format, unsigned cpp,
                         unsigned width, unsigned height,
@@@ -481,6 -680,52 +489,52 @@@
      }
         
      EngUnmapFile(iFile);
+ #elif defined(PIPE_OS_UNIX)
+    /* write a ppm file */
+    char filename[256];
+    FILE *f;
+ 
+    util_snprintf(filename, sizeof(filename), "%s.ppm", prefix);
+ 
+    f = fopen(filename, "w");
+    if (f) {
+       int i, x, y;
+       int r, g, b;
+       const uint8_t *ptr = (uint8_t *) data;
+ 
+       /* XXX this is a hack: r/g/b are byte offsets of each channel in a pixel */
+       switch (format) {
+       case PIPE_FORMAT_A8R8G8B8_UNORM:
+          r = 2;
+          g = 1;
+          b = 0;
+          break;
+       default:
+          r = 0;
+          g = 1;
+          b = 2;
+       }
+ 
+       fprintf(f, "P6\n");
+       fprintf(f, "# ppm-file created by osdemo.c\n");
+       fprintf(f, "%i %i\n", width, height);
+       fprintf(f, "255\n");
+       fclose(f);
+ 
+       f = fopen(filename, "ab");  /* reopen in binary append mode */
+       for (y = 0; y < height; y++) {
+          for (x = 0; x < width; x++) {
+             i = y * stride + x * cpp;
+             fputc(ptr[i + r], f); /* write red */
+             fputc(ptr[i + g], f); /* write green */
+             fputc(ptr[i + b], f); /* write blue */
+          }
+       }
+       fclose(f);
+    }
+    else {
+       fprintf(stderr, "Can't open %s for writing\n", filename);
+    }
   #endif
   }
   
@@@ -521,6 -766,27 +575,27 @@@ error
   }
   
   
+ void debug_dump_texture(const char *prefix,
+                         struct pipe_texture *texture)
+ {
+    struct pipe_surface *surface;
+    struct pipe_screen *screen;
+ 
+    if (!texture)
+       return;
+ 
+    screen = texture->screen;
+ 
+    /* XXX for now, just dump image for face=0, level=0 */
+    surface = screen->get_tex_surface(screen, texture, 0, 0, 0,
+                                      PIPE_TEXTURE_USAGE_SAMPLER);
+    if (surface) {
+       debug_dump_surface(prefix, surface);
+       screen->tex_surface_destroy(surface);
+    }
+ }
+ 
+ 
   #pragma pack(push,2)
   struct bmp_file_header {
      uint16_t bfType;
@@@ -606,7 -872,7 +681,7 @@@ debug_dump_float_rgba_bmp(const char *f
                             float *rgba, unsigned stride)
   {
   #ifndef PIPE_SUBSYSTEM_WINDOWS_MINIPORT
- -   struct util_stream *stream;
+ +   struct os_stream *stream;
      struct bmp_file_header bmfh;
      struct bmp_info_header bmih;
      unsigned x, y;
@@@ -632,12 -898,12 +707,12 @@@
      bmih.biClrUsed = 0;
      bmih.biClrImportant = 0;
   
- -   stream = util_stream_create(filename, bmfh.bfSize);
+ +   stream = os_stream_create(filename, bmfh.bfSize);
      if(!stream)
         goto error1;
   
- -   util_stream_write(stream, &bmfh, 14);
- -   util_stream_write(stream, &bmih, 40);
+ +   os_stream_write(stream, &bmfh, 14);
+ +   os_stream_write(stream, &bmih, 40);
   
      y = height;
      while(y--) {
@@@ -649,11 -915,11 +724,11 @@@
            pixel.rgbGreen = float_to_ubyte(ptr[x*4 + 1]);
            pixel.rgbBlue  = float_to_ubyte(ptr[x*4 + 2]);
            pixel.rgbAlpha = 255;
- -         util_stream_write(stream, &pixel, 4);
+ +         os_stream_write(stream, &pixel, 4);
         }
      }
   
- -   util_stream_close(stream);
+ +   os_stream_close(stream);
   error1:
      ;
   #endif
diff --combined src/gallium/auxiliary/util/u_debug.h

index eadc08fe2a0ad83bce8a1185a3d1178fef83cfdd,131c99153919ad3da5985e181f3b99a3b36c2081..efcf065d276d991c557e84aca540758c55ac3a14
--- 1/src/gallium/auxiliary/util/u_debug.h
--- 2/src/gallium/auxiliary/util/u_debug.h
+++ b/src/gallium/auxiliary/util/u_debug.h
@@@ -39,7 -39,9 +39,7 @@@
   #define U_DEBUG_H_
   
   
- -#include <stdarg.h>
- -
- -#include "pipe/p_compiler.h"
+ +#include "os/os_misc.h"
   
   
   #ifdef        __cplusplus
@@@ -47,6 -49,22 +47,6 @@@ extern "C" 
   #endif
   
   
- -#if defined(DBG) || defined(DEBUG)
- -#ifndef DEBUG
- -#define DEBUG 1
- -#endif
- -#else
- -#ifndef NDEBUG
- -#define NDEBUG 1
- -#endif
- -#endif
- -
- -   
- -/* MSVC bebore VC7 does not have the __FUNCTION__ macro */
- -#if defined(_MSC_VER) && _MSC_VER < 1300
- -#define __FUNCTION__ "???"
- -#endif
- -
   #if defined(__GNUC__)
   #define _util_printf_format(fmt, list) __attribute__ ((format (printf, fmt, list)))
   #else
@@@ -137,7 -155,13 +137,7 @@@ void debug_print_format(const char *msg
    * Hard-coded breakpoint.
    */
   #ifdef DEBUG
- -#if (defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)) && defined(PIPE_CC_GCC)
- -#define debug_break() __asm("int3")
- -#elif defined(PIPE_CC_MSVC)
- -#define debug_break()  __debugbreak()
- -#else
- -void debug_break(void);
- -#endif
+ +#define debug_break() os_break()
   #else /* !DEBUG */
   #define debug_break() ((void)0)
   #endif /* !DEBUG */
@@@ -304,6 -328,22 +304,6 @@@ debug_get_flags_option(const char *name
                          unsigned long dfault);
   
   
- -void *
- -debug_malloc(const char *file, unsigned line, const char *function,
- -             size_t size);
- -
- -void
- -debug_free(const char *file, unsigned line, const char *function,
- -           void *ptr);
- -
- -void *
- -debug_calloc(const char *file, unsigned line, const char *function,
- -             size_t count, size_t size );
- -
- -void *
- -debug_realloc(const char *file, unsigned line, const char *function,
- -              void *old_ptr, size_t old_size, size_t new_size );
- -
   unsigned long
   debug_memory_begin(void);
   
@@@ -314,6 -354,8 +314,8 @@@ debug_memory_end(unsigned long beginnin
   #ifdef DEBUG
   struct pipe_surface;
   struct pipe_transfer;
+ struct pipe_texture;
+ 
   void debug_dump_image(const char *prefix,
                         unsigned format, unsigned cpp,
                         unsigned width, unsigned height,
@@@ -321,6 -363,8 +323,8 @@@
                         const void *data);
   void debug_dump_surface(const char *prefix,
                           struct pipe_surface *surface);   
+ void debug_dump_texture(const char *prefix,
+                         struct pipe_texture *texture);
   void debug_dump_surface_bmp(const char *filename,
                               struct pipe_surface *surface);
   void debug_dump_transfer_bmp(const char *filename,
diff --combined src/gallium/auxiliary/util/u_ringbuffer.c

index 95d45ebb71fd634d2811764a8f60f469093ae921,e73ba0b3481924afeb2c2681f0294bd84aaa533c..648b105b137524b3851b1dc3710e8333a79d2dc3
--- 1/src/gallium/auxiliary/util/u_ringbuffer.c
--- 2/src/gallium/auxiliary/util/u_ringbuffer.c
+++ b/src/gallium/auxiliary/util/u_ringbuffer.c
@@@ -1,5 -1,5 +1,5 @@@
   
- -#include "pipe/p_thread.h"
+ +#include "os/os_thread.h"
   #include "pipe/p_defines.h"
   #include "util/u_ringbuffer.h"
   #include "util/u_math.h"
@@@ -53,11 -53,22 +53,22 @@@ void util_ringbuffer_destroy( struct ut
      FREE(ring);
   }
   
+ /**
+  * Return number of free entries in the ring
+  */
   static INLINE unsigned util_ringbuffer_space( const struct util_ringbuffer *ring )
   {
      return (ring->tail - (ring->head + 1)) & ring->mask;
   }
   
+ /**
+  * Is the ring buffer empty?
+  */
+ static INLINE boolean util_ringbuffer_empty( const struct util_ringbuffer *ring )
+ {
+    return util_ringbuffer_space(ring) == ring->mask;
+ }
+ 
   void util_ringbuffer_enqueue( struct util_ringbuffer *ring,
                                 const struct util_packet *packet )
   {
@@@ -67,6 -78,10 +78,10 @@@
       */
      pipe_mutex_lock(ring->mutex);
   
+    /* make sure we don't request an impossible amount of space
+     */
+    assert(packet->dwords <= ring->mask);
+ 
      /* Wait for free space:
       */
      while (util_ringbuffer_space(ring) < packet->dwords)
@@@ -104,14 -119,14 +119,14 @@@ enum pipe_error util_ringbuffer_dequeue
       */
      pipe_mutex_lock(ring->mutex);
   
-    /* Wait for free space:
+    /* Get next ring entry:
       */
      if (wait) {
-       while (util_ringbuffer_space(ring) == 0)
+       while (util_ringbuffer_empty(ring))
            pipe_condvar_wait(ring->change, ring->mutex);
      }
      else {
-       if (util_ringbuffer_space(ring) == 0) {
+       if (util_ringbuffer_empty(ring)) {
            ret = PIPE_ERROR_OUT_OF_MEMORY;
            goto out;
         }
diff --combined src/gallium/auxiliary/util/u_surface.c

index 6053c111e340e4ed0c5e970c22c08f01258b60de,70de140ec9d439b20b19cd2dc0d7aba53a2c31a1..c9f1c9c210f7ca342a126cc5ca7a453b77f9a519
--- 1/src/gallium/auxiliary/util/u_surface.c
--- 2/src/gallium/auxiliary/util/u_surface.c
+++ b/src/gallium/auxiliary/util/u_surface.c
@@@ -35,8 -35,8 +35,9 @@@
   #include "pipe/p_screen.h"
   #include "pipe/p_state.h"
   #include "pipe/p_defines.h"
+ +#include "util/u_inlines.h"
   
+ #include "util/u_memory.h"
   #include "util/u_surface.h"
   
   
@@@ -111,3 -111,73 +112,73 @@@ util_destroy_rgba_surface(struct pipe_t
      pipe_texture_reference(&texture, NULL);
   }
   
+ 
+ 
+ /**
+  * Compare pipe_framebuffer_state objects.
+  * \return TRUE if same, FALSE if different
+  */
+ boolean
+ util_framebuffer_state_equal(const struct pipe_framebuffer_state *dst,
+                              const struct pipe_framebuffer_state *src)
+ {
+    unsigned i;
+ 
+    if (dst->width != src->width ||
+        dst->height != src->height)
+       return FALSE;
+ 
+    for (i = 0; i < Elements(src->cbufs); i++) {
+       if (dst->cbufs[i] != src->cbufs[i]) {
+          return FALSE;
+       }
+    }
+ 
+    if (dst->nr_cbufs != src->nr_cbufs) {
+       return FALSE;
+    }
+ 
+    if (dst->zsbuf != src->zsbuf) {
+       return FALSE;
+    }
+ 
+    return TRUE;
+ }
+ 
+ 
+ /**
+  * Copy framebuffer state from src to dst, updating refcounts.
+  */
+ void
+ util_copy_framebuffer_state(struct pipe_framebuffer_state *dst,
+                             const struct pipe_framebuffer_state *src)
+ {
+    unsigned i;
+ 
+    dst->width = src->width;
+    dst->height = src->height;
+ 
+    for (i = 0; i < Elements(src->cbufs); i++) {
+       pipe_surface_reference(&dst->cbufs[i], src->cbufs[i]);
+    }
+ 
+    dst->nr_cbufs = src->nr_cbufs;
+ 
+    pipe_surface_reference(&dst->zsbuf, src->zsbuf);
+ }
+ 
+ /** Drop every surface reference held by fb and reset its size and count. */
+ void
+ util_unreference_framebuffer_state(struct pipe_framebuffer_state *fb)
+ {
+    unsigned i;
+    /* release all slots: util_copy_framebuffer_state references all of them */
+    for (i = 0; i < Elements(fb->cbufs); i++) {
+       pipe_surface_reference(&fb->cbufs[i], NULL);
+    }
+ 
+    pipe_surface_reference(&fb->zsbuf, NULL);
+ 
+    fb->width = fb->height = 0;
+    fb->nr_cbufs = 0;
+ }
diff --combined src/gallium/auxiliary/util/u_time.h

index 7580ac0de4ce468ce77e6d6ff7f3bac5cbada059,29fd1cbc67d5a69d19e43ad179ac110b8cbad35c..15899c2c884064d09ce801b59a38bb3301a42259
--- 1/src/gallium/auxiliary/util/u_time.h
--- 2/src/gallium/auxiliary/util/u_time.h
+++ b/src/gallium/auxiliary/util/u_time.h
@@@ -38,7 -38,15 +38,7 @@@
   
   #include "pipe/p_config.h"
   
- -#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE)
- -#include <time.h> /* timeval */
- -#include <unistd.h> /* usleep */
- -#endif
- -
- -#if defined(PIPE_OS_HAIKU)
- -#include <sys/time.h> /* timeval */
- -#include <unistd.h>
- -#endif
+ +#include "os/os_time.h"
   
   #include "pipe/p_compiler.h"
   
@@@ -55,80 -63,52 +55,92 @@@ extern "C" 
    */
   struct util_time 
   {
- -#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU)
- -   struct timeval tv;
- -#else
      int64_t counter;
- -#endif
   };
      
   
- -void 
- -util_time_get(struct util_time *t);
+ +PIPE_DEPRECATED
+ +static INLINE void
+ +util_time_get(struct util_time *t)
+ +{
+ +   t->counter = os_time_get();
+ +}
+ +
   
- -void 
+ /**
+  * Return t2 = t1 + usecs
+  */
+ +PIPE_DEPRECATED
+ +static INLINE void
   util_time_add(const struct util_time *t1,
                 int64_t usecs,
- -              struct util_time *t2);
+ +              struct util_time *t2)
+ +{
+ +   t2->counter = t1->counter + usecs;
+ +}
   
- -/**
- - * Return current time in microseconds
- - */
- -uint64_t
- -util_time_micros( void );
   
- -int64_t
+ /**
+  * Return difference between times, in microseconds
+  */
+ +PIPE_DEPRECATED
+ +static INLINE int64_t
   util_time_diff(const struct util_time *t1, 
- -               const struct util_time *t2);
+ +               const struct util_time *t2)
+ +{
+ +   return t2->counter - t1->counter;
+ +}
+ +
+ +
+ +/**
+ + * Three-way comparison of two time values.
+ + *
+ + * Returns -1 when t1 is earlier than t2, 1 when it is later and 0 when
+ + * both counters are equal.
+ + *
+ + * Not publicly available because it does not take wrap-arounds into
+ + * account.  Use util_time_timeout instead.
+ + */
+ +static INLINE int
+ +_util_time_compare(const struct util_time *t1,
+ +                   const struct util_time *t2)
+ +{
+ +   const int64_t lhs = t1->counter;
+ +   const int64_t rhs = t2->counter;
+ +
+ +   /* Each comparison yields 0 or 1, so the difference is -1, 0 or 1. */
+ +   return (lhs > rhs) - (lhs < rhs);
+ +}
+ +
   
- -boolean 
+ /**
+  * Returns non-zero when the timeout expires.
+  */
+ +PIPE_DEPRECATED
+ +static INLINE boolean
   util_time_timeout(const struct util_time *start, 
                     const struct util_time *end,
- -                  const struct util_time *curr);
+ +                  const struct util_time *curr)
+ +{
+ +   return os_time_timeout(start->counter, end->counter, curr->counter);
+ +}
   
- -#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU)
- -#define util_time_sleep usleep
- -#else
- -void
- -util_time_sleep(unsigned usecs);
- -#endif
+ +
++/**
++ * Return current time in microseconds
++ *
++ * Deprecated wrapper: the value returned is whatever os_time_get() returns.
++ */
+ +PIPE_DEPRECATED
+ +static INLINE int64_t
+ +util_time_micros(void)
+ +{
+ +   return os_time_get();
+ +}
+ +
+ +
+ +PIPE_DEPRECATED
+ +static INLINE void
+ +util_time_sleep(int64_t usecs)
+ +{
+ +   os_time_sleep(usecs); /* deprecated wrapper: usecs is forwarded unchanged */
+ +}
   
   
   #ifdef        __cplusplus
diff --combined src/gallium/drivers/llvmpipe/SConscript

index 563220f17fef1c2ca7333d8c4f353f93f0020b2c,d7a396292c2bcf4d8aac466443f086d85f7e6113..840cb0950ec362a90be6863f0a567d26f7ba4c77
--- 1/src/gallium/drivers/llvmpipe/SConscript
--- 2/src/gallium/drivers/llvmpipe/SConscript
+++ b/src/gallium/drivers/llvmpipe/SConscript
@@@ -32,16 -32,16 +32,16 @@@ llvmpipe = env.ConvenienceLibrary
                 'lp_bld_depth.c',
                 'lp_bld_flow.c',
                 'lp_bld_format_aos.c',
-         'lp_bld_format_query.c',
+               'lp_bld_format_query.c',
                 'lp_bld_format_soa.c',
                 'lp_bld_interp.c',
                 'lp_bld_intr.c',
+               'lp_bld_logic.c',
                 'lp_bld_misc.cpp',
-         'lp_bld_pack.c',
-         'lp_bld_sample.c',
+               'lp_bld_pack.c',
+               'lp_bld_sample.c',
                 'lp_bld_sample_soa.c',
                 'lp_bld_struct.c',
-               'lp_bld_logic.c',
                 'lp_bld_swizzle.c',
                 'lp_bld_tgsi_soa.c',            
                 'lp_bld_type.c',
@@@ -49,12 -49,21 +49,21 @@@
                 'lp_clear.c',
                 'lp_context.c',
                 'lp_draw_arrays.c',
+               'lp_fence.c',
                 'lp_flush.c',
                 'lp_jit.c',
-               'lp_prim_vbuf.c',
-               'lp_setup.c',
+               'lp_perf.c',
                 'lp_query.c',
+               'lp_rast.c',
+               'lp_rast_tri.c',
+               'lp_scene.c',
+               'lp_scene_queue.c',
                 'lp_screen.c',
+               'lp_setup.c',
+               'lp_setup_line.c',
+               'lp_setup_point.c',
+               'lp_setup_tri.c',
+               'lp_setup_vbuf.c',
                 'lp_state_blend.c',
                 'lp_state_clip.c',
                 'lp_state_derived.c',
@@@ -65,30 -74,27 +74,28 @@@
                 'lp_state_vertex.c',
                 'lp_state_vs.c',
                 'lp_surface.c',
-               'lp_tex_cache.c',
                 'lp_tex_sample_llvm.c',
                 'lp_texture.c',
-               'lp_tile_cache.c',
                 'lp_tile_soa.c',
         ])
   
   
- -env = env.Clone()
+ +if env['platform'] != 'embedded':
+ +    env = env.Clone()
   
- -env.Prepend(LIBS = [llvmpipe] + gallium)
+ +    env.Prepend(LIBS = [llvmpipe] + gallium)
   
- -tests = [
- -    'format',
- -    'blend',
- -    'conv',
- -]
+ +    tests = [
+ +        'format',
+ +        'blend',
+ +        'conv',
+ +    ]
   
- -for test in tests:
- -    target = env.Program(
- -        target = 'lp_test_' + test,
- -        source = ['lp_test_' + test + '.c', 'lp_test_main.c'],
- -    )
- -    env.InstallProgram(target)
+ +    for test in tests:
+ +        target = env.Program(
+ +            target = 'lp_test_' + test,
+ +            source = ['lp_test_' + test + '.c', 'lp_test_main.c'],
+ +        )
+ +        env.InstallProgram(target)
   
- -Export('llvmpipe')
+ +    Export('llvmpipe')
diff --combined src/gallium/drivers/llvmpipe/lp_bld_logic.c

index 15eacb6df48a410430e7e9610ca2084b1b1872c6,d094a040d6a693509f07c993afa41e9f660c7c1d..d23de4f0ef8ca7fafaf7f557ac9dd380338a6648
--- 1/src/gallium/drivers/llvmpipe/lp_bld_logic.c
--- 2/src/gallium/drivers/llvmpipe/lp_bld_logic.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_logic.c
@@@ -34,7 -34,6 +34,7 @@@
   
   
   #include "util/u_cpu_detect.h"
+ +#include "util/u_debug.h"
   
   #include "lp_bld_type.h"
   #include "lp_bld_const.h"
@@@ -42,13 -41,17 +42,17 @@@
   #include "lp_bld_logic.h"
   
   
+ /**
+  * Build code to compare two values 'a' and 'b' of 'type' using the given func.
+  * \param func  one of PIPE_FUNC_x
+  */
   LLVMValueRef
- lp_build_cmp(struct lp_build_context *bld,
-              unsigned func,
-              LLVMValueRef a,
-              LLVMValueRef b)
+ lp_build_compare(LLVMBuilderRef builder,
+                  const struct lp_type type,
+                  unsigned func,
+                  LLVMValueRef a,
+                  LLVMValueRef b)
   {
-    const struct lp_type type = bld->type;
      LLVMTypeRef vec_type = lp_build_vec_type(type);
      LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
      LLVMValueRef zeros = LLVMConstNull(int_vec_type);
@@@ -57,6 -60,9 +61,9 @@@
      LLVMValueRef res;
      unsigned i;
   
+    assert(func >= PIPE_FUNC_NEVER);
+    assert(func <= PIPE_FUNC_ALWAYS);
+ 
      if(func == PIPE_FUNC_NEVER)
         return zeros;
      if(func == PIPE_FUNC_ALWAYS)
@@@ -69,6 -75,7 +76,7 @@@
   #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
      if(type.width * type.length == 128) {
         if(type.floating && util_cpu_caps.has_sse) {
+          /* float[4] comparison */
            LLVMValueRef args[3];
            unsigned cc;
            boolean swap;
@@@ -97,7 -104,7 +105,7 @@@
               break;
            default:
               assert(0);
-             return bld->undef;
+             return lp_build_undef(type);
            }
   
            if(swap) {
@@@ -110,14 -117,15 +118,15 @@@
            }
   
            args[2] = LLVMConstInt(LLVMInt8Type(), cc, 0);
-          res = lp_build_intrinsic(bld->builder,
+          res = lp_build_intrinsic(builder,
                                     "llvm.x86.sse.cmp.ps",
                                     vec_type,
                                     args, 3);
-          res = LLVMBuildBitCast(bld->builder, res, int_vec_type, "");
+          res = LLVMBuildBitCast(builder, res, int_vec_type, "");
            return res;
         }
         else if(util_cpu_caps.has_sse2) {
+          /* int[4] comparison */
            static const struct {
               unsigned swap:1;
               unsigned eq:1;
@@@ -153,7 -161,7 +162,7 @@@
               break;
            default:
               assert(0);
-             return bld->undef;
+             return lp_build_undef(type);
            }
   
            /* There are no signed byte and unsigned word/dword comparison
@@@ -163,8 -171,8 +172,8 @@@
               ((type.width == 8 && type.sign) ||
                (type.width != 8 && !type.sign))) {
               LLVMValueRef msb = lp_build_int_const_scalar(type, (unsigned long long)1 << (type.width - 1));
-             a = LLVMBuildXor(bld->builder, a, msb, "");
-             b = LLVMBuildXor(bld->builder, b, msb, "");
+             a = LLVMBuildXor(builder, a, msb, "");
+             b = LLVMBuildXor(builder, b, msb, "");
            }
   
            if(table[func].swap) {
@@@ -177,14 -185,14 +186,14 @@@
            }
   
            if(table[func].eq)
-             res = lp_build_intrinsic(bld->builder, pcmpeq, vec_type, args, 2);
+             res = lp_build_intrinsic(builder, pcmpeq, vec_type, args, 2);
            else if (table[func].gt)
-             res = lp_build_intrinsic(bld->builder, pcmpgt, vec_type, args, 2);
+             res = lp_build_intrinsic(builder, pcmpgt, vec_type, args, 2);
            else
               res = LLVMConstNull(vec_type);
   
            if(table[func].not)
-             res = LLVMBuildNot(bld->builder, res, "");
+             res = LLVMBuildNot(builder, res, "");
   
            return res;
         }
@@@ -220,28 -228,28 +229,28 @@@
            break;
         default:
            assert(0);
-          return bld->undef;
+          return lp_build_undef(type);
         }
   
   #if 0
         /* XXX: Although valid IR, no LLVM target currently support this */
-       cond = LLVMBuildFCmp(bld->builder, op, a, b, "");
-       res = LLVMBuildSelect(bld->builder, cond, ones, zeros, "");
+       cond = LLVMBuildFCmp(builder, op, a, b, "");
+       res = LLVMBuildSelect(builder, cond, ones, zeros, "");
   #else
         debug_printf("%s: warning: using slow element-wise vector comparison\n",
                      __FUNCTION__);
         res = LLVMGetUndef(int_vec_type);
         for(i = 0; i < type.length; ++i) {
            LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
-          cond = LLVMBuildFCmp(bld->builder, op,
-                               LLVMBuildExtractElement(bld->builder, a, index, ""),
-                               LLVMBuildExtractElement(bld->builder, b, index, ""),
+          cond = LLVMBuildFCmp(builder, op,
+                               LLVMBuildExtractElement(builder, a, index, ""),
+                               LLVMBuildExtractElement(builder, b, index, ""),
                                 "");
-          cond = LLVMBuildSelect(bld->builder, cond,
+          cond = LLVMBuildSelect(builder, cond,
                                   LLVMConstExtractElement(ones, index),
                                   LLVMConstExtractElement(zeros, index),
                                   "");
-          res = LLVMBuildInsertElement(bld->builder, res, cond, index, "");
+          res = LLVMBuildInsertElement(builder, res, cond, index, "");
         }
   #endif
      }
@@@ -268,28 -276,28 +277,28 @@@
            break;
         default:
            assert(0);
-          return bld->undef;
+          return lp_build_undef(type);
         }
   
   #if 0
         /* XXX: Although valid IR, no LLVM target currently support this */
-       cond = LLVMBuildICmp(bld->builder, op, a, b, "");
-       res = LLVMBuildSelect(bld->builder, cond, ones, zeros, "");
+       cond = LLVMBuildICmp(builder, op, a, b, "");
+       res = LLVMBuildSelect(builder, cond, ones, zeros, "");
   #else
-       debug_printf("%s: warning: using slow element-wise vector comparison\n",
+       debug_printf("%s: warning: using slow element-wise int vector comparison\n",
                      __FUNCTION__);
         res = LLVMGetUndef(int_vec_type);
         for(i = 0; i < type.length; ++i) {
            LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
-          cond = LLVMBuildICmp(bld->builder, op,
-                               LLVMBuildExtractElement(bld->builder, a, index, ""),
-                               LLVMBuildExtractElement(bld->builder, b, index, ""),
+          cond = LLVMBuildICmp(builder, op,
+                               LLVMBuildExtractElement(builder, a, index, ""),
+                               LLVMBuildExtractElement(builder, b, index, ""),
                                 "");
-          cond = LLVMBuildSelect(bld->builder, cond,
+          cond = LLVMBuildSelect(builder, cond,
                                   LLVMConstExtractElement(ones, index),
                                   LLVMConstExtractElement(zeros, index),
                                   "");
-          res = LLVMBuildInsertElement(bld->builder, res, cond, index, "");
+          res = LLVMBuildInsertElement(builder, res, cond, index, "");
         }
   #endif
      }
@@@ -298,6 -306,21 +307,21 @@@
   }
   
   
+ 
+ /**
+  * Build code to compare two values 'a' and 'b' using the given func.
+  *
+  * Convenience wrapper: forwards to lp_build_compare() with the builder and
+  * the type taken from 'bld'.
+  *
+  * \param func  one of PIPE_FUNC_x
+  */
+ LLVMValueRef
+ lp_build_cmp(struct lp_build_context *bld,
+              unsigned func,
+              LLVMValueRef a,
+              LLVMValueRef b)
+ {
+    return lp_build_compare(bld->builder, bld->type, func, a, b);
+ }
+ 
+ 
   LLVMValueRef
   lp_build_select(struct lp_build_context *bld,
                   LLVMValueRef mask,
diff --combined src/gallium/drivers/llvmpipe/lp_buffer.c

index 0dfee2a307c40866966c4b69408846303f6bb746,a5ef221a216e952800e1d109a239f4f854053271..9eda97208184000002bebb0ffb6889677646ed9b
--- 1/src/gallium/drivers/llvmpipe/lp_buffer.c
--- 2/src/gallium/drivers/llvmpipe/lp_buffer.c
+++ b/src/gallium/drivers/llvmpipe/lp_buffer.c
@@@ -26,12 -26,12 +26,12 @@@
    **************************************************************************/
   
   
+ +#include "util/u_inlines.h"
   #include "util/u_memory.h"
   #include "util/u_math.h"
   
   #include "lp_winsys.h"
   #include "lp_screen.h"
- -#include "lp_texture.h"
   #include "lp_buffer.h"
   
   
@@@ -108,32 -108,6 +108,6 @@@ llvmpipe_user_buffer_create(struct pipe
   }
   
   
- static void
- llvmpipe_fence_reference(struct pipe_screen *screen,
-                          struct pipe_fence_handle **ptr,
-                          struct pipe_fence_handle *fence)
- {
- }
- 
- 
- static int
- llvmpipe_fence_signalled(struct pipe_screen *screen,
-                          struct pipe_fence_handle *fence,
-                          unsigned flag)
- {
-    return 0;
- }
- 
- 
- static int
- llvmpipe_fence_finish(struct pipe_screen *screen,
-                       struct pipe_fence_handle *fence,
-                       unsigned flag)
- {
-    return 0;
- }
- 
- 
   void
   llvmpipe_init_screen_buffer_funcs(struct pipe_screen *screen)
   {
@@@ -142,9 -116,4 +116,4 @@@
      screen->buffer_map = llvmpipe_buffer_map;
      screen->buffer_unmap = llvmpipe_buffer_unmap;
      screen->buffer_destroy = llvmpipe_buffer_destroy;
- 
-    screen->fence_reference = llvmpipe_fence_reference;
-    screen->fence_signalled = llvmpipe_fence_signalled;
-    screen->fence_finish = llvmpipe_fence_finish;
- 
   }
diff --combined src/gallium/drivers/llvmpipe/lp_context.c

index d9adf21b6a6f2a7f889c0d9aca0e87d2563ed8cc,51de6f93ca78c1c6cd47b7c89c389512ec1d7860..a76bde390545b268c43a57312c3340b3b9d566cf
--- 1/src/gallium/drivers/llvmpipe/lp_context.c
--- 2/src/gallium/drivers/llvmpipe/lp_context.c
+++ b/src/gallium/drivers/llvmpipe/lp_context.c
@@@ -33,71 -33,21 +33,22 @@@
   #include "draw/draw_context.h"
   #include "draw/draw_vbuf.h"
   #include "pipe/p_defines.h"
+ +#include "util/u_inlines.h"
   #include "util/u_math.h"
   #include "util/u_memory.h"
   #include "lp_clear.h"
   #include "lp_context.h"
   #include "lp_flush.h"
- #include "lp_prim_vbuf.h"
+ #include "lp_perf.h"
   #include "lp_state.h"
   #include "lp_surface.h"
- #include "lp_tile_cache.h"
- #include "lp_tex_cache.h"
   #include "lp_texture.h"
   #include "lp_winsys.h"
   #include "lp_query.h"
+ #include "lp_setup.h"
   
   
   
- /**
-  * Map any drawing surfaces which aren't already mapped
-  */
- void
- llvmpipe_map_transfers(struct llvmpipe_context *lp)
- {
-    struct pipe_screen *screen = lp->pipe.screen;
-    struct pipe_surface *zsbuf = lp->framebuffer.zsbuf;
-    unsigned i;
- 
-    for (i = 0; i < lp->framebuffer.nr_cbufs; i++) {
-       lp_tile_cache_map_transfers(lp->cbuf_cache[i]);
-    }
- 
-    if(zsbuf) {
-       if(!lp->zsbuf_transfer)
-          lp->zsbuf_transfer = screen->get_tex_transfer(screen, zsbuf->texture,
-                                                        zsbuf->face, zsbuf->level, zsbuf->zslice,
-                                                        PIPE_TRANSFER_READ_WRITE,
-                                                        0, 0, zsbuf->width, zsbuf->height);
-       if(lp->zsbuf_transfer && !lp->zsbuf_map)
-          lp->zsbuf_map = screen->transfer_map(screen, lp->zsbuf_transfer);
- 
-    }
- }
- 
- 
- /**
-  * Unmap any mapped drawing surfaces
-  */
- void
- llvmpipe_unmap_transfers(struct llvmpipe_context *lp)
- {
-    uint i;
- 
-    for (i = 0; i < lp->framebuffer.nr_cbufs; i++) {
-       lp_tile_cache_unmap_transfers(lp->cbuf_cache[i]);
-    }
- 
-    if(lp->zsbuf_transfer) {
-       struct pipe_screen *screen = lp->pipe.screen;
- 
-       if(lp->zsbuf_map) {
-          screen->transfer_unmap(screen, lp->zsbuf_transfer);
-          lp->zsbuf_map = NULL;
-       }
-    }
- }
   
   
   static void llvmpipe_destroy( struct pipe_context *pipe )
@@@ -105,22 -55,24 +56,24 @@@
      struct llvmpipe_context *llvmpipe = llvmpipe_context( pipe );
      uint i;
   
+    lp_print_counters();
+ 
+    /* This will also destroy llvmpipe->setup:
+     */
      if (llvmpipe->draw)
         draw_destroy( llvmpipe->draw );
   
      for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
-       lp_destroy_tile_cache(llvmpipe->cbuf_cache[i]);
         pipe_surface_reference(&llvmpipe->framebuffer.cbufs[i], NULL);
      }
+ 
      pipe_surface_reference(&llvmpipe->framebuffer.zsbuf, NULL);
   
      for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
-       lp_destroy_tex_tile_cache(llvmpipe->tex_cache[i]);
         pipe_texture_reference(&llvmpipe->texture[i], NULL);
      }
   
      for (i = 0; i < PIPE_MAX_VERTEX_SAMPLERS; i++) {
-       lp_destroy_tex_tile_cache(llvmpipe->vertex_tex_cache[i]);
         pipe_texture_reference(&llvmpipe->vertex_textures[i], NULL);
      }
   
@@@ -139,33 -91,8 +92,8 @@@ llvmpipe_is_texture_referenced( struct 
                                 unsigned face, unsigned level)
   {
      struct llvmpipe_context *llvmpipe = llvmpipe_context( pipe );
-    unsigned i;
- 
-    /* check if any of the bound drawing surfaces are this texture */
-    if(llvmpipe->dirty_render_cache) {
-       for (i = 0; i < llvmpipe->framebuffer.nr_cbufs; i++) {
-          if(llvmpipe->framebuffer.cbufs[i] && 
-             llvmpipe->framebuffer.cbufs[i]->texture == texture)
-             return PIPE_REFERENCED_FOR_WRITE;
-       }
-       if(llvmpipe->framebuffer.zsbuf && 
-          llvmpipe->framebuffer.zsbuf->texture == texture)
-          return PIPE_REFERENCED_FOR_WRITE;
-    }
   
-    /* check if any of the tex_cache textures are this texture */
-    for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
-       if (llvmpipe->tex_cache[i] &&
-             llvmpipe->tex_cache[i]->texture == texture)
-          return PIPE_REFERENCED_FOR_READ;
-    }
-    for (i = 0; i < PIPE_MAX_VERTEX_SAMPLERS; i++) {
-       if (llvmpipe->vertex_tex_cache[i] &&
-           llvmpipe->vertex_tex_cache[i]->texture == texture)
-          return PIPE_REFERENCED_FOR_READ;
-    }
-    
-    return PIPE_UNREFERENCED;
+    return lp_setup_is_texture_referenced(llvmpipe->setup, texture);
   }
   
   static unsigned int
@@@ -179,7 -106,6 +107,6 @@@ struct pipe_context 
   llvmpipe_create( struct pipe_screen *screen )
   {
      struct llvmpipe_context *llvmpipe;
-    uint i;
   
      llvmpipe = align_malloc(sizeof(struct llvmpipe_context), 16);
      if (!llvmpipe)
@@@ -243,19 -169,6 +170,6 @@@
      llvmpipe->pipe.is_buffer_referenced = llvmpipe_is_buffer_referenced;
   
      llvmpipe_init_query_funcs( llvmpipe );
-    llvmpipe_init_texture_funcs( llvmpipe );
- 
-    /*
-     * Alloc caches for accessing drawing surfaces and textures.
-     */
-    for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++)
-       llvmpipe->cbuf_cache[i] = lp_create_tile_cache( screen );
- 
-    for (i = 0; i < PIPE_MAX_SAMPLERS; i++)
-       llvmpipe->tex_cache[i] = lp_create_tex_tile_cache( screen );
-    for (i = 0; i < PIPE_MAX_VERTEX_SAMPLERS; i++)
-       llvmpipe->vertex_tex_cache[i] = lp_create_tex_tile_cache(screen);
- 
   
      /*
       * Create drawing context and plug our rendering stage into it.
@@@ -269,19 -182,11 +183,11 @@@
      if (debug_get_bool_option( "LP_NO_RAST", FALSE ))
         llvmpipe->no_rast = TRUE;
   
-    llvmpipe->vbuf_backend = lp_create_vbuf_backend(llvmpipe);
-    if (!llvmpipe->vbuf_backend)
-       goto fail;
- 
-    llvmpipe->vbuf = draw_vbuf_stage(llvmpipe->draw, llvmpipe->vbuf_backend);
-    if (!llvmpipe->vbuf)
+    llvmpipe->setup = lp_setup_create( screen,
+                                       llvmpipe->draw );
+    if (!llvmpipe->setup)
         goto fail;
   
-    draw_set_rasterize_stage(llvmpipe->draw, llvmpipe->vbuf);
-    draw_set_render(llvmpipe->draw, llvmpipe->vbuf_backend);
- 
- 
- 
      /* plug in AA line/point stages */
      draw_install_aaline_stage(llvmpipe->draw, &llvmpipe->pipe);
      draw_install_aapoint_stage(llvmpipe->draw, &llvmpipe->pipe);
@@@ -293,6 -198,8 +199,8 @@@
   
      lp_init_surface_functions(llvmpipe);
   
+    lp_reset_counters();
+ 
      return &llvmpipe->pipe;
   
    fail:
diff --combined src/gallium/drivers/llvmpipe/lp_fence.c

index 0000000000000000000000000000000000000000,97c46087da02243fe746c18bc77a834e094340ed..525c117f316592690bc89f515641ac34ab21852a

mode 000000,100644..100644
--- /dev/null
--- 2/src/gallium/drivers/llvmpipe/lp_fence.c
+++ b/src/gallium/drivers/llvmpipe/lp_fence.c
@@@ -1,0 -1,109 +1,110 @@@
+ /**************************************************************************
+  *
+  * Copyright 2009 VMware, Inc.
+  * All Rights Reserved.
+  *
+  * Permission is hereby granted, free of charge, to any person obtaining a
+  * copy of this software and associated documentation files (the
+  * "Software"), to deal in the Software without restriction, including
+  * without limitation the rights to use, copy, modify, merge, publish,
+  * distribute, sub license, and/or sell copies of the Software, and to
+  * permit persons to whom the Software is furnished to do so, subject to
+  * the following conditions:
+  *
+  * The above copyright notice and this permission notice (including the
+  * next paragraph) shall be included in all copies or substantial portions
+  * of the Software.
+  *
+  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+  *
+  **************************************************************************/
+ 
+ 
+ #include "pipe/p_screen.h"
+ #include "util/u_memory.h"
++#include "util/u_inlines.h"
+ #include "lp_fence.h"
+ 
+ 
+ /**
+  * Allocate a fence whose counter has to reach 'rank'.
+  *
+  * The new fence starts with a reference count of one, an initialised
+  * mutex and condition variable, and 'rank' stored as given.  The counter
+  * is not written here; presumably CALLOC_STRUCT zero-fills it -- confirm.
+  *
+  * \param rank  value that 'count' is compared against by
+  *              llvmpipe_fence_signalled() and llvmpipe_fence_finish()
+  * \return the new fence, or NULL if the allocation failed
+  */
+ struct lp_fence *
+ lp_fence_create(unsigned rank)
+ {
+    struct lp_fence *fence = CALLOC_STRUCT(lp_fence);
+ 
+    /* Do not touch the members of a failed allocation. */
+    if (!fence)
+       return NULL;
+ 
+    pipe_reference_init(&fence->reference, 1);
+ 
+    pipe_mutex_init(fence->mutex);
+    pipe_condvar_init(fence->signalled);
+ 
+    fence->rank = rank;
+ 
+    return fence;
+ }
+ 
+ 
+ /**
+  * Release the synchronisation objects of a fence and free it.
+  */
+ static void
+ lp_fence_destroy(struct lp_fence *fence)
+ {
+    pipe_mutex_destroy(fence->mutex);
+    pipe_condvar_destroy(fence->signalled);
+    FREE(fence);
+ }
+ 
+ 
+ /**
+  * Make *ptr refer to 'fence', adjusting reference counts.
+  *
+  * The previously held fence is destroyed when pipe_reference() reports
+  * that it should be.
+  *
+  * NOTE(review): &old->reference / &f->reference are formed even when the
+  * fence pointer is NULL; 'reference' is the first member of struct
+  * lp_fence, so this presumably hands NULL to pipe_reference() -- confirm
+  * that pipe_reference() accepts NULL on either side.
+  *
+  * \param screen  unused
+  * \param ptr     handle slot to update
+  * \param fence   fence to store in *ptr
+  */
+ static void
+ llvmpipe_fence_reference(struct pipe_screen *screen,
+                          struct pipe_fence_handle **ptr,
+                          struct pipe_fence_handle *fence)
+ {
+    struct lp_fence *old = (struct lp_fence *) *ptr;
+    struct lp_fence *f = (struct lp_fence *) fence;
+ 
+    if (pipe_reference(&old->reference, &f->reference)) {
+       lp_fence_destroy(old);
+    }
+ 
+    /* pipe_reference() only sees the reference members, so the caller's
+     * slot has to be updated here; otherwise it keeps pointing at the old
+     * (possibly destroyed) fence.
+     */
+    *ptr = fence;
+ }
+ 
+ 
+ /**
+  * Non-blocking query: has the fence's counter reached its rank?
+  *
+  * The two fields are read without taking fence->mutex.
+  *
+  * NOTE(review): this tests count == rank while llvmpipe_fence_finish()
+  * waits while count < rank; the two agree only if count never exceeds
+  * rank -- confirm.
+  *
+  * \param screen  unused
+  * \param flag    unused
+  * \return 1 when count equals rank, 0 otherwise
+  */
+ static int
+ llvmpipe_fence_signalled(struct pipe_screen *screen,
+                          struct pipe_fence_handle *fence,
+                          unsigned flag)
+ {
+    const struct lp_fence *f = (const struct lp_fence *) fence;
+ 
+    if (f->count != f->rank)
+       return 0;
+ 
+    return 1;
+ }
+ 
+ 
+ /**
+  * Block until the fence's counter is no longer below its rank.
+  *
+  * The counter is checked with fence->mutex held, and re-checked every
+  * time the wait on the 'signalled' condition variable returns.
+  *
+  * \param screen  unused
+  * \param flag    unused
+  * \return always 0
+  */
+ static int
+ llvmpipe_fence_finish(struct pipe_screen *screen,
+                       struct pipe_fence_handle *fence_handle,
+                       unsigned flag)
+ {
+    struct lp_fence *f = (struct lp_fence *) fence_handle;
+ 
+    pipe_mutex_lock(f->mutex);
+    for (;;) {
+       if (f->count >= f->rank)
+          break;
+       pipe_condvar_wait(f->signalled, f->mutex);
+    }
+    pipe_mutex_unlock(f->mutex);
+ 
+    return 0;
+ }
+ 
+ 
+ 
+ 
+ /**
+  * Install the fence callbacks defined in this file on the given screen.
+  */
+ void
+ llvmpipe_init_screen_fence_funcs(struct pipe_screen *screen)
+ {
+    screen->fence_finish = llvmpipe_fence_finish;
+    screen->fence_signalled = llvmpipe_fence_signalled;
+    screen->fence_reference = llvmpipe_fence_reference;
+ }
diff --combined src/gallium/drivers/llvmpipe/lp_fence.h

index 0000000000000000000000000000000000000000,d45318f9e4793ac590105e2567ad617d6acc3743..c90e6de423ba41d73d0003be98ab41f7ed2fb94d

mode 000000,100644..100644
--- /dev/null
--- 2/src/gallium/drivers/llvmpipe/lp_fence.h
+++ b/src/gallium/drivers/llvmpipe/lp_fence.h
@@@ -1,0 -1,60 +1,60 @@@
- -#include "pipe/p_refcnt.h"
- -#include "pipe/p_thread.h"
+ /**************************************************************************
+  *
+  * Copyright 2009 VMware, Inc.
+  * All Rights Reserved.
+  *
+  * Permission is hereby granted, free of charge, to any person obtaining a
+  * copy of this software and associated documentation files (the
+  * "Software"), to deal in the Software without restriction, including
+  * without limitation the rights to use, copy, modify, merge, publish,
+  * distribute, sub license, and/or sell copies of the Software, and to
+  * permit persons to whom the Software is furnished to do so, subject to
+  * the following conditions:
+  *
+  * The above copyright notice and this permission notice (including the
+  * next paragraph) shall be included in all copies or substantial portions
+  * of the Software.
+  *
+  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+  *
+  **************************************************************************/
+ 
+ 
+ #ifndef LP_FENCE_H
+ #define LP_FENCE_H
+ 
+ 
++#include "os/os_thread.h"
++#include "pipe/p_state.h"
+ 
+ 
+ struct pipe_screen;
+ 
+ 
+ struct lp_fence
+ {
+    struct pipe_reference reference;   /* initialised to 1 by lp_fence_create() */
+ 
+    pipe_mutex mutex;                  /* held around the wait in llvmpipe_fence_finish() */
+    pipe_condvar signalled;            /* waited on until count is no longer below rank */
+ 
+    unsigned rank;                     /* target value, as passed to lp_fence_create() */
+    unsigned count;                    /* progress counter, compared against rank */
+ };
+ 
+ 
+ struct lp_fence *
+ lp_fence_create(unsigned rank);
+ 
+ 
+ void
+ llvmpipe_init_screen_fence_funcs(struct pipe_screen *screen);
+ 
+ 
+ #endif /* LP_FENCE_H */
diff --combined src/gallium/drivers/llvmpipe/lp_rast.c

index 0000000000000000000000000000000000000000,e27b6528eaf8a641288318a360dfacd92e255e82..54af850467ae89d59466814d8ad14fc60f2bd3d5

mode 000000,100644..100644
--- /dev/null
--- 2/src/gallium/drivers/llvmpipe/lp_rast.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast.c
@@@ -1,0 -1,1024 +1,1023 @@@
- -      printf("rasterize scene:\n");
- -      printf("  data size: %u\n", lp_scene_data_size(scene));
+ /**************************************************************************
+  *
+  * Copyright 2009 VMware, Inc.
+  * All Rights Reserved.
+  *
+  * Permission is hereby granted, free of charge, to any person obtaining a
+  * copy of this software and associated documentation files (the
+  * "Software"), to deal in the Software without restriction, including
+  * without limitation the rights to use, copy, modify, merge, publish,
+  * distribute, sub license, and/or sell copies of the Software, and to
+  * permit persons to whom the Software is furnished to do so, subject to
+  * the following conditions:
+  *
+  * The above copyright notice and this permission notice (including the
+  * next paragraph) shall be included in all copies or substantial portions
+  * of the Software.
+  *
+  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+  *
+  **************************************************************************/
+ 
+ #include <limits.h>
+ #include <string.h>
+ #include "util/u_memory.h"
+ #include "util/u_math.h"
+ #include "util/u_cpu_detect.h"
+ #include "util/u_surface.h"
+ 
+ #include "lp_scene_queue.h"
+ #include "lp_debug.h"
+ #include "lp_fence.h"
+ #include "lp_rast.h"
+ #include "lp_rast_priv.h"
+ #include "lp_tile_soa.h"
+ #include "lp_bld_debug.h"
+ #include "lp_scene.h"
+ 
+ 
+ /**
+  * Begin the rasterization phase.
+  *
+  * Copies the framebuffer state into rast->state.fb, records which buffers
+  * are to be written back at the end of each tile, and creates and maps a
+  * read/write transfer for every non-NULL color buffer and for the
+  * depth/stencil buffer (if present).
+  *
+  * \param rast            the rasterizer
+  * \param fb              framebuffer to render to
+  * \param write_color     store the color tiles in lp_rast_end_tile()
+  * \param write_zstencil  store the z/stencil tile in lp_rast_end_tile()
+  * \return TRUE on success.  On failure every transfer acquired so far is
+  *         unmapped and destroyed again, all transfer/map pointers are reset
+  *         to NULL and FALSE is returned.
+  */
+ static boolean
+ lp_rast_begin( struct lp_rasterizer *rast,
+                const struct pipe_framebuffer_state *fb,
+                boolean write_color,
+                boolean write_zstencil )
+ {
+    struct pipe_screen *screen = rast->screen;
+    struct pipe_surface *cbuf, *zsbuf;
+    unsigned i;
+ 
+    LP_DBG(DEBUG_RAST, "%s\n", __FUNCTION__);
+ 
+    util_copy_framebuffer_state(&rast->state.fb, fb);
+ 
+    rast->state.write_zstencil = write_zstencil;
+    rast->state.write_color = write_color;
+ 
+    /* partial tiles exist when the size is not a multiple of the tile size */
+    rast->check_for_clipped_tiles = (fb->width % TILE_SIZE != 0 ||
+                                     fb->height % TILE_SIZE != 0);
+ 
+    for (i = 0; i < rast->state.fb.nr_cbufs; i++) {
+       cbuf = rast->state.fb.cbufs[i];
+       if (cbuf) {
+          rast->cbuf_transfer[i] = screen->get_tex_transfer(screen,
+                                                            cbuf->texture,
+                                                            cbuf->face,
+                                                            cbuf->level,
+                                                            cbuf->zslice,
+                                                            PIPE_TRANSFER_READ_WRITE,
+                                                            0, 0,
+                                                            cbuf->width,
+                                                            cbuf->height);
+          if (!rast->cbuf_transfer[i])
+             goto fail;
+ 
+          rast->cbuf_map[i] = screen->transfer_map(screen,
+                                                   rast->cbuf_transfer[i]);
+          if (!rast->cbuf_map[i])
+             goto fail;
+       }
+    }
+ 
+    zsbuf = rast->state.fb.zsbuf;
+    if (zsbuf) {
+       rast->zsbuf_transfer = screen->get_tex_transfer(screen,
+                                                       zsbuf->texture,
+                                                       zsbuf->face,
+                                                       zsbuf->level,
+                                                       zsbuf->zslice,
+                                                       PIPE_TRANSFER_READ_WRITE,
+                                                       0, 0,
+                                                       zsbuf->width,
+                                                       zsbuf->height);
+       if (!rast->zsbuf_transfer)
+          goto fail;
+ 
+       rast->zsbuf_map = screen->transfer_map(screen,
+                                              rast->zsbuf_transfer);
+       if (!rast->zsbuf_map)
+          goto fail;
+    }
+ 
+    return TRUE;
+ 
+ fail:
+    /* Don't leak what was acquired before the failure: unmap and destroy
+     * every transfer that exists.  Entries that were never reached are
+     * still NULL and are skipped.
+     */
+    for (i = 0; i < rast->state.fb.nr_cbufs; i++) {
+       if (rast->cbuf_map[i])
+          screen->transfer_unmap(screen, rast->cbuf_transfer[i]);
+ 
+       if (rast->cbuf_transfer[i])
+          screen->tex_transfer_destroy(rast->cbuf_transfer[i]);
+ 
+       rast->cbuf_transfer[i] = NULL;
+       rast->cbuf_map[i] = NULL;
+    }
+ 
+    if (rast->zsbuf_map)
+       screen->transfer_unmap(screen, rast->zsbuf_transfer);
+ 
+    if (rast->zsbuf_transfer)
+       screen->tex_transfer_destroy(rast->zsbuf_transfer);
+ 
+    rast->zsbuf_transfer = NULL;
+    rast->zsbuf_map = NULL;
+ 
+    return FALSE;
+ }
+ 
+ 
+ /**
+  * Finish the rasterization phase.
+  * Unmaps and destroys every transfer set up by lp_rast_begin() and resets
+  * the corresponding transfer/map pointers to NULL.
+  */
+ static void
+ lp_rast_end( struct lp_rasterizer *rast )
+ {
+    struct pipe_screen *screen = rast->screen;
+    struct pipe_transfer *xfer;
+    unsigned buf;
+ 
+    /* color buffers */
+    for (buf = 0; buf < rast->state.fb.nr_cbufs; buf++) {
+       xfer = rast->cbuf_transfer[buf];
+ 
+       if (rast->cbuf_map[buf])
+          screen->transfer_unmap(screen, xfer);
+ 
+       if (xfer)
+          screen->tex_transfer_destroy(xfer);
+ 
+       rast->cbuf_map[buf] = NULL;
+       rast->cbuf_transfer[buf] = NULL;
+    }
+ 
+    /* depth/stencil buffer */
+    xfer = rast->zsbuf_transfer;
+ 
+    if (rast->zsbuf_map)
+       screen->transfer_unmap(screen, xfer);
+ 
+    if (xfer)
+       screen->tex_transfer_destroy(xfer);
+ 
+    rast->zsbuf_map = NULL;
+    rast->zsbuf_transfer = NULL;
+ }
+ 
+ 
+ /**
+  * Begin rasterization of a tile: remember the tile's window position in
+  * the calling thread's task so that later bin commands can look it up.
+  * \param thread_index  index of the calling thread's entry in rast->tasks
+  * \param x  window X position of the tile, in pixels
+  * \param y  window Y position of the tile, in pixels
+  */
+ static void
+ lp_rast_start_tile( struct lp_rasterizer *rast,
+                     unsigned thread_index,
+                     unsigned x, unsigned y )
+ {
+    struct lp_rasterizer_task *task = &rast->tasks[thread_index];
+ 
+    LP_DBG(DEBUG_RAST, "%s %d,%d\n", __FUNCTION__, x, y);
+ 
+    task->y = y;
+    task->x = x;
+ }
+ 
+ 
+ /**
+  * Clear the calling thread's color tile(s) to arg.clear_color.
+  * This is a bin command called during bin processing.
+  */
+ void lp_rast_clear_color( struct lp_rasterizer *rast,
+                           unsigned thread_index,
+                           const union lp_rast_cmd_arg arg )
+ {
+    const uint8_t *clear_color = arg.clear_color;
+    uint8_t **color_tile = rast->tasks[thread_index].tile.color;
+    boolean is_gray;
+    unsigned buf;
+ 
+    LP_DBG(DEBUG_RAST, "%s 0x%x,0x%x,0x%x,0x%x\n", __FUNCTION__, 
+               clear_color[0],
+               clear_color[1],
+               clear_color[2],
+               clear_color[3]);
+ 
+    /* all four channels equal: the whole tile is one repeated byte */
+    is_gray = (clear_color[0] == clear_color[1] &&
+               clear_color[1] == clear_color[2] &&
+               clear_color[2] == clear_color[3]);
+ 
+    for (buf = 0; buf < rast->state.fb.nr_cbufs; buf++) {
+       if (is_gray) {
+          memset(color_tile[buf], clear_color[0], TILE_SIZE * TILE_SIZE * 4);
+       }
+       else {
+          /* Non-gray color: the tile is filled with runs of 'chunk' bytes
+           * per channel, channel after channel.
+           * Note: this depends on the swizzled tile layout (see TILE_PIXEL);
+           * if that layout changes this code has to change with it.
+           */
+          const unsigned chunk = TILE_SIZE / 4;
+          uint8_t *dst = color_tile[buf];
+          unsigned group, chan;
+ 
+          for (group = 0; group < 4 * TILE_SIZE; group++) {
+             for (chan = 0; chan < 4; chan++) {
+                memset(dst, clear_color[chan], chunk);
+                dst += chunk;
+             }
+          }
+          assert(dst - color_tile[buf] == TILE_SIZE * TILE_SIZE * 4);
+       }
+    }
+ }
+ 
+ 
+ /**
+  * Fill the calling thread's z/stencil tile with arg.clear_zstencil.
+  * This is a bin command called during bin processing.
+  */
+ void lp_rast_clear_zstencil( struct lp_rasterizer *rast,
+                              unsigned thread_index,
+                              const union lp_rast_cmd_arg arg)
+ {
+    uint32_t *z = rast->tasks[thread_index].tile.depth;
+    const uint32_t *end = z + TILE_SIZE * TILE_SIZE;
+ 
+    LP_DBG(DEBUG_RAST, "%s 0x%x\n", __FUNCTION__, arg.clear_zstencil);
+ 
+    /* one 32-bit value per pixel of the tile */
+    while (z != end)
+       *z++ = arg.clear_zstencil;
+ }
+ 
+ 
+ /**
+  * Load the calling thread's color tile(s) from the mapped framebuffer
+  * surfaces.
+  * This is a bin command called during bin processing.
+  *
+  * Color buffers without a transfer, and surfaces that the current tile
+  * lies completely outside of, are skipped.
+  */
+ void lp_rast_load_color( struct lp_rasterizer *rast,
+                          unsigned thread_index,
+                          const union lp_rast_cmd_arg arg)
+ {
+    struct lp_rasterizer_task *task = &rast->tasks[thread_index];
+    const unsigned x = task->x;
+    const unsigned y = task->y;
+    unsigned i;
+ 
+    LP_DBG(DEBUG_RAST, "%s at %u, %u\n", __FUNCTION__, x, y);
+ 
+    for (i = 0; i < rast->state.fb.nr_cbufs; i++) {
+       struct pipe_transfer *transfer = rast->cbuf_transfer[i];
+ 
+       /* lp_rast_begin() creates no transfer for a NULL color buffer */
+       if (!transfer)
+          continue;
+ 
+       /* tile origin is outside of this surface */
+       if (x >= transfer->width || y >= transfer->height)
+          continue;
+ 
+       /* always a full TILE_SIZE x TILE_SIZE request */
+       lp_tile_read_4ub(transfer->texture->format,
+                        task->tile.color[i],
+                        rast->cbuf_map[i],
+                        transfer->stride,
+                        x, y,
+                        TILE_SIZE, TILE_SIZE);
+    }
+ }
+ 
+ 
+ /**
+  * Copy a w x h rectangle of 32-bit depth values out of a mapped surface.
+  *
+  * The destination is written densely: row after row, w values per row.
+  * Rows are copied with memcpy() so that no particular alignment of 'map'
+  * or 'map_stride' is assumed and the source stays const.
+  *
+  * \param tile        destination, room for at least w * h values
+  * \param map         start of the mapped surface
+  * \param map_stride  distance between surface rows, in bytes
+  * \param x0, y0      top-left corner of the rectangle, in pixels
+  * \param w, h        size of the rectangle, in pixels
+  */
+ static void
+ lp_tile_read_z32(uint32_t *tile,
+                  const uint8_t *map,
+                  unsigned map_stride,
+                  unsigned x0, unsigned y0, unsigned w, unsigned h)
+ {
+    const uint8_t *map_row = map + y0*map_stride + x0*4;
+    unsigned y;
+ 
+    for (y = 0; y < h; ++y) {
+       memcpy(tile, map_row, w * sizeof *tile);
+       tile += w;
+       map_row += map_stride;
+    }
+ }
+ 
+ /**
+  * Load the calling thread's z/stencil tile from the mapped depth surface.
+  * The copied rectangle is clipped against the framebuffer's right and
+  * bottom edges.
+  * This is a bin command called during bin processing.
+  */
+ void lp_rast_load_zstencil( struct lp_rasterizer *rast,
+                             unsigned thread_index,
+                             const union lp_rast_cmd_arg arg )
+ {
+    struct lp_rasterizer_task *task = &rast->tasks[thread_index];
+    const unsigned x = task->x;
+    const unsigned y = task->y;
+    unsigned w = TILE_SIZE;
+    unsigned h = TILE_SIZE;
+ 
+    /* trim the part of the tile that sticks out of the framebuffer */
+    if (x + w > rast->state.fb.width)
+       w = rast->state.fb.width - x;
+ 
+    if (y + h > rast->state.fb.height)
+       h = rast->state.fb.height - y;
+ 
+    LP_DBG(DEBUG_RAST, "%s %d,%d %dx%d\n", __FUNCTION__, x, y, w, h);
+ 
+    /* only 32-bit depth is handled by lp_tile_read_z32() */
+    assert(rast->zsbuf_transfer->texture->format == PIPE_FORMAT_Z32_UNORM);
+    lp_tile_read_z32(task->tile.depth,
+                     rast->zsbuf_map,
+                     rast->zsbuf_transfer->stride,
+                     x, y, w, h);
+ }
+ 
+ 
+ /**
+  * Make arg.set_state the current rasterizer state of the calling thread.
+  * This is a bin command called during bin processing.
+  */
+ void lp_rast_set_state( struct lp_rasterizer *rast,
+                         unsigned thread_index,
+                         const union lp_rast_cmd_arg arg )
+ {
+    struct lp_rasterizer_task *task = &rast->tasks[thread_index];
+    const struct lp_rast_state *new_state = arg.set_state;
+ 
+    LP_DBG(DEBUG_RAST, "%s %p\n", __FUNCTION__, (void *) new_state);
+ 
+    /* only the pointer is stored; the state itself is not copied */
+    task->current_state = new_state;
+ }
+ 
+ 
+ 
+ /**
+  * Run the shader on all blocks in a tile.  This is used when a tile is
+  * completely contained inside a triangle.
+  * This is a bin command called during bin processing.
+  *
+  * The tile is walked in 4x4 pixel blocks and jit_function[0] is invoked
+  * once per block, with INT_MIN / NULL passed for the edge-function
+  * arguments.
+  */
+ void lp_rast_shade_tile( struct lp_rasterizer *rast,
+                          unsigned thread_index,
+                          const union lp_rast_cmd_arg arg )
+ {
+    struct lp_rasterizer_task *task = &rast->tasks[thread_index];
+    const struct lp_rast_state *state = task->current_state;
+    struct lp_rast_tile *tile = &task->tile;
+    const struct lp_rast_shader_inputs *inputs = arg.shade_tile;
+    const unsigned tile_x = task->x;
+    const unsigned tile_y = task->y;
+    unsigned bx, by;
+ 
+    LP_DBG(DEBUG_RAST, "%s\n", __FUNCTION__);
+ 
+    for (by = 0; by < TILE_SIZE; by += 4) {
+       for (bx = 0; bx < TILE_SIZE; bx += 4) {
+          /* Pixel offset of this 4x4 block inside the tile: a block holds
+           * 16 pixels and a row of blocks holds 16 blocks (the constants
+           * assume a 64x64 tile).
+           */
+          const unsigned block_offset = (by / 4) * (16 * 16) + (bx / 4) * 16;
+          uint8_t *color[PIPE_MAX_COLOR_BUFS];
+          uint32_t *depth = tile->depth + block_offset;
+          unsigned buf;
+ 
+          /* color tiles use 4 bytes per pixel */
+          for (buf = 0; buf < rast->state.fb.nr_cbufs; buf++)
+             color[buf] = tile->color[buf] + 4 * block_offset;
+ 
+          /* run shader */
+          state->jit_function[0]( &state->jit_context,
+                                  tile_x + bx, tile_y + by,
+                                  inputs->a0,
+                                  inputs->dadx,
+                                  inputs->dady,
+                                  color,
+                                  depth,
+                                  INT_MIN, INT_MIN, INT_MIN,
+                                  NULL, NULL, NULL );
+       }
+    }
+ }
+ 
+ 
+ /**
+  * Compute shading for a 4x4 block of pixels.
+  *
+  * Unlike the bin commands in this file, this function takes the shader
+  * inputs, the block position and the three edge values explicitly.
+  * NOTE(review): presumably called from the triangle rasterization code;
+  * confirm against the callers.
+  *
+  * \param x, y        window position of the block, in pixels
+  * \param c1, c2, c3  passed straight through to jit_function[1]
+  */
+ void lp_rast_shade_quads( struct lp_rasterizer *rast,
+                           unsigned thread_index,
+                           const struct lp_rast_shader_inputs *inputs,
+                           unsigned x, unsigned y,
+                           int32_t c1, int32_t c2, int32_t c3)
+ {
+    struct lp_rasterizer_task *task = &rast->tasks[thread_index];
+    const struct lp_rast_state *state = task->current_state;
+    struct lp_rast_tile *tile = &task->tile;
+    uint8_t *color[PIPE_MAX_COLOR_BUFS];
+    void *depth;
+    unsigned i;
+    unsigned ix, iy;
+    int block_offset;
+ 
+ #ifdef DEBUG
+    assert(state);
+ 
+    /* Sanity checks */
+    assert(x % TILE_VECTOR_WIDTH == 0);
+    assert(y % TILE_VECTOR_HEIGHT == 0);
+ 
+    assert((x % 4) == 0);
+    assert((y % 4) == 0);
+ #endif
+ 
+    /* position of the block relative to the tile's origin */
+    ix = x % TILE_SIZE;
+    iy = y % TILE_SIZE;
+ 
+    /* pixel offset of the 4x4 block within the tile: 16 pixels per block,
+     * 16 blocks per row of blocks (same formula as lp_rast_shade_tile())
+     */
+    block_offset = ((iy / 4) * (16 * 16) + (ix / 4) * 16);
+ 
+    /* color buffer (4 bytes per pixel) */
+    for (i = 0; i < rast->state.fb.nr_cbufs; i++)
+       color[i] = tile->color[i] + 4 * block_offset;
+ 
+    /* depth buffer */
+    depth = tile->depth + block_offset;
+ 
+ 
+ 
+ #ifdef DEBUG
+    assert(lp_check_alignment(tile->depth, 16));
+    assert(lp_check_alignment(tile->color[0], 16));
+    assert(lp_check_alignment(state->jit_context.blend_color, 16));
+ 
+    assert(lp_check_alignment(inputs->step[0], 16));
+    assert(lp_check_alignment(inputs->step[1], 16));
+    assert(lp_check_alignment(inputs->step[2], 16));
+ #endif
+ 
+    /* run shader; variant [1] receives the edge values and step arrays */
+    state->jit_function[1]( &state->jit_context,
+                         x, y,
+                         inputs->a0,
+                         inputs->dadx,
+                         inputs->dady,
+                         color,
+                         depth,
+                         c1, c2, c3,
+                         inputs->step[0], inputs->step[1], inputs->step[2]);
+ }
+ 
+ 
+ /**
+  * Set all four channels of the tile's pixels (i, 0) and (0, i) to 0xff,
+  * i.e. paint the top row and the left column white.  For debugging.
+  */
+ static void
+ outline_tile(uint8_t *tile)
+ {
+    const uint8_t white = 0xff;
+    unsigned pos, chan;
+ 
+    for (pos = 0; pos < TILE_SIZE; pos++) {
+       for (chan = 0; chan < 4; chan++)
+          TILE_PIXEL(tile, pos, 0, chan) = white;
+ 
+       for (chan = 0; chan < 4; chan++)
+          TILE_PIXEL(tile, 0, pos, chan) = white;
+    }
+ }
+ 
+ 
+ /**
+  * Draw a grid of gray (0x80) lines at 16-pixel intervals across the tile
+  * to show the sub-tile boundaries, then add the white tile outline on
+  * top.  For debugging.
+  */
+ static void
+ outline_subtiles(uint8_t *tile)
+ {
+    const uint8_t gray = 0x80;
+    const unsigned spacing = 16;
+    unsigned line, pos, chan;
+ 
+    for (line = 0; line < TILE_SIZE; line += spacing) {
+       for (pos = 0; pos < TILE_SIZE; pos++) {
+          for (chan = 0; chan < 4; chan++)
+             TILE_PIXEL(tile, line, pos, chan) = gray;
+ 
+          for (chan = 0; chan < 4; chan++)
+             TILE_PIXEL(tile, pos, line, chan) = gray;
+       }
+    }
+ 
+    /* the white outline overwrites the grid lines at index 0 */
+    outline_tile(tile);
+ }
+ 
+ 
+ 
+ /**
+  * Write the calling thread's color tile(s) to the mapped framebuffer
+  * surfaces.
+  *
+  * Color buffers without a transfer, and surfaces that the current tile
+  * lies completely outside of, are skipped.  When the DEBUG_SHOW_SUBTILES
+  * or DEBUG_SHOW_TILES debug flag is set, the tile is outlined before it
+  * is written.
+  */
+ static void lp_rast_store_color( struct lp_rasterizer *rast,
+                                  unsigned thread_index)
+ {
+    struct lp_rasterizer_task *task = &rast->tasks[thread_index];
+    const unsigned x = task->x;
+    const unsigned y = task->y;
+    unsigned i;
+ 
+    for (i = 0; i < rast->state.fb.nr_cbufs; i++) {
+       struct pipe_transfer *transfer = rast->cbuf_transfer[i];
+ 
+       /* lp_rast_begin() creates no transfer for a NULL color buffer */
+       if (!transfer)
+          continue;
+ 
+       /* tile origin is outside of this surface */
+       if (x >= transfer->width || y >= transfer->height)
+          continue;
+ 
+       LP_DBG(DEBUG_RAST, "%s [%u] %d,%d %dx%d\n", __FUNCTION__,
+              thread_index, x, y, TILE_SIZE, TILE_SIZE);
+ 
+       if (LP_DEBUG & DEBUG_SHOW_SUBTILES)
+          outline_subtiles(task->tile.color[i]);
+       else if (LP_DEBUG & DEBUG_SHOW_TILES)
+          outline_tile(task->tile.color[i]);
+ 
+       /* always a full TILE_SIZE x TILE_SIZE request */
+       lp_tile_write_4ub(transfer->texture->format,
+                         task->tile.color[i],
+                         rast->cbuf_map[i],
+                         transfer->stride,
+                         x, y,
+                         TILE_SIZE, TILE_SIZE);
+    }
+ }
+ 
+ 
+ /**
+  * Copy a w x h rectangle of 32-bit depth values into a mapped surface.
+  *
+  * The source is read densely: row after row, w values per row.  Rows are
+  * copied with memcpy() so that no particular alignment of 'dst' or
+  * 'dst_stride' is assumed.
+  *
+  * \param src         source values, at least w * h of them
+  * \param dst         start of the mapped surface
+  * \param dst_stride  distance between surface rows, in bytes
+  * \param x0, y0      top-left corner of the rectangle, in pixels
+  * \param w, h        size of the rectangle, in pixels
+  */
+ static void
+ lp_tile_write_z32(const uint32_t *src, uint8_t *dst, unsigned dst_stride,
+                   unsigned x0, unsigned y0, unsigned w, unsigned h)
+ {
+    uint8_t *dst_row = dst + y0*dst_stride + x0*4;
+    unsigned y;
+ 
+    for (y = 0; y < h; ++y) {
+       memcpy(dst_row, src, w * sizeof *src);
+       src += w;
+       dst_row += dst_stride;
+    }
+ }
+ 
+ /**
+  * Write the calling thread's z/stencil tile to the mapped depth surface.
+  * The written rectangle is clipped against the framebuffer's right and
+  * bottom edges.
+  */
+ static void lp_rast_store_zstencil( struct lp_rasterizer *rast,
+                                     unsigned thread_index )
+ {
+    struct lp_rasterizer_task *task = &rast->tasks[thread_index];
+    const unsigned x = task->x;
+    const unsigned y = task->y;
+    unsigned w = TILE_SIZE;
+    unsigned h = TILE_SIZE;
+ 
+    /* trim the part of the tile that sticks out of the framebuffer */
+    if (x + w > rast->state.fb.width)
+       w = rast->state.fb.width - x;
+ 
+    if (y + h > rast->state.fb.height)
+       h = rast->state.fb.height - y;
+ 
+    LP_DBG(DEBUG_RAST, "%s %d,%d %dx%d\n", __FUNCTION__, x, y, w, h);
+ 
+    /* only 32-bit depth is handled by lp_tile_write_z32() */
+    assert(rast->zsbuf_transfer->texture->format == PIPE_FORMAT_Z32_UNORM);
+    lp_tile_write_z32(task->tile.depth,
+                      rast->zsbuf_map,
+                      rast->zsbuf_transfer->stride,
+                      x, y, w, h);
+ }
+ 
+ 
+ /**
+  * Finish a tile: write the calling thread's tile buffers back to the
+  * framebuffer, honouring the write_color / write_zstencil flags that
+  * lp_rast_begin() stored in rast->state.
+  */
+ static void
+ lp_rast_end_tile( struct lp_rasterizer *rast,
+                   unsigned thread_index )
+ {
+    LP_DBG(DEBUG_RAST, "%s\n", __FUNCTION__);
+ 
+    /* color first, then z/stencil */
+    if (rast->state.write_color) {
+       lp_rast_store_color(rast, thread_index);
+    }
+ 
+    if (rast->state.write_zstencil) {
+       lp_rast_store_zstencil(rast, thread_index);
+    }
+ }
+ 
+ 
+ /**
+  * Signal on a fence.  This is called during bin execution/rasterization.
+  * Called per thread.
+  *
+  * Increments arg.fence->count and signals the fence's condition variable,
+  * both while holding the fence's mutex.
+  */
+ void lp_rast_fence( struct lp_rasterizer *rast,
+                     unsigned thread_index,
+                     const union lp_rast_cmd_arg arg )
+ {
+    struct lp_fence *fence = arg.fence;
+ 
+    pipe_mutex_lock( fence->mutex );
+ 
+    /* the counter must never exceed the fence's rank */
+    fence->count++;
+    assert(fence->count <= fence->rank);
+ 
+    LP_DBG(DEBUG_RAST, "%s count=%u rank=%u\n", __FUNCTION__,
+           fence->count, fence->rank);
+ 
+    /* NOTE(review): signal (not broadcast) -- presumably there is at most
+     * one waiter per fence; confirm against the wait side.
+     */
+    pipe_condvar_signal( fence->signalled );
+ 
+    pipe_mutex_unlock( fence->mutex );
+ }
+ 
+ 
+ /**
+  * When all the threads are done rasterizing a scene, one thread will
+  * call this function to reset the scene and put it onto the empty queue.
+  *
+  * Also drops the scene's framebuffer references and clears
+  * rast->curr_scene.
+  */
+ static void
+ release_scene( struct lp_rasterizer *rast,
+              struct lp_scene *scene )
+ {
+    /* undo the util_copy_framebuffer_state() done in lp_rasterize_scene() */
+    util_unreference_framebuffer_state( &scene->fb );
+ 
+    /* reset before enqueueing: once on the empty queue the scene is no
+     * longer touched here
+     */
+    lp_scene_reset( scene );
+    lp_scene_enqueue( rast->empty_scenes, scene );
+    rast->curr_scene = NULL;
+ }
+ 
+ 
+ /**
+  * Rasterize commands for a single bin.
+  * Brackets the execution of the bin's commands with lp_rast_start_tile()
+  * and lp_rast_end_tile().
+  * \param x, y  position of the bin's tile in the framebuffer
+  * Must be called between lp_rast_begin() and lp_rast_end().
+  * Called per thread.
+  */
+ static void
+ rasterize_bin( struct lp_rasterizer *rast,
+                unsigned thread_index,
+                const struct cmd_bin *bin,
+                int x, int y)
+ {
+    const struct cmd_block *block = bin->commands.head;
+ 
+    lp_rast_start_tile( rast, thread_index, x, y );
+ 
+    /* walk the block list, running every command with its argument */
+    while (block) {
+       unsigned cmd;
+ 
+       for (cmd = 0; cmd < block->count; cmd++) {
+          block->cmd[cmd]( rast, thread_index, block->arg[cmd] );
+       }
+ 
+       block = block->next;
+    }
+ 
+    lp_rast_end_tile( rast, thread_index );
+ }
+ 
+ 
+ /* Build a { function, "name" } table entry for the bin command lp_rast_<x>. */
+ #define RAST(x) { lp_rast_##x, #x }
+ 
+ /**
+  * Bin command functions paired with printable names.
+  * Read-only lookup table used by debug_bin().
+  */
+ static const struct {
+    lp_rast_cmd cmd;
+    const char *name;
+ } cmd_names[] = 
+ {
+    RAST(load_color),
+    RAST(load_zstencil),
+    RAST(clear_color),
+    RAST(clear_zstencil),
+    RAST(triangle),
+    RAST(shade_tile),
+    RAST(set_state),
+    RAST(fence),
+ };
+ 
+ /**
+  * Print the names of the commands in the first command block of a bin.
+  * Commands that are not listed in cmd_names[] are printed as "...other".
+  * Only the head block is dumped; following blocks are ignored.
+  */
+ static void
+ debug_bin( const struct cmd_bin *bin )
+ {
+    const struct cmd_block *head = bin->commands.head;
+    unsigned i, j;
+ 
+    /* unsigned counters: both head->count and Elements() are unsigned */
+    for (i = 0; i < head->count; i++) {
+       debug_printf("%u: ", i);
+       for (j = 0; j < Elements(cmd_names); j++) {
+          if (head->cmd[i] == cmd_names[j].cmd) {
+             debug_printf("%s\n", cmd_names[j].name);
+             break;
+          }
+       }
+       /* the inner loop ran to completion: no name matched */
+       if (j == Elements(cmd_names))
+          debug_printf("...other\n");
+    }
+ 
+ }
+ 
+ /* An empty bin is one that just loads the contents of the tile and
+  * stores them again unchanged.  This typically happens when bins have
+  * been flushed for some reason in the middle of a frame, or when
+  * incremental updates are being made to a render target.
+  *
+  * Detecting such bins lets the caller avoid pointless work.
+  */
+ static boolean
+ is_empty_bin( const struct cmd_bin *bin )
+ {
+    const struct cmd_block *head = bin->commands.head;
+    unsigned i;
+ 
+    /* flip to 1 to dump the bin's commands while debugging */
+    if (0)
+       debug_bin(bin);
+ 
+    /* At most two load-tile commands are emitted at the start of the first
+     * command block, plus apparently a couple of set-state commands even
+     * in empty bins.  Heuristic: a bin with a second block or with more
+     * than 4 commands is considered non-empty.
+     */
+    if (head->next != NULL)
+       return FALSE;
+ 
+    if (head->count > 4)
+       return FALSE;
+ 
+    /* anything besides loads and state changes means real work */
+    for (i = 0; i < head->count; i++) {
+       const lp_rast_cmd cmd = head->cmd[i];
+ 
+       if (cmd == lp_rast_load_color ||
+           cmd == lp_rast_load_zstencil ||
+           cmd == lp_rast_set_state)
+          continue;
+ 
+       return FALSE;
+    }
+ 
+    return TRUE;
+ }
+ 
+ 
+ 
+ /**
+  * Rasterize/execute all bins within a scene.
+  * Called per thread.
+  *
+  * Bins are pulled one at a time from the scene's bin iterator (which the
+  * caller must have started with lp_scene_bin_iter_begin()); bins that
+  * is_empty_bin() classifies as empty are skipped.
+  *
+  * \param write_depth  currently unused here
+  */
+ static void
+ rasterize_scene( struct lp_rasterizer *rast,
+                 unsigned thread_index,
+                 struct lp_scene *scene,
+                 bool write_depth )
+ {
+    struct cmd_bin *bin;
+    int x, y;
+ 
+    (void) write_depth;
+ 
+    assert(scene);
+ 
+    /* x, y are bin (tile) indices; rasterize_bin() wants pixel positions */
+    while ((bin = lp_scene_bin_iter_next(scene, &x, &y))) {
+       if (!is_empty_bin( bin ))
+          rasterize_bin( rast, thread_index, bin, x * TILE_SIZE, y * TILE_SIZE);
+    }
+ }
+ 
+ 
+ /**
+  * Called by setup module when it has something for us to render.
+  *
+  * Saves the framebuffer state and write_depth flag in the scene, then
+  * either rasterizes the scene directly (rast->num_threads == 0) or queues
+  * it and blocks until every rasterizer thread reports completion.
+  */
+ void
+ lp_rasterize_scene( struct lp_rasterizer *rast,
+                    struct lp_scene *scene,
+                    const struct pipe_framebuffer_state *fb,
+                    bool write_depth )
+ {
+    boolean debug = false;
+ 
+    LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__);
+ 
+    if (debug) {
+       unsigned x, y;
- -            printf("  bin %u, %u size: %u\n", x, y,
- -                   lp_scene_bin_size(scene, x, y));
++      debug_printf("rasterize scene:\n");
++      debug_printf("  data size: %u\n", lp_scene_data_size(scene));
+       for (y = 0; y < scene->tiles_y; y++) {
+          for (x = 0; x < scene->tiles_x; x++) {
- -static void *
- -thread_func( void *init_data )
++            debug_printf("  bin %u, %u size: %u\n", x, y,
++                         lp_scene_bin_size(scene, x, y));
+          }
+       }
+    }
+ 
+    /* save framebuffer state in the bin */
+    util_copy_framebuffer_state(&scene->fb, fb);
+    scene->write_depth = write_depth;
+ 
+    if (rast->num_threads == 0) {
+       /* no threading */
+ 
+       /* NOTE(review): the return value of lp_rast_begin() is ignored */
+       lp_rast_begin( rast, fb,
+                      fb->nr_cbufs != 0, /* always write color if cbufs present */
+                      fb->zsbuf != NULL && write_depth );
+ 
+       lp_scene_bin_iter_begin( scene );
+       rasterize_scene( rast, 0, scene, write_depth );
+ 
+       release_scene( rast, scene );
+ 
+       lp_rast_end( rast );
+    }
+    else {
+       /* threaded rendering! */
+       unsigned i;
+ 
+       /* thread 0 dequeues the scene again in thread_func() */
+       lp_scene_enqueue( rast->full_scenes, scene );
+ 
+       /* signal the threads that there's work to do */
+       for (i = 0; i < rast->num_threads; i++) {
+          pipe_semaphore_signal(&rast->tasks[i].work_ready);
+       }
+ 
+       /* wait for work to complete */
+       for (i = 0; i < rast->num_threads; i++) {
+          pipe_semaphore_wait(&rast->tasks[i].work_done);
+       }
+    }
+ 
+    LP_DBG(DEBUG_SETUP, "%s done \n", __FUNCTION__);
+ }
+ 
+ 
+ /**
+  * This is the thread's main entrypoint.
+  * It's a simple loop:
+  *   1. wait for work
+  *   2. do work
+  *   3. signal that we're done
+  *
+  * 'init_data' is the thread's struct lp_rasterizer_task.  The thread with
+  * thread_index 0 additionally dequeues the scene and maps the framebuffer
+  * before the work, and releases the scene and unmaps afterwards; two
+  * barrier waits keep the other threads in step with it.
+  * The loop never terminates.
+  */
++static PIPE_THREAD_ROUTINE( thread_func, init_data )
+ {
+    struct lp_rasterizer_task *task = (struct lp_rasterizer_task *) init_data;
+    struct lp_rasterizer *rast = task->rast;
+    boolean debug = false;
+ 
+    while (1) {
+       /* wait for work */
+       if (debug)
+          debug_printf("thread %d waiting for work\n", task->thread_index);
+       pipe_semaphore_wait(&task->work_ready);
+ 
+       if (task->thread_index == 0) {
+          /* thread[0]:
+           *  - get next scene to rasterize
+           *  - map the framebuffer surfaces
+           */
+          const struct pipe_framebuffer_state *fb;
+          boolean write_depth;
+ 
+          rast->curr_scene = lp_scene_dequeue( rast->full_scenes, TRUE );
+ 
+          lp_scene_bin_iter_begin( rast->curr_scene );
+ 
+          fb = &rast->curr_scene->fb;
+          write_depth = rast->curr_scene->write_depth;
+ 
+          /* NOTE(review): the return value of lp_rast_begin() is ignored */
+          lp_rast_begin( rast, fb,
+                         fb->nr_cbufs != 0,
+                         fb->zsbuf != NULL && write_depth );
+       }
+ 
+       /* Wait for all threads to get here so that threads[1+] don't
+        * get a null rast->curr_scene pointer.
+        */
+       pipe_barrier_wait( &rast->barrier );
+ 
+       /* do work */
+       if (debug)
+          debug_printf("thread %d doing work\n", task->thread_index);
+       rasterize_scene(rast, 
+                    task->thread_index,
+                      rast->curr_scene, 
+                    rast->curr_scene->write_depth);
+       
+       /* wait for all threads to finish with this scene */
+       pipe_barrier_wait( &rast->barrier );
+ 
+       if (task->thread_index == 0) {
+          /* thread[0]:
+           * - release the scene object
+           * - unmap the framebuffer surfaces
+           */
+          release_scene( rast, rast->curr_scene );
+          lp_rast_end( rast );
+       }
+ 
+       /* signal done with work */
+       if (debug)
+          debug_printf("thread %d done working\n", task->thread_index);
+       pipe_semaphore_signal(&task->work_done);
+    }
+ 
+    /* not reached: the loop above has no exit */
+    return NULL;
+ }
+ 
+ 
+ /**
+  * Decide how many rasterizer threads to use, then initialize each
+  * thread's semaphores and spawn it.
+  *
+  * The count defaults to the number of CPUs, can be overridden with the
+  * LP_NUM_THREADS option and is capped at MAX_THREADS.
+  */
+ static void
+ create_rast_threads(struct lp_rasterizer *rast)
+ {
+    unsigned t;
+ 
+    rast->num_threads = util_cpu_caps.nr_cpus;
+    rast->num_threads = debug_get_num_option("LP_NUM_THREADS", rast->num_threads);
+    rast->num_threads = MIN2(rast->num_threads, MAX_THREADS);
+ 
+    /* NOTE: with num_threads == 0 nothing is spawned and rendering happens
+     * in the caller's thread
+     */
+    for (t = 0; t < rast->num_threads; t++) {
+       struct lp_rasterizer_task *task = &rast->tasks[t];
+ 
+       pipe_semaphore_init(&task->work_ready, 0);
+       pipe_semaphore_init(&task->work_done, 0);
+       rast->threads[t] = pipe_thread_create(thread_func, (void *) task);
+    }
+ }
+ 
+ 
+ 
+ /**
+  * Create new lp_rasterizer.
+  * \param screen  stored in the rasterizer; used for creating and mapping
+  *                transfers
+  * \param empty  the queue to put empty scenes on after we've finished
+  *               processing them.
+  * \return the new rasterizer, or NULL if allocating the struct failed
+  */
+ struct lp_rasterizer *
+ lp_rast_create( struct pipe_screen *screen, struct lp_scene_queue *empty )
+ {
+    struct lp_rasterizer *rast;
+    unsigned i, cbuf;
+ 
+    rast = CALLOC_STRUCT(lp_rasterizer);
+    if(!rast)
+       return NULL;
+ 
+    rast->screen = screen;
+ 
+    rast->empty_scenes = empty;
+    /* NOTE(review): the result of lp_scene_queue_create() is not checked */
+    rast->full_scenes = lp_scene_queue_create();
+ 
+    /* every task slot gets its own tile storage, whether or not a thread
+     * ends up using it
+     */
+    for (i = 0; i < Elements(rast->tasks); i++) {
+       struct lp_rasterizer_task *task = &rast->tasks[i];
+ 
+       /* 4 bytes per pixel, 16-byte aligned
+        * NOTE(review): align_malloc() results are not checked
+        */
+       for (cbuf = 0; cbuf < PIPE_MAX_COLOR_BUFS; cbuf++ )
+        task->tile.color[cbuf] = align_malloc(TILE_SIZE * TILE_SIZE * 4, 16);
+ 
+       task->tile.depth = align_malloc(TILE_SIZE * TILE_SIZE * 4, 16);
+       task->rast = rast;
+       task->thread_index = i;
+    }
+ 
+    /* sets rast->num_threads, which the barrier below depends on */
+    create_rast_threads(rast);
+ 
+    /* for synchronizing rasterization threads */
+    pipe_barrier_init( &rast->barrier, rast->num_threads );
+ 
+    return rast;
+ }
+ 
+ 
+ /* Shutdown:
+  * drop the framebuffer references, free the per-task tile storage, destroy
+  * the barrier and free the rasterizer itself.
+  *
+  * NOTE(review): the worker threads, their semaphores and the full_scenes
+  * queue created in lp_rast_create() are not torn down here.
+  */
+ void lp_rast_destroy( struct lp_rasterizer *rast )
+ {
+    unsigned t;
+ 
+    util_unreference_framebuffer_state(&rast->state.fb);
+ 
+    for (t = 0; t < Elements(rast->tasks); t++) {
+       struct lp_rast_tile *tile = &rast->tasks[t].tile;
+       unsigned cbuf;
+ 
+       align_free(tile->depth);
+ 
+       for (cbuf = 0; cbuf < PIPE_MAX_COLOR_BUFS; cbuf++) {
+          align_free(tile->color[cbuf]);
+       }
+    }
+ 
+    /* for synchronizing rasterization threads */
+    pipe_barrier_destroy( &rast->barrier );
+ 
+    FREE(rast);
+ }
+ 
+ 
+ /**
+  * Return number of rasterization threads.
+  * This is the value chosen by create_rast_threads(); 0 means rendering is
+  * done without worker threads.
+  */
+ unsigned
+ lp_rast_get_num_threads( struct lp_rasterizer *rast )
+ {
+    return rast->num_threads;
+ }
diff --combined src/gallium/drivers/llvmpipe/lp_rast_priv.h

index 0000000000000000000000000000000000000000,607968e3459dee871d0a3b70af99086efb159df5..71e3a301e61014cd867de7cc3e7333b7b121bc00

mode 000000,100644..100644
--- /dev/null
--- 2/src/gallium/drivers/llvmpipe/lp_rast_priv.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast_priv.h
@@@ -1,0 -1,172 +1,172 @@@
- -#include "pipe/p_thread.h"
+ /**************************************************************************
+  *
+  * Copyright 2009 VMware, Inc.
+  * All Rights Reserved.
+  *
+  * Permission is hereby granted, free of charge, to any person obtaining a
+  * copy of this software and associated documentation files (the
+  * "Software"), to deal in the Software without restriction, including
+  * without limitation the rights to use, copy, modify, merge, publish,
+  * distribute, sub license, and/or sell copies of the Software, and to
+  * permit persons to whom the Software is furnished to do so, subject to
+  * the following conditions:
+  *
+  * The above copyright notice and this permission notice (including the
+  * next paragraph) shall be included in all copies or substantial portions
+  * of the Software.
+  *
+  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+  *
+  **************************************************************************/
+ 
+ #ifndef LP_RAST_PRIV_H
+ #define LP_RAST_PRIV_H
+ 
++#include "os/os_thread.h"
+ #include "lp_rast.h"
+ #include "lp_tile_soa.h"
+ 
+ 
+ #define MAX_THREADS 8  /* XXX probably temporary here */
+ 
+ 
+ struct pipe_transfer;
+ struct pipe_screen;
+ struct lp_rasterizer;
+ 
+ 
+ /**
+  * A tile's color and depth memory.
+  * We can choose whatever layout for the internal tile storage we prefer.
+  * lp_rast_create() allocates TILE_SIZE * TILE_SIZE * 4 bytes, 16-byte
+  * aligned, for each of the buffers below.
+  */
+ struct lp_rast_tile
+ {
+    uint8_t *color[PIPE_MAX_COLOR_BUFS];   /**< one buffer per color buffer slot */
+ 
+    uint32_t *depth;                       /**< depth storage for the tile */
+ };
+ 
+ 
+ /**
+  * Per-thread rasterization state
+  */
+ struct lp_rasterizer_task
+ {
+    struct lp_rast_tile tile;   /**< Tile color/z/stencil memory */
+ 
+    unsigned x, y;          /**< Pos of this tile in framebuffer, in pixels */
+ 
+    const struct lp_rast_state *current_state;   /**< state whose jit_function is run when shading */
+ 
+    /** "back" pointer to the rasterizer that owns this task */
+    struct lp_rasterizer *rast;
+ 
+    /** "my" index, i.e. this task's position in lp_rasterizer::tasks[] */
+    unsigned thread_index;
+ 
+    pipe_semaphore work_ready;   /**< presumably signalled when there is work for this thread -- TODO confirm */
+    pipe_semaphore work_done;    /**< presumably signalled once this thread has finished -- TODO confirm */
+ };
+ 
+ 
+ /**
+  * This is the state required while rasterizing tiles.
+  * Note that this contains per-thread information too.
+  * The tile size is TILE_SIZE x TILE_SIZE pixels.
+  */
+ struct lp_rasterizer
+ {
+    boolean clipped_tile;
+    boolean check_for_clipped_tiles;
+ 
+    /* Framebuffer stuff
+     */
+    struct pipe_screen *screen;
+    struct pipe_transfer *cbuf_transfer[PIPE_MAX_COLOR_BUFS];
+    struct pipe_transfer *zsbuf_transfer;
+    void *cbuf_map[PIPE_MAX_COLOR_BUFS];
+    void *zsbuf_map;
+ 
+    struct {
+       struct pipe_framebuffer_state fb;
+       boolean write_color;
+       boolean write_zstencil;
+       unsigned clear_color;
+       unsigned clear_depth;
+       char clear_stencil;
+    } state;
+ 
+    /** The incoming queue of scenes ready to rasterize (created by lp_rast_create()) */
+    struct lp_scene_queue *full_scenes;
+    /** The outgoing queue of processed scenes to return to the setup module */
+    struct lp_scene_queue *empty_scenes;
+ 
+    /** The scene currently being rasterized by the threads */
+    struct lp_scene *curr_scene;
+ 
+    /** A task object for each rasterization thread */
+    struct lp_rasterizer_task tasks[MAX_THREADS];
+ 
+    unsigned num_threads;   /**< value returned by lp_rast_get_num_threads() */
+    pipe_thread threads[MAX_THREADS];
+ 
+    /** For synchronizing the rasterization threads; initialized with num_threads */
+    pipe_barrier barrier;
+ };
+ 
+ 
+ void lp_rast_shade_quads( struct lp_rasterizer *rast,
+                           unsigned thread_index,
+                           const struct lp_rast_shader_inputs *inputs,
+                           unsigned x, unsigned y,
+                           int32_t c1, int32_t c2, int32_t c3);
+ 
+ 
+ /**
+  * Shade all pixels in a 4x4 block.  The fragment code omits the
+  * triangle in/out tests.
+  *
+  * The shader run is jit_function[0] of the calling thread's current
+  * state, writing into that thread's tile buffers.
+  *
+  * \param rast          the rasterizer
+  * \param thread_index  which entry of rast->tasks[] to use
+  * \param inputs        supplies the a0/dadx/dady arrays handed to the shader
+  * \param x, y location of 4x4 block in window coords
+  */
+ static INLINE void
+ lp_rast_shade_quads_all( struct lp_rasterizer *rast,
+                          unsigned thread_index,
+                          const struct lp_rast_shader_inputs *inputs,
+                          unsigned x, unsigned y )
+ {
+    struct lp_rasterizer_task *task = &rast->tasks[thread_index];
+    const struct lp_rast_state *state = task->current_state;
+    /* position of the 4x4 block inside the tile, counted in blocks */
+    const unsigned block_col = (x % TILE_SIZE) / 4;
+    const unsigned block_row = (y % TILE_SIZE) / 4;
+    /* Offset of the block within the tile, in pixels: each block holds 16
+     * pixels and a row of blocks is stepped over as 16 * 16 pixels.
+     */
+    const unsigned block_offset = block_row * (16 * 16) + block_col * 16;
+    uint8_t *color[PIPE_MAX_COLOR_BUFS];
+    void *depth;
+    unsigned i;
+ 
+    /* color buffers: only the bound ones are filled in; the byte offset
+     * is four times the pixel offset
+     */
+    for (i = 0; i < rast->state.fb.nr_cbufs; i++)
+       color[i] = task->tile.color[i] + 4 * block_offset;
+ 
+    /* depth buffer: tile.depth is a uint32_t pointer, so the offset is
+     * applied per element
+     */
+    depth = task->tile.depth + block_offset;
+ 
+    /* run shader; the INT_MIN / NULL arguments stand in for the edge
+     * function values that the in/out tests would use -- presumably,
+     * confirm against the jit_function signature
+     */
+    state->jit_function[0]( &state->jit_context,
+                            x, y,
+                            inputs->a0,
+                            inputs->dadx,
+                            inputs->dady,
+                            color,
+                            depth,
+                            INT_MIN, INT_MIN, INT_MIN,
+                            NULL, NULL, NULL );
+ }
+ 
+ 
+ #endif
diff --combined src/gallium/drivers/llvmpipe/lp_scene.c

index 0000000000000000000000000000000000000000,191122de7db60599d1c1fd37918f3a2166f7b36e..0421c506d8200740ea553f35b5ecb4d7d2208455

mode 000000,100644..100644
--- /dev/null
--- 2/src/gallium/drivers/llvmpipe/lp_scene.c
+++ b/src/gallium/drivers/llvmpipe/lp_scene.c
@@@ -1,0 -1,391 +1,392 @@@
+ /**************************************************************************
+  *
+  * Copyright 2009 VMware, Inc.
+  * All Rights Reserved.
+  *
+  * Permission is hereby granted, free of charge, to any person obtaining a
+  * copy of this software and associated documentation files (the
+  * "Software"), to deal in the Software without restriction, including
+  * without limitation the rights to use, copy, modify, merge, publish,
+  * distribute, sub license, and/or sell copies of the Software, and to
+  * permit persons to whom the Software is furnished to do so, subject to
+  * the following conditions:
+  *
+  * The above copyright notice and this permission notice (including the
+  * next paragraph) shall be included in all copies or substantial portions
+  * of the Software.
+  *
+  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+  *
+  **************************************************************************/
+ 
+ #include "util/u_math.h"
+ #include "util/u_memory.h"
++#include "util/u_inlines.h"
+ #include "util/u_simple_list.h"
+ #include "lp_scene.h"
+ 
+ 
+ /**
+  * Allocate a zero-filled scene and initialize it with lp_scene_init().
+  * \return the new scene, or NULL if the allocation failed.
+  */
+ struct lp_scene *
+ lp_scene_create(void)
+ {
+    struct lp_scene *scene = CALLOC_STRUCT(lp_scene);
+ 
+    if (!scene)
+       return NULL;
+ 
+    lp_scene_init(scene);
+    return scene;
+ }
+ 
+ 
+ void
+ lp_scene_destroy(struct lp_scene *scene)
+ {
+    lp_scene_reset(scene);           /* trim each list to one block, drop texture refs */
+    lp_scene_free_bin_data(scene);   /* free those last blocks and destroy the mutex */
+    FREE(scene);                     /* finally the scene structure itself */
+ }
+ 
+ 
+ /**
+  * Put a scene into its initial, empty state.
+  *
+  * Each of the TILES_X x TILES_Y bins gets one zeroed command block, the
+  * shared data list gets one zeroed data block, the texture reference
+  * list is made empty and the scene mutex is initialized.
+  *
+  * NOTE(review): none of the CALLOC_STRUCT() results are checked here.
+  */
+ void
+ lp_scene_init(struct lp_scene *scene)
+ {
+    struct data_block *first_data;
+    unsigned x, y;
+ 
+    for (x = 0; x < TILES_X; x++) {
+       for (y = 0; y < TILES_Y; y++) {
+          struct cmd_block_list *cmds = &lp_scene_get_bin(scene, x, y)->commands;
+          struct cmd_block *first_cmds = CALLOC_STRUCT(cmd_block);
+ 
+          /* a single block serves as both ends of the list */
+          cmds->tail = first_cmds;
+          cmds->head = first_cmds;
+       }
+    }
+ 
+    first_data = CALLOC_STRUCT(data_block);
+    scene->data.tail = first_data;
+    scene->data.head = first_data;
+ 
+    make_empty_list(&scene->textures);
+ 
+    pipe_mutex_init(scene->mutex);
+ }
+ 
+ 
+ /**
+  * Debugging helper: report whether none of the scene's bins holds a
+  * command.  All TILES_X x TILES_Y bins are examined.
+  * \return TRUE if every bin is a single command block with count 0.
+  */
+ boolean
+ lp_scene_is_empty(struct lp_scene *scene )
+ {
+    unsigned row, col;
+ 
+    for (row = 0; row < TILES_Y; row++) {
+       for (col = 0; col < TILES_X; col++) {
+          const struct cmd_block_list *cmds =
+             &lp_scene_get_bin(scene, col, row)->commands;
+ 
+          /* an empty bin is exactly one block holding no commands */
+          if (cmds->head == cmds->tail && cmds->head->count == 0)
+             continue;
+ 
+          return FALSE;
+       }
+    }
+    return TRUE;
+ }
+ 
+ 
+ /**
+  * Remove all commands from the bin at (x, y).
+  * Every command block except the tail is freed; the tail is kept as the
+  * bin's only block and its command count is set to zero.
+  */
+ void
+ lp_scene_bin_reset(struct lp_scene *scene, unsigned x, unsigned y)
+ {
+    struct cmd_block_list *cmds = &lp_scene_get_bin(scene, x, y)->commands;
+    struct cmd_block *blk = cmds->head;
+ 
+    while (blk != cmds->tail) {
+       struct cmd_block *following = blk->next;
+       FREE(blk);
+       blk = following;
+    }
+ 
+    assert(cmds->tail->next == NULL);
+    cmds->head = cmds->tail;
+    cmds->head->count = 0;
+ }
+ 
+ 
+ /**
+  * Set scene to empty state.
+  *
+  * Empties every active bin, trims the shared data list down to its last
+  * block and releases every texture reference held by the scene.
+  */
+ void
+ lp_scene_reset(struct lp_scene *scene )
+ {
+    struct data_block_list *data = &scene->data;
+    struct texture_ref *textures = &scene->textures;
+    struct data_block *dblock;
+    struct texture_ref *ref;
+    unsigned x, y;
+ 
+    /* Binner command lists: only the active bins (tiles_x by tiles_y)
+     * are visited; each keeps its last block.
+     */
+    for (x = 0; x < scene->tiles_x; x++)
+       for (y = 0; y < scene->tiles_y; y++)
+          lp_scene_bin_reset(scene, x, y);
+ 
+    assert(lp_scene_is_empty(scene));
+ 
+    /* Binned data: free every block but the tail, then mark the tail
+     * as unused.
+     */
+    dblock = data->head;
+    while (dblock != data->tail) {
+       struct data_block *following = dblock->next;
+       FREE(dblock);
+       dblock = following;
+    }
+ 
+    assert(data->tail->next == NULL);
+    data->head = data->tail;
+    data->head->used = 0;
+ 
+    /* Texture refs: drop each reference and free its list node.  The
+     * successor is fetched before the node is freed.
+     */
+    ref = textures->next;
+    while (ref != textures) {
+       struct texture_ref *following = next_elem(ref);
+       pipe_texture_reference(&ref->texture, NULL);
+       FREE(ref);
+       ref = following;
+    }
+    make_empty_list(textures);
+ }
+ 
+ 
+ /**
+  * Free all data associated with the given scene, but don't free(scene).
+  *
+  * lp_scene_reset() must already have been called, so that every bin and
+  * the shared data list are each down to a single block.  On return all
+  * head/tail pointers are NULL and the scene mutex has been destroyed.
+  */
+ void
+ lp_scene_free_bin_data(struct lp_scene *scene)
+ {
+    unsigned i, j;
+ 
+    for (i = 0; i < TILES_X; i++)
+       for (j = 0; j < TILES_Y; j++) {
+          struct cmd_bin *bin = lp_scene_get_bin(scene, i, j);
+          /* lp_scene_reset() should have been already called */
+          assert(bin->commands.head == bin->commands.tail);
+          FREE(bin->commands.head);
+          bin->commands.head = NULL;
+          bin->commands.tail = NULL;
+       }
+ 
+    /* Same expectation for the data list; clear the tail as well so it
+     * does not keep pointing at the block that was just freed.
+     */
+    assert(scene->data.head == scene->data.tail);
+    FREE(scene->data.head);
+    scene->data.head = NULL;
+    scene->data.tail = NULL;
+ 
+    pipe_mutex_destroy(scene->mutex);
+ }
+ 
+ 
+ /**
+  * Set the number of active tiles from the framebuffer size in pixels.
+  * Each dimension is rounded up to a whole number of tiles.
+  *
+  * The scene must be empty, and the size must not need more tiles than
+  * the scene's bin array provides (TILES_X by TILES_Y).
+  */
+ void
+ lp_scene_set_framebuffer_size( struct lp_scene *scene,
+                                unsigned width, unsigned height )
+ {
+    assert(lp_scene_is_empty(scene));
+ 
+    scene->tiles_x = align(width, TILE_SIZE) / TILE_SIZE;
+    scene->tiles_y = align(height, TILE_SIZE) / TILE_SIZE;
+ 
+    /* bins are indexed up to tiles_x/tiles_y, so these must fit the array */
+    assert(scene->tiles_x <= TILES_X);
+    assert(scene->tiles_y <= TILES_Y);
+ }
+ 
+ 
+ /**
+  * Append a new, empty command block to the tail of 'list'.
+  *
+  * NOTE(review): the MALLOC_STRUCT() result is used without a NULL check.
+  */
+ void
+ lp_bin_new_cmd_block( struct cmd_block_list *list )
+ {
+    struct cmd_block *fresh = MALLOC_STRUCT(cmd_block);
+ 
+    /* only the bookkeeping fields are set here */
+    fresh->count = 0;
+    fresh->next = NULL;
+ 
+    list->tail->next = fresh;
+    list->tail = fresh;
+ }
+ 
+ 
+ /**
+  * Append a new, unused data block to the tail of 'list'.
+  *
+  * NOTE(review): the MALLOC_STRUCT() result is used without a NULL check.
+  */
+ void
+ lp_bin_new_data_block( struct data_block_list *list )
+ {
+    struct data_block *fresh = MALLOC_STRUCT(data_block);
+ 
+    /* only the bookkeeping fields are set here */
+    fresh->used = 0;
+    fresh->next = NULL;
+ 
+    list->tail->next = fresh;
+    list->tail = fresh;
+ }
+ 
+ 
+ /**
+  * Return number of bytes used for all bin data within a scene, i.e. the
+  * sum of the 'used' field over every block of the scene's data list.
+  */
+ unsigned
+ lp_scene_data_size( const struct lp_scene *scene )
+ {
+    const struct data_block *blk = scene->data.head;
+    unsigned total = 0;
+ 
+    while (blk) {
+       total += blk->used;
+       blk = blk->next;
+    }
+    return total;
+ }
+ 
+ 
+ /**
+  * Return number of bytes used for a single bin: the number of commands
+  * stored in the bin at (x, y) times the storage taken by one command.
+  */
+ unsigned
+ lp_scene_bin_size( const struct lp_scene *scene, unsigned x, unsigned y )
+ {
+    /* one command is a function pointer plus its argument union */
+    const size_t bytes_per_cmd =
+       sizeof(lp_rast_cmd) + sizeof(union lp_rast_cmd_arg);
+    /* lp_scene_get_bin() takes a non-const scene, hence the cast */
+    const struct cmd_bin *bin = lp_scene_get_bin((struct lp_scene *) scene, x, y);
+    const struct cmd_block *blk = bin->commands.head;
+    unsigned total = 0;
+ 
+    while (blk) {
+       total += blk->count * bytes_per_cmd;
+       blk = blk->next;
+    }
+    return total;
+ }
+ 
+ 
+ /**
+  * Add a reference to a texture by the scene.
+  * A new node holding the reference is appended to the scene's texture
+  * list.  If the node cannot be allocated, nothing is recorded.
+  */
+ void
+ lp_scene_texture_reference( struct lp_scene *scene,
+                             struct pipe_texture *texture )
+ {
+    struct texture_ref *list = &scene->textures;
+    struct texture_ref *entry = CALLOC_STRUCT(texture_ref);
+ 
+    if (!entry)
+       return;
+ 
+    pipe_texture_reference(&entry->texture, texture);
+    insert_at_tail(list, entry);
+ }
+ 
+ 
+ /**
+  * Does this scene have a reference to the given texture?
+  * \return TRUE if any node of the scene's texture list points at
+  *         'texture', FALSE otherwise.
+  */
+ boolean
+ lp_scene_is_textured_referenced( const struct lp_scene *scene,
+                                  const struct pipe_texture *texture )
+ {
+    /* scene->textures is the list's sentinel node */
+    const struct texture_ref *sentinel = &scene->textures;
+    const struct texture_ref *node = sentinel->next;
+ 
+    while (node != sentinel) {
+       if (node->texture == texture)
+          return TRUE;
+       node = node->next;
+    }
+    return FALSE;
+ }
+ 
+ 
+ /**
+  * Return last command in the bin, taken from the tail command block.
+  * \return the most recently stored command, or NULL when the tail block
+  *         holds no commands.
+  */
+ static lp_rast_cmd
+ lp_get_last_command( const struct cmd_bin *bin )
+ {
+    const struct cmd_block *tail = bin->commands.tail;
+ 
+    if (tail->count == 0)
+       return NULL;
+ 
+    return tail->cmd[tail->count - 1];
+ }
+ 
+ 
+ /**
+  * Replace the arg of the last command in the bin.
+  * The tail command block must hold at least one command.
+  */
+ static void
+ lp_replace_last_command_arg( struct cmd_bin *bin,
+                              const union lp_rast_cmd_arg arg )
+ {
+    struct cmd_block *tail = bin->commands.tail;
+ 
+    /* there has to be a last command to patch */
+    assert(tail->count > 0);
+    tail->arg[tail->count - 1] = arg;
+ }
+ 
+ 
+ 
+ /**
+  * Put a state-change command into all active bins.
+  * Where the last command of a bin is already 'cmd', only that command's
+  * argument is overwritten; otherwise the command is appended to the bin.
+  */
+ void
+ lp_scene_bin_state_command( struct lp_scene *scene,
+                             lp_rast_cmd cmd,
+                             const union lp_rast_cmd_arg arg )
+ {
+    unsigned x, y;
+ 
+    for (x = 0; x < scene->tiles_x; x++) {
+       for (y = 0; y < scene->tiles_y; y++) {
+          struct cmd_bin *bin = lp_scene_get_bin(scene, x, y);
+ 
+          if (lp_get_last_command(bin) != cmd) {
+             /* different (or no) previous command: append */
+             lp_scene_bin_command( scene, x, y, cmd, arg );
+          }
+          else {
+             /* same command again: the newer argument supersedes */
+             lp_replace_last_command_arg(bin, arg);
+          }
+       }
+    }
+ }
+ 
+ 
+ /**
+  * Advance curr_x,y to the next bin: step in x, wrapping to x = 0 of the
+  * next y once x reaches tiles_x.
+  * \return FALSE when curr_y has gone past the last row (no more bins),
+  *         TRUE otherwise.
+  */
+ static boolean
+ next_bin(struct lp_scene *scene)
+ {
+    if (++scene->curr_x >= scene->tiles_x) {
+       scene->curr_x = 0;
+       scene->curr_y++;
+    }
+ 
+    return (scene->curr_y < scene->tiles_y) ? TRUE : FALSE;
+ }
+ 
+ 
+ /**
+  * Start iterating over the scene's bins.
+  * A negative curr_x is what lp_scene_bin_iter_next() recognizes as
+  * "no bin handed out yet".
+  */
+ void
+ lp_scene_bin_iter_begin( struct lp_scene *scene )
+ {
+    scene->curr_y = -1;
+    scene->curr_x = -1;
+ }
+ 
+ 
+ /**
+  * Return pointer to next bin to be rendered.
+  * The lp_scene::curr_x and ::curr_y fields will be advanced.
+  * Multiple rendering threads will call this function to get a chunk
+  * of work (a bin) to work on; the scene mutex is held while the
+  * position is advanced.
+  *
+  * \param bin_x, bin_y  receive the position of the returned bin; they
+  *                      are left untouched when NULL is returned
+  * \return the next bin, or NULL when there are no more bins (which
+  *         includes a scene with no active tiles at all)
+  */
+ struct cmd_bin *
+ lp_scene_bin_iter_next( struct lp_scene *scene, int *bin_x, int *bin_y )
+ {
+    struct cmd_bin *bin = NULL;
+    boolean have_bin;
+ 
+    pipe_mutex_lock(scene->mutex);
+ 
+    if (scene->curr_x < 0) {
+       /* first bin -- provided the scene has any active bins; a scene
+        * with zero tiles in either dimension has nothing to hand out
+        */
+       scene->curr_x = 0;
+       scene->curr_y = 0;
+       have_bin = (scene->tiles_x > 0 && scene->tiles_y > 0) ? TRUE : FALSE;
+    }
+    else {
+       have_bin = next_bin(scene);
+    }
+ 
+    if (have_bin) {
+       bin = lp_scene_get_bin(scene, scene->curr_x, scene->curr_y);
+       *bin_x = scene->curr_x;
+       *bin_y = scene->curr_y;
+    }
+ 
+    pipe_mutex_unlock(scene->mutex);
+    return bin;
+ }
diff --combined src/gallium/drivers/llvmpipe/lp_scene.h

index 0000000000000000000000000000000000000000,86facf8eac2a531f757a6876a05eda147b7fef3d..7db2165cf144a2944452042805062588eb5e4dcd

mode 000000,100644..100644
--- /dev/null
--- 2/src/gallium/drivers/llvmpipe/lp_scene.h
+++ b/src/gallium/drivers/llvmpipe/lp_scene.h
@@@ -1,0 -1,301 +1,301 @@@
- -#include "pipe/p_thread.h"
+ /**************************************************************************
+  *
+  * Copyright 2009 VMware, Inc.
+  * All Rights Reserved.
+  *
+  * Permission is hereby granted, free of charge, to any person obtaining a
+  * copy of this software and associated documentation files (the
+  * "Software"), to deal in the Software without restriction, including
+  * without limitation the rights to use, copy, modify, merge, publish,
+  * distribute, sub license, and/or sell copies of the Software, and to
+  * permit persons to whom the Software is furnished to do so, subject to
+  * the following conditions:
+  *
+  * The above copyright notice and this permission notice (including the
+  * next paragraph) shall be included in all copies or substantial portions
+  * of the Software.
+  *
+  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+  *
+  **************************************************************************/
+ 
+ 
+ /**
+  * Binner data structures and bin-related functions.
+  * Note: the "setup" code is concerned with building scenes while
+  * The "rast" code is concerned with consuming/executing scenes.
+  */
+ 
+ #ifndef LP_SCENE_H
+ #define LP_SCENE_H
+ 
++#include "os/os_thread.h"
+ #include "lp_tile_soa.h"
+ #include "lp_rast.h"
+ 
+ 
+ /* We're limited to 2K by 2K for 32bit fixed point rasterization.
+  * Will need a 64-bit version for larger framebuffers.
+  */
+ #define MAXHEIGHT 2048
+ #define MAXWIDTH 2048
+ #define TILES_X (MAXWIDTH / TILE_SIZE)
+ #define TILES_Y (MAXHEIGHT / TILE_SIZE)
+ 
+ 
+ #define CMD_BLOCK_MAX 128
+ #define DATA_BLOCK_SIZE (16 * 1024 - sizeof(unsigned) - sizeof(void *))
+    
+ 
+ 
+ /* switch to a non-pointer value for this:
+  */
+ typedef void (*lp_rast_cmd)( struct lp_rasterizer *,
+                              unsigned thread_index,
+                              const union lp_rast_cmd_arg );
+ 
+ struct cmd_block {
+    lp_rast_cmd cmd[CMD_BLOCK_MAX];             /* command i is cmd[i] ... */
+    union lp_rast_cmd_arg arg[CMD_BLOCK_MAX];   /* ... together with its argument arg[i] */
+    unsigned count;                             /* number of cmd[]/arg[] slots in use */
+    struct cmd_block *next;                     /* next block in the list, NULL for the tail */
+ };
+ 
+ struct data_block {
+    ubyte data[DATA_BLOCK_SIZE];   /* storage handed out by lp_scene_alloc() and friends */
+    unsigned used;                 /* number of bytes of data[] handed out so far */
+    struct data_block *next;       /* next block in the list, NULL for the tail */
+ };
+ 
+ struct cmd_block_list {
+    struct cmd_block *head;   /* first block of the list */
+    struct cmd_block *tail;   /* last block; new commands are stored here */
+ };
+ 
+ /**
+  * For each screen tile we have one of these bins.
+  */
+ struct cmd_bin {
+    struct cmd_block_list commands;   /* the commands binned for this tile */
+ };
+    
+ 
+ /**
+  * This stores bulk data which is shared by all bins within a scene.
+  * Examples include triangle data and state data.  The commands in
+  * the per-tile bins will point to chunks of data in this structure.
+  */
+ struct data_block_list {
+    struct data_block *head;   /* first block of the list */
+    struct data_block *tail;   /* last block; allocations are carved from here */
+ };
+ 
+ 
+ /** List of texture references */
+ struct texture_ref {
+    struct pipe_texture *texture;     /**< set and cleared through pipe_texture_reference() */
+    struct texture_ref *prev, *next;  /**< linked list w/ u_simple_list.h */
+ };
+ 
+ 
+ /**
+  * All bins and bin data are contained here.
+  * Per-bin data goes into the 'tile' bins.
+  * Shared data goes into the 'data' buffer.
+  *
+  * When there are multiple threads, will want to double-buffer between
+  * scenes:
+  */
+ struct lp_scene {
+    struct cmd_bin tile[TILES_X][TILES_Y];   /**< indexed [x][y], see lp_scene_get_bin() */
+    struct data_block_list data;
+ 
+    /** the framebuffer to render the scene into */
+    struct pipe_framebuffer_state fb;
+ 
+    /** list of textures referenced by the scene commands (this member is the list's sentinel node) */
+    struct texture_ref textures;
+ 
+    boolean write_depth;
+ 
+    /**
+     * Number of active tiles in each dimension.
+     * This is basically the framebuffer size divided by tile size,
+     * rounded up (see lp_scene_set_framebuffer_size()).
+     */
+    unsigned tiles_x, tiles_y;
+ 
+    int curr_x, curr_y;  /**< for iterating over bins; -1 after lp_scene_bin_iter_begin() */
+    pipe_mutex mutex;    /**< held by lp_scene_bin_iter_next() while advancing curr_x/curr_y */
+ };
+ 
+ 
+ 
+ struct lp_scene *lp_scene_create(void);
+ 
+ void lp_scene_destroy(struct lp_scene *scene);
+ 
+ 
+ void lp_scene_init(struct lp_scene *scene);
+ 
+ boolean lp_scene_is_empty(struct lp_scene *scene );
+ 
+ void lp_scene_reset(struct lp_scene *scene );
+ 
+ void lp_scene_free_bin_data(struct lp_scene *scene);
+ 
+ void lp_scene_set_framebuffer_size( struct lp_scene *scene,
+                                   unsigned width, unsigned height );
+ 
+ void lp_bin_new_data_block( struct data_block_list *list );
+ 
+ void lp_bin_new_cmd_block( struct cmd_block_list *list );
+ 
+ unsigned lp_scene_data_size( const struct lp_scene *scene );
+ 
+ unsigned lp_scene_bin_size( const struct lp_scene *scene, unsigned x, unsigned y );
+ 
+ void lp_scene_texture_reference( struct lp_scene *scene,
+                                  struct pipe_texture *texture );
+ 
+ boolean lp_scene_is_textured_referenced( const struct lp_scene *scene,
+                                          const struct pipe_texture *texture );
+ 
+ 
+ /**
+  * Allocate space for a command/data in the bin's data buffer.
+  * Grow the block list if needed.
+  *
+  * \param size  number of bytes wanted; nothing here checks it against
+  *              DATA_BLOCK_SIZE
+  * \return pointer to 'size' bytes inside the tail data block
+  */
+ static INLINE void *
+ lp_scene_alloc( struct lp_scene *scene, unsigned size)
+ {
+    struct data_block_list *list = &scene->data;
+    struct data_block *block;
+    ubyte *result;
+ 
+    /* start a fresh block when the request does not fit in the tail */
+    if (list->tail->used + size > DATA_BLOCK_SIZE)
+       lp_bin_new_data_block( list );
+ 
+    block = list->tail;
+    result = block->data + block->used;
+    block->used += size;
+    return result;
+ }
+ 
+ 
+ /**
+  * As lp_scene_alloc(), but with specific alignment.
+  *
+  * \param size       number of bytes wanted
+  * \param alignment  required alignment of the returned address; the
+  *                   mask arithmetic below assumes a power of two
+  * \return aligned pointer to 'size' bytes inside the tail data block
+  */
+ static INLINE void *
+ lp_scene_alloc_aligned( struct lp_scene *scene, unsigned size,
+                       unsigned alignment )
+ {
+    struct data_block_list *list = &scene->data;
+    struct data_block *block;
+    ubyte *base;
+    unsigned padding;
+ 
+    /* budget for the worst-case padding when deciding whether it fits */
+    if (list->tail->used + size + alignment - 1 > DATA_BLOCK_SIZE)
+       lp_bin_new_data_block( list );
+ 
+    block = list->tail;
+    base = block->data + block->used;
+ 
+    /* bytes to skip so that base + padding is a multiple of 'alignment' */
+    padding = (((uintptr_t)base + alignment - 1) & ~(alignment - 1)) - (uintptr_t)base;
+ 
+    block->used += padding + size;
+    return base + padding;
+ }
+ 
+ 
+ /**
+  * Put back data if we decide not to use it, eg. culled triangles.
+  * The tail data block's 'used' count is simply reduced by 'size'.
+  */
+ static INLINE void
+ lp_scene_putback_data( struct lp_scene *scene, unsigned size)
+ {
+    struct data_block *tail = scene->data.tail;
+ 
+    assert(tail->used >= size);
+    tail->used -= size;
+ }
+ 
+ 
+ /** Return pointer to a particular tile's bin.  No bounds checking is done on x or y. */
+ static INLINE struct cmd_bin *
+ lp_scene_get_bin(struct lp_scene *scene, unsigned x, unsigned y)
+ {
+    return &scene->tile[x][y];
+ }
+ 
+ 
+ /** Remove all commands from a bin */
+ void
+ lp_scene_bin_reset(struct lp_scene *scene, unsigned x, unsigned y);
+ 
+ 
+ /**
+  * Add a command to bin[x][y].
+  * The command and its argument go into the next free slot of the bin's
+  * tail command block; a new block is appended first if the tail is full.
+  * (x, y) must lie within the active tiles.
+  */
+ static INLINE void
+ lp_scene_bin_command( struct lp_scene *scene,
+                 unsigned x, unsigned y,
+                 lp_rast_cmd cmd,
+                 union lp_rast_cmd_arg arg )
+ {
+    struct cmd_block_list *list = &lp_scene_get_bin(scene, x, y)->commands;
+    struct cmd_block *block;
+ 
+    assert(x < scene->tiles_x);
+    assert(y < scene->tiles_y);
+ 
+    if (list->tail->count == CMD_BLOCK_MAX)
+       lp_bin_new_cmd_block( list );
+ 
+    block = list->tail;
+    block->cmd[block->count] = cmd;
+    block->arg[block->count] = arg;
+    block->count++;
+ }
+ 
+ 
+ /**
+  * Add a command to all active bins, i.e. every bin (x, y) with
+  * x < tiles_x and y < tiles_y.
+  */
+ static INLINE void
+ lp_scene_bin_everywhere( struct lp_scene *scene,
+                        lp_rast_cmd cmd,
+                        const union lp_rast_cmd_arg arg )
+ {
+    unsigned x, y;
+ 
+    for (x = 0; x < scene->tiles_x; x++) {
+       for (y = 0; y < scene->tiles_y; y++) {
+          lp_scene_bin_command( scene, x, y, cmd, arg );
+       }
+    }
+ }
+ 
+ 
+ void
+ lp_scene_bin_state_command( struct lp_scene *scene,
+                           lp_rast_cmd cmd,
+                           const union lp_rast_cmd_arg arg );
+ 
+ 
+ static INLINE unsigned
+ lp_scene_get_num_bins( const struct lp_scene *scene )
+ {
+    return scene->tiles_x * scene->tiles_y;   /* active bins only, not TILES_X * TILES_Y */
+ }
+ 
+ 
+ void
+ lp_scene_bin_iter_begin( struct lp_scene *scene );
+ 
+ struct cmd_bin *
+ lp_scene_bin_iter_next( struct lp_scene *scene, int *bin_x, int *bin_y );
+ 
+ 
+ #endif /* LP_SCENE_H */
diff --combined src/gallium/drivers/llvmpipe/lp_setup.c

index 92baa980bcc22ea8e12223432738c849f212c9af,f8fc912fa1bfb336668f64ce348401f0598807cb..2e3ef4ae5cda8bf351a8ccf3515018faa2d5da7a
--- 1/src/gallium/drivers/llvmpipe/lp_setup.c
--- 2/src/gallium/drivers/llvmpipe/lp_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup.c
@@@ -26,1477 -26,704 +26,704 @@@
    **************************************************************************/
   
   /**
-  * \brief  Primitive rasterization/rendering (points, lines, triangles)
+  * Tiling engine.
    *
-  * \author  Keith Whitwell <keith@tungstengraphics.com>
-  * \author  Brian Paul
+  * Builds per-tile display lists and executes them on calls to
+  * lp_setup_flush().
    */
   
- #include "lp_context.h"
- #include "lp_quad.h"
- #include "lp_setup.h"
- #include "lp_state.h"
- #include "draw/draw_context.h"
- #include "draw/draw_vertex.h"
- #include "pipe/p_shader_tokens.h"
- #include "util/u_format.h"
- #include "util/u_math.h"
+ #include "pipe/p_defines.h"
- -#include "pipe/p_inlines.h"
++#include "util/u_inlines.h"
   #include "util/u_memory.h"
- #include "lp_bld_debug.h"
- #include "lp_tile_cache.h"
- #include "lp_tile_soa.h"
- 
- 
- #define DEBUG_VERTS 0
- #define DEBUG_FRAGS 0
- 
- /**
-  * Triangle edge info
-  */
- struct edge {
-    float dx;          /**< X(v1) - X(v0), used only during setup */
-    float dy;          /**< Y(v1) - Y(v0), used only during setup */
-    float dxdy;                /**< dx/dy */
-    float sx, sy;      /**< first sample point coord */
-    int lines;         /**< number of lines on this edge */
- };
- 
- 
- #define MAX_QUADS 16
- 
+ #include "util/u_pack_color.h"
+ #include "util/u_surface.h"
+ #include "lp_scene.h"
+ #include "lp_scene_queue.h"
+ #include "lp_buffer.h"
+ #include "lp_texture.h"
+ #include "lp_debug.h"
+ #include "lp_fence.h"
+ #include "lp_rast.h"
+ #include "lp_setup_context.h"
   
- /**
-  * Triangle setup info (derived from draw_stage).
-  * Also used for line drawing (taking some liberties).
-  */
- struct setup_context {
-    struct llvmpipe_context *llvmpipe;
- 
-    /* Vertices are just an array of floats making up each attribute in
-     * turn.  Currently fixed at 4 floats, but should change in time.
-     * Codegen will help cope with this.
-     */
-    const float (*vmax)[4];
-    const float (*vmid)[4];
-    const float (*vmin)[4];
-    const float (*vprovoke)[4];
- 
-    struct edge ebot;
-    struct edge etop;
-    struct edge emaj;
- 
-    float oneoverarea;
-    int facing;
+ #include "draw/draw_context.h"
+ #include "draw/draw_vbuf.h"
   
-    float pixel_offset;
   
-    struct quad_header quad[MAX_QUADS];
-    struct quad_header *quad_ptrs[MAX_QUADS];
-    unsigned count;
+ static void set_scene_state( struct setup_context *, unsigned );
   
-    struct quad_interp_coef coef;
   
-    struct {
-       int left[2];   /**< [0] = row0, [1] = row1 */
-       int right[2];
-       int y;
-    } span;
+ struct lp_scene *
+ lp_setup_get_current_scene(struct setup_context *setup)
+ {
+    if (!setup->scene) {
   
- #if DEBUG_FRAGS
-    uint numFragsEmitted;  /**< per primitive */
-    uint numFragsWritten;  /**< per primitive */
- #endif
+       /* wait for a free/empty scene
+        */
+       setup->scene = lp_scene_dequeue(setup->empty_scenes, TRUE);
   
-    unsigned winding;          /* which winding to cull */
- };
+       if(0)lp_scene_reset( setup->scene ); /* XXX temporary? */
   
+       lp_scene_set_framebuffer_size(setup->scene,
+                                     setup->fb.width, 
+                                     setup->fb.height);
+    }
+    return setup->scene;
+ }
   
   
- /**
-  * Execute fragment shader for the four fragments in the quad.
-  */
- PIPE_ALIGN_STACK
   static void
- shade_quads(struct llvmpipe_context *llvmpipe,
-             struct quad_header *quads[],
-             unsigned nr)
+ first_triangle( struct setup_context *setup,
+                 const float (*v0)[4],
+                 const float (*v1)[4],
+                 const float (*v2)[4])
   {
-    struct lp_fragment_shader *fs = llvmpipe->fs;
-    struct quad_header *quad = quads[0];
-    const unsigned x = quad->input.x0;
-    const unsigned y = quad->input.y0;
-    uint8_t *tile;
-    uint8_t *color;
-    void *depth;
-    PIPE_ALIGN_VAR(16) uint32_t mask[4][NUM_CHANNELS];
-    unsigned chan_index;
-    unsigned q;
- 
-    assert(fs->current);
-    if(!fs->current)
-       return;
- 
-    /* Sanity checks */
-    assert(nr * QUAD_SIZE == TILE_VECTOR_HEIGHT * TILE_VECTOR_WIDTH);
-    assert(x % TILE_VECTOR_WIDTH == 0);
-    assert(y % TILE_VECTOR_HEIGHT == 0);
-    for (q = 0; q < nr; ++q) {
-       assert(quads[q]->input.x0 == x + q*2);
-       assert(quads[q]->input.y0 == y);
-    }
- 
-    /* mask */
-    for (q = 0; q < 4; ++q)
-       for (chan_index = 0; chan_index < NUM_CHANNELS; ++chan_index)
-          mask[q][chan_index] = quads[q]->inout.mask & (1 << chan_index) ? ~0 : 0;
+    set_scene_state( setup, SETUP_ACTIVE );
+    lp_setup_choose_triangle( setup );
+    setup->triangle( setup, v0, v1, v2 );
+ }
   
-    /* color buffer */
-    if(llvmpipe->framebuffer.nr_cbufs >= 1 &&
-       llvmpipe->framebuffer.cbufs[0]) {
-       tile = lp_get_cached_tile(llvmpipe->cbuf_cache[0], x, y);
-       color = &TILE_PIXEL(tile, x & (TILE_SIZE-1), y & (TILE_SIZE-1), 0);
-    }
-    else
-       color = NULL;
- 
-    /* depth buffer */
-    if(llvmpipe->zsbuf_map) {
-       assert((x % 2) == 0);
-       assert((y % 2) == 0);
-       depth = llvmpipe->zsbuf_map +
-               y*llvmpipe->zsbuf_transfer->stride +
-               2*x*util_format_get_blocksize(llvmpipe->zsbuf_transfer->texture->format);
-    }
-    else
-       depth = NULL;
- 
-    /* XXX: This will most likely fail on 32bit x86 without -mstackrealign */
-    assert(lp_check_alignment(mask, 16));
- 
-    assert(lp_check_alignment(depth, 16));
-    assert(lp_check_alignment(color, 16));
-    assert(lp_check_alignment(llvmpipe->jit_context.blend_color, 16));
- 
-    /* run shader */
-    fs->current->jit_function( &llvmpipe->jit_context,
-                               x, y,
-                               quad->coef->a0,
-                               quad->coef->dadx,
-                               quad->coef->dady,
-                               &mask[0][0],
-                               color,
-                               depth);
+ static void
+ first_line( struct setup_context *setup,
+           const float (*v0)[4],
+           const float (*v1)[4])
+ {
+    set_scene_state( setup, SETUP_ACTIVE );   /* enter the SETUP_ACTIVE state before anything is drawn */
+    lp_setup_choose_line( setup );            /* presumably installs the real setup->line -- TODO confirm */
+    setup->line( setup, v0, v1 );             /* hand this first line to whatever setup->line is now */
  }
   
+ static void
+ first_point( struct setup_context *setup,
+            const float (*v0)[4])
+ {
+    set_scene_state( setup, SETUP_ACTIVE );   /* enter the SETUP_ACTIVE state before anything is drawn */
+    lp_setup_choose_point( setup );           /* presumably installs the real setup->point -- TODO confirm */
+    setup->point( setup, v0 );                /* hand this first point to whatever setup->point is now */
+ }
   
+ static void reset_context( struct setup_context *setup )
+ { /* Return the setup context to its idle condition: forget the stored
+    * constants / fragment-shader state, drop the current scene and any
+    * pending clear flags, and reinstall the first_* entry points.
+    */
+    LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__);
  
+    /* Reset derived state */
+    setup->constants.stored_size = 0;
+    setup->constants.stored_data = NULL;
+    setup->fs.stored = NULL;
+    setup->dirty = ~0;   /* every dirty bit set */
  
- /**
-  * Do triangle cull test using tri determinant (sign indicates orientation)
-  * \return true if triangle is to be culled.
-  */
- static INLINE boolean
- cull_tri(const struct setup_context *setup, float det)
- {
-    if (det != 0) {   
-       /* if (det < 0 then Z points toward camera and triangle is 
-        * counter-clockwise winding.
-        */
-       unsigned winding = (det < 0) ? PIPE_WINDING_CCW : PIPE_WINDING_CW;
+    /* no current scene */
+    setup->scene = NULL;
  
-       if ((winding & setup->winding) == 0)
-        return FALSE;
-    }
+    /* Reset some state: no clears are pending any more.
+     */
+    setup->clear.flags = 0;
  
-    /* Culled:
+    /* Have an explicit "start-binning" call and get rid of this
+     * pointer twiddling?
       */
-    return TRUE;
+    setup->line = first_line;
+    setup->point = first_point;
+    setup->triangle = first_triangle;
   }
   
   
- 
- /**
-  * Clip setup->quad against the scissor/surface bounds.
-  */
- static INLINE void
- quad_clip( struct setup_context *setup, struct quad_header *quad )
+ /**
+  * Rasterize all scene's bins.
+  *
+  * Passes the current scene, the bound framebuffer state (setup->fb) and
+  * \p write_depth to lp_rasterize_scene(), then calls reset_context(),
+  * which clears setup->scene and reinstalls the first_* entry points.
+  */
+ static void
+ lp_setup_rasterize_scene( struct setup_context *setup,
+                        boolean write_depth )
   {
-    const struct pipe_scissor_state *cliprect = &setup->llvmpipe->cliprect;
-    const int minx = (int) cliprect->minx;
-    const int maxx = (int) cliprect->maxx;
-    const int miny = (int) cliprect->miny;
-    const int maxy = (int) cliprect->maxy;
- 
-    if (quad->input.x0 >= maxx ||
-        quad->input.y0 >= maxy ||
-        quad->input.x0 + 1 < minx ||
-        quad->input.y0 + 1 < miny) {
-       /* totally clipped */
-       quad->inout.mask = 0x0;
-       return;
-    }
-    if (quad->input.x0 < minx)
-       quad->inout.mask &= (MASK_BOTTOM_RIGHT | MASK_TOP_RIGHT);
-    if (quad->input.y0 < miny)
-       quad->inout.mask &= (MASK_BOTTOM_LEFT | MASK_BOTTOM_RIGHT);
-    if (quad->input.x0 == maxx - 1)
-       quad->inout.mask &= (MASK_BOTTOM_LEFT | MASK_TOP_LEFT);
-    if (quad->input.y0 == maxy - 1)
-       quad->inout.mask &= (MASK_TOP_LEFT | MASK_TOP_RIGHT);
- }
+    struct lp_scene *scene = lp_setup_get_current_scene(setup);
  
+    lp_rasterize_scene(setup->rast,
+                       scene,
+                       &setup->fb,
+                       write_depth);
  
+    reset_context( setup );   /* setup->scene is NULL from here on */
  
- /**
-  * Given an X or Y coordinate, return the block/quad coordinate that it
-  * belongs to.
-  */
- static INLINE int block( int x )
- {
-    return x & ~(2-1);
+    LP_DBG(DEBUG_SETUP, "%s done \n", __FUNCTION__);
   }
   
- static INLINE int block_x( int x )
+ 
+ 
+ static void
+ begin_binning( struct setup_context *setup )
   {
-    return x & ~(TILE_VECTOR_WIDTH - 1);
+    struct lp_scene *scene = lp_setup_get_current_scene(setup);
+    /* Start a scene: for the colour buffers and for the depth/stencil
+     * buffer (when bound) pass either a clear command -- if the matching
+     * bit is pending in setup->clear.flags -- or a load command to
+     * lp_scene_bin_everywhere().
+     */
+    LP_DBG(DEBUG_SETUP, "%s color: %s depth: %s\n", __FUNCTION__,
+           (setup->clear.flags & PIPE_CLEAR_COLOR) ? "clear": "load",
+           (setup->clear.flags & PIPE_CLEAR_DEPTHSTENCIL) ? "clear": "load");
+    /* colour: only when at least one colour buffer is bound */
+    if (setup->fb.nr_cbufs) {
+       if (setup->clear.flags & PIPE_CLEAR_COLOR)
+          lp_scene_bin_everywhere( scene, 
+                                 lp_rast_clear_color, 
+                                 setup->clear.color );
+       else
+          lp_scene_bin_everywhere( scene,
+                                 lp_rast_load_color,
+                                 lp_rast_arg_null() );
+    }
+    /* depth/stencil: only when a zsbuf is bound */
+    if (setup->fb.zsbuf) {
+       if (setup->clear.flags & PIPE_CLEAR_DEPTHSTENCIL)
+          lp_scene_bin_everywhere( scene, 
+                                 lp_rast_clear_zstencil, 
+                                 setup->clear.zstencil );
+       else
+          lp_scene_bin_everywhere( scene,
+                                 lp_rast_load_zstencil,
+                                 lp_rast_arg_null() );
+    }
+ 
+    LP_DBG(DEBUG_SETUP, "%s done\n", __FUNCTION__);
   }
   
   
- /**
-  * Emit a quad (pass to next stage) with clipping.
+ /* This basically bins and then flushes any outstanding full-screen
+  * clears.  
+  *
+  * Implemented as begin_binning() followed by
+  * lp_setup_rasterize_scene() with write_depth == TRUE; called from
+  * set_scene_state() on the SETUP_CLEARED -> SETUP_FLUSHED transition.
+  *
+  * TODO: fast path for fullscreen clears and no triangles.
    */
- static INLINE void
- clip_emit_quad( struct setup_context *setup, struct quad_header *quad )
+ static void
+ execute_clears( struct setup_context *setup )
   {
-    quad_clip( setup, quad );
- 
-    if (quad->inout.mask) {
-       struct llvmpipe_context *lp = setup->llvmpipe;
- 
- #if 1
-       /* XXX: The blender expects 4 quads. This is far from efficient, but
-        * until we codegenerate single-quad variants of the fragment pipeline
-        * we need this hack. */
-       const unsigned nr_quads = TILE_VECTOR_HEIGHT*TILE_VECTOR_WIDTH/QUAD_SIZE;
-       struct quad_header quads[4];
-       struct quad_header *quad_ptrs[4];
-       int x0 = block_x(quad->input.x0);
-       unsigned i;
- 
-       assert(nr_quads == 4);
- 
-       for(i = 0; i < nr_quads; ++i) {
-          int x = x0 + 2*i;
-          if(x == quad->input.x0)
-             memcpy(&quads[i], quad, sizeof quads[i]);
-          else {
-             memset(&quads[i], 0, sizeof quads[i]);
-             quads[i].input.x0 = x;
-             quads[i].input.y0 = quad->input.y0;
-             quads[i].coef = quad->coef;
-          }
-          quad_ptrs[i] = &quads[i];
-       }
+    LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__);
  
-       shade_quads( lp, quad_ptrs, nr_quads );
- #else
-       shade_quads( lp, &quad, 1 );
- #endif
-    }
+    begin_binning( setup );
+    lp_setup_rasterize_scene( setup, TRUE );
   }
   
   
- /**
-  * Render a horizontal span of quads
-  */
- static void flush_spans( struct setup_context *setup )
+ static void
+ set_scene_state( struct setup_context *setup,
+            unsigned new_state )
   {
-    const int step = TILE_VECTOR_WIDTH;
-    const int xleft0 = setup->span.left[0];
-    const int xleft1 = setup->span.left[1];
-    const int xright0 = setup->span.right[0];
-    const int xright1 = setup->span.right[1];
- 
- 
-    int minleft = block_x(MIN2(xleft0, xleft1));
-    int maxright = MAX2(xright0, xright1);
-    int x;
- 
-    for (x = minleft; x < maxright; x += step) {
-       unsigned skip_left0 = CLAMP(xleft0 - x, 0, step);
-       unsigned skip_left1 = CLAMP(xleft1 - x, 0, step);
-       unsigned skip_right0 = CLAMP(x + step - xright0, 0, step);
-       unsigned skip_right1 = CLAMP(x + step - xright1, 0, step);
-       unsigned lx = x;
-       const unsigned nr_quads = TILE_VECTOR_HEIGHT*TILE_VECTOR_WIDTH/QUAD_SIZE;
-       unsigned q = 0;
- 
-       unsigned skipmask_left0 = (1U << skip_left0) - 1U;
-       unsigned skipmask_left1 = (1U << skip_left1) - 1U;
- 
-       /* These calculations fail when step == 32 and skip_right == 0.
-        */
-       unsigned skipmask_right0 = ~0U << (unsigned)(step - skip_right0);
-       unsigned skipmask_right1 = ~0U << (unsigned)(step - skip_right1);
- 
-       unsigned mask0 = ~skipmask_left0 & ~skipmask_right0;
-       unsigned mask1 = ~skipmask_left1 & ~skipmask_right1;
- 
-       if (mask0 | mask1) {
-          for(q = 0; q < nr_quads; ++q) {
-             unsigned quadmask = (mask0 & 3) | ((mask1 & 3) << 2);
-             setup->quad[q].input.x0 = lx;
-             setup->quad[q].input.y0 = setup->span.y;
-             setup->quad[q].inout.mask = quadmask;
-             setup->quad_ptrs[q] = &setup->quad[q];
-             mask0 >>= 2;
-             mask1 >>= 2;
-             lx += 2;
-          }
-          assert(!(mask0 | mask1));
+    unsigned old_state = setup->state;
  
-          shade_quads(setup->llvmpipe, setup->quad_ptrs, nr_quads );
+    if (old_state == new_state)
+       return;
+    /* Run the side effect of the transition, then record the new state
+     * at the bottom.  Same-state requests returned above.
+     */
+    LP_DBG(DEBUG_SETUP, "%s old %d new %d\n", __FUNCTION__, old_state, new_state);
+    /* Note: the switch has no default, so any other new_state value is
+     * just stored in setup->state.
+     */
+    switch (new_state) {
+    case SETUP_ACTIVE:
+       begin_binning( setup );
+       break;
+ 
+    case SETUP_CLEARED:
+       if (old_state == SETUP_ACTIVE) {
+          assert(0);   /* ACTIVE -> CLEARED is not a supported transition */
+          return;      /* state is left unchanged (SETUP_ACTIVE) */
            }
+       break;
+       
+    case SETUP_FLUSHED:
+       if (old_state == SETUP_CLEARED)
+          execute_clears( setup );   /* only clears are pending */
+       else
+          lp_setup_rasterize_scene( setup, TRUE );
+       break;
         }
  
- 
-    setup->span.y = 0;
-    setup->span.right[0] = 0;
-    setup->span.right[1] = 0;
-    setup->span.left[0] = 1000000;     /* greater than right[0] */
-    setup->span.left[1] = 1000000;     /* greater than right[1] */
+    setup->state = new_state;
   }
   
   
- #if DEBUG_VERTS
- static void print_vertex(const struct setup_context *setup,
-                          const float (*v)[4])
+ void
+ lp_setup_flush( struct setup_context *setup,
+                 unsigned flags )
   {
-    int i;
-    debug_printf("   Vertex: (%p)\n", v);
-    for (i = 0; i < setup->quad[0].nr_attrs; i++) {
-       debug_printf("     %d: %f %f %f %f\n",  i,
-               v[i][0], v[i][1], v[i][2], v[i][3]);
-       if (util_is_inf_or_nan(v[i][0])) {
-          debug_printf("   NaN!\n");
-       }
-    }
+    LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__);
+    /* Moving to SETUP_FLUSHED makes set_scene_state() either execute the
+     * pending clears or rasterize the binned scene.  \p flags is not
+     * used by this function.
+     */
+    set_scene_state( setup, SETUP_FLUSHED );
   }
- #endif
   
- /**
-  * Sort the vertices from top to bottom order, setting up the triangle
-  * edge fields (ebot, emaj, etop).
-  * \return FALSE if coords are inf/nan (cull the tri), TRUE otherwise
-  */
- static boolean setup_sort_vertices( struct setup_context *setup,
-                                     float det,
-                                     const float (*v0)[4],
-                                     const float (*v1)[4],
-                                     const float (*v2)[4] )
- {
-    setup->vprovoke = v2;
- 
-    /* determine bottom to top order of vertices */
-    {
-       float y0 = v0[0][1];
-       float y1 = v1[0][1];
-       float y2 = v2[0][1];
-       if (y0 <= y1) {
-        if (y1 <= y2) {
-           /* y0<=y1<=y2 */
-           setup->vmin = v0;
-           setup->vmid = v1;
-           setup->vmax = v2;
-        }
-        else if (y2 <= y0) {
-           /* y2<=y0<=y1 */
-           setup->vmin = v2;
-           setup->vmid = v0;
-           setup->vmax = v1;
-        }
-        else {
-           /* y0<=y2<=y1 */
-           setup->vmin = v0;
-           setup->vmid = v2;
-           setup->vmax = v1;
-        }
-       }
-       else {
-        if (y0 <= y2) {
-           /* y1<=y0<=y2 */
-           setup->vmin = v1;
-           setup->vmid = v0;
-           setup->vmax = v2;
-        }
-        else if (y2 <= y1) {
-           /* y2<=y1<=y0 */
-           setup->vmin = v2;
-           setup->vmid = v1;
-           setup->vmax = v0;
-        }
-        else {
-           /* y1<=y2<=y0 */
-           setup->vmin = v1;
-           setup->vmid = v2;
-           setup->vmax = v0;
-        }
-       }
-    }
   
-    setup->ebot.dx = setup->vmid[0][0] - setup->vmin[0][0];
-    setup->ebot.dy = setup->vmid[0][1] - setup->vmin[0][1];
-    setup->emaj.dx = setup->vmax[0][0] - setup->vmin[0][0];
-    setup->emaj.dy = setup->vmax[0][1] - setup->vmin[0][1];
-    setup->etop.dx = setup->vmax[0][0] - setup->vmid[0][0];
-    setup->etop.dy = setup->vmax[0][1] - setup->vmid[0][1];
- 
-    /*
-     * Compute triangle's area.  Use 1/area to compute partial
-     * derivatives of attributes later.
-     *
-     * The area will be the same as prim->det, but the sign may be
-     * different depending on how the vertices get sorted above.
-     *
-     * To determine whether the primitive is front or back facing we
-     * use the prim->det value because its sign is correct.
-     */
-    {
-       const float area = (setup->emaj.dx * setup->ebot.dy -
-                           setup->ebot.dx * setup->emaj.dy);
- 
-       setup->oneoverarea = 1.0f / area;
- 
-       /*
-       debug_printf("%s one-over-area %f  area %f  det %f\n",
-                    __FUNCTION__, setup->oneoverarea, area, det );
-       */
-       if (util_is_inf_or_nan(setup->oneoverarea))
-          return FALSE;
-    }
+ void
+ lp_setup_bind_framebuffer( struct setup_context *setup,
+                            const struct pipe_framebuffer_state *fb )
+ { /* Bind a new framebuffer: flush whatever targets the old one, copy
+    * \p fb into setup->fb, and give the (possibly new) current scene the
+    * new framebuffer width/height.
+    *
+    * NOTE(review): the value assigned to `scene` on the next line is
+    * overwritten before it is read; the call may still matter if
+    * lp_setup_get_current_scene() has side effects -- confirm before
+    * removing it.
+    */
+    struct lp_scene *scene = lp_setup_get_current_scene(setup);
  
-    /* We need to know if this is a front or back-facing triangle for:
-     *  - the GLSL gl_FrontFacing fragment attribute (bool)
-     *  - two-sided stencil test
-     */
-    setup->facing = 
-       ((det > 0.0) ^ 
-        (setup->llvmpipe->rasterizer->front_winding == PIPE_WINDING_CW));
+    LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__);
  
-    /* Prepare pixel offset for rasterisation:
-     *  - pixel center (0.5, 0.5) for GL, or
-     *  - assume (0.0, 0.0) for other APIs.
-     */
-    if (setup->llvmpipe->rasterizer->gl_rasterization_rules) {
-       setup->pixel_offset = 0.5f;
-    } else {
-       setup->pixel_offset = 0.0f;
-    }
+    set_scene_state( setup, SETUP_FLUSHED );
  
-    return TRUE;
- }
+    /* re-get scene pointer, may have a new scene after flushing */
+    scene = lp_setup_get_current_scene(setup);
  
+    util_copy_framebuffer_state(&setup->fb, fb);
  
- /**
-  * Compute a0, dadx and dady for a linearly interpolated coefficient,
-  * for a triangle.
-  */
- static void tri_pos_coeff( struct setup_context *setup,
-                            uint vertSlot, unsigned i)
- {
-    float botda = setup->vmid[vertSlot][i] - setup->vmin[vertSlot][i];
-    float majda = setup->vmax[vertSlot][i] - setup->vmin[vertSlot][i];
-    float a = setup->ebot.dy * majda - botda * setup->emaj.dy;
-    float b = setup->emaj.dx * botda - majda * setup->ebot.dx;
-    float dadx = a * setup->oneoverarea;
-    float dady = b * setup->oneoverarea;
- 
-    assert(i <= 3);
- 
-    setup->coef.dadx[0][i] = dadx;
-    setup->coef.dady[0][i] = dady;
- 
-    /* calculate a0 as the value which would be sampled for the
-     * fragment at (0,0), taking into account that we want to sample at
-     * pixel centers, in other words (pixel_offset, pixel_offset).
-     *
-     * this is neat but unfortunately not a good way to do things for
-     * triangles with very large values of dadx or dady as it will
-     * result in the subtraction and re-addition from a0 of a very
-     * large number, which means we'll end up loosing a lot of the
-     * fractional bits and precision from a0.  the way to fix this is
-     * to define a0 as the sample at a pixel center somewhere near vmin
-     * instead - i'll switch to this later.
-     */
-    setup->coef.a0[0][i] = (setup->vmin[vertSlot][i] -
-                            (dadx * (setup->vmin[0][0] - setup->pixel_offset) +
-                             dady * (setup->vmin[0][1] - setup->pixel_offset)));
- 
-    /*
-    debug_printf("attr[%d].%c: %f dx:%f dy:%f\n",
-                 slot, "xyzw"[i],
-                 setup->coef[slot].a0[i],
-                 setup->coef[slot].dadx[i],
-                 setup->coef[slot].dady[i]);
-    */
+    lp_scene_set_framebuffer_size(scene, setup->fb.width, setup->fb.height);
   }
   
   
- /**
-  * Compute a0 for a constant-valued coefficient (GL_FLAT shading).
-  * The value value comes from vertex[slot][i].
-  * The result will be put into setup->coef[slot].a0[i].
-  * \param slot  which attribute slot
-  * \param i  which component of the slot (0..3)
-  */
- static void const_pos_coeff( struct setup_context *setup,
-                              uint vertSlot, unsigned i)
+ void
+ lp_setup_clear( struct setup_context *setup,
+                 const float *color,
+                 double depth,
+                 unsigned stencil,
+                 unsigned flags )
   {
-    setup->coef.dadx[0][i] = 0;
-    setup->coef.dady[0][i] = 0;
- 
-    /* need provoking vertex info!
-     */
-    setup->coef.a0[0][i] = setup->vprovoke[vertSlot][i];
- }
+    struct lp_scene *scene = lp_setup_get_current_scene(setup);
+    unsigned i;
   
+    LP_DBG(DEBUG_SETUP, "%s state %d\n", __FUNCTION__, setup->state);
   
- /**
-  * Compute a0 for a constant-valued coefficient (GL_FLAT shading).
-  * The value value comes from vertex[slot][i].
-  * The result will be put into setup->coef[slot].a0[i].
-  * \param slot  which attribute slot
-  * \param i  which component of the slot (0..3)
-  */
- static void const_coeff( struct setup_context *setup,
-                          unsigned attrib,
-                          uint vertSlot)
- {
-    unsigned i;
-    for (i = 0; i < NUM_CHANNELS; ++i) {
-       setup->coef.dadx[1 + attrib][i] = 0;
-       setup->coef.dady[1 + attrib][i] = 0;
   
-       /* need provoking vertex info!
-        */
-       setup->coef.a0[1 + attrib][i] = setup->vprovoke[vertSlot][i];
+    if (flags & PIPE_CLEAR_COLOR) {
+       for (i = 0; i < 4; ++i)
+          setup->clear.color.clear_color[i] = float_to_ubyte(color[i]);
      }
- }
   
- 
- /**
-  * Compute a0, dadx and dady for a linearly interpolated coefficient,
-  * for a triangle.
-  */
- static void tri_linear_coeff( struct setup_context *setup,
-                               unsigned attrib,
-                               uint vertSlot)
- {
-    unsigned i;
-    for (i = 0; i < NUM_CHANNELS; ++i) {
-       float botda = setup->vmid[vertSlot][i] - setup->vmin[vertSlot][i];
-       float majda = setup->vmax[vertSlot][i] - setup->vmin[vertSlot][i];
-       float a = setup->ebot.dy * majda - botda * setup->emaj.dy;
-       float b = setup->emaj.dx * botda - majda * setup->ebot.dx;
-       float dadx = a * setup->oneoverarea;
-       float dady = b * setup->oneoverarea;
- 
-       assert(i <= 3);
- 
-       setup->coef.dadx[1 + attrib][i] = dadx;
-       setup->coef.dady[1 + attrib][i] = dady;
- 
-       /* calculate a0 as the value which would be sampled for the
-        * fragment at (0,0), taking into account that we want to sample at
-        * pixel centers, in other words (0.5, 0.5).
-        *
-        * this is neat but unfortunately not a good way to do things for
-        * triangles with very large values of dadx or dady as it will
-        * result in the subtraction and re-addition from a0 of a very
-        * large number, which means we'll end up loosing a lot of the
-        * fractional bits and precision from a0.  the way to fix this is
-        * to define a0 as the sample at a pixel center somewhere near vmin
-        * instead - i'll switch to this later.
-        */
-       setup->coef.a0[1 + attrib][i] = (setup->vmin[vertSlot][i] -
-                      (dadx * (setup->vmin[0][0] - setup->pixel_offset) +
-                       dady * (setup->vmin[0][1] - setup->pixel_offset)));
- 
-       /*
-       debug_printf("attr[%d].%c: %f dx:%f dy:%f\n",
-                    slot, "xyzw"[i],
-                    setup->coef[slot].a0[i],
-                    setup->coef[slot].dadx[i],
-                    setup->coef[slot].dady[i]);
-       */
+    if (flags & PIPE_CLEAR_DEPTHSTENCIL) {
+       setup->clear.zstencil.clear_zstencil = 
+          util_pack_z_stencil(setup->fb.zsbuf->format, 
+                              depth,
+                              stencil);
      }
- }
   
+    if (setup->state == SETUP_ACTIVE) {
+       /* Add the clear to existing scene.  In the unusual case where
+        * both color and depth-stencil are being cleared when there's
+        * already been some rendering, we could discard the currently
+        * binned scene and start again, but I don't see that as being
+        * a common usage.
+        */
+       if (flags & PIPE_CLEAR_COLOR)
+          lp_scene_bin_everywhere( scene, 
+                                   lp_rast_clear_color,
+                                   setup->clear.color );
   
- /**
-  * Compute a0, dadx and dady for a perspective-corrected interpolant,
-  * for a triangle.
-  * We basically multiply the vertex value by 1/w before computing
-  * the plane coefficients (a0, dadx, dady).
-  * Later, when we compute the value at a particular fragment position we'll
-  * divide the interpolated value by the interpolated W at that fragment.
-  */
- static void tri_persp_coeff( struct setup_context *setup,
-                              unsigned attrib,
-                              uint vertSlot)
- {
-    unsigned i;
-    for (i = 0; i < NUM_CHANNELS; ++i) {
-       /* premultiply by 1/w  (v[0][3] is always W):
+       if (setup->clear.flags & PIPE_CLEAR_DEPTHSTENCIL)
+          lp_scene_bin_everywhere( scene, 
+                                   lp_rast_clear_zstencil,
+                                   setup->clear.zstencil );
+    }
+    else {
+       /* Put ourselves into the 'pre-clear' state, specifically to try
+        * and accumulate multiple clears to color and depth_stencil
+        * buffers which the app or state-tracker might issue
+        * separately.
          */
-       float mina = setup->vmin[vertSlot][i] * setup->vmin[0][3];
-       float mida = setup->vmid[vertSlot][i] * setup->vmid[0][3];
-       float maxa = setup->vmax[vertSlot][i] * setup->vmax[0][3];
-       float botda = mida - mina;
-       float majda = maxa - mina;
-       float a = setup->ebot.dy * majda - botda * setup->emaj.dy;
-       float b = setup->emaj.dx * botda - majda * setup->ebot.dx;
-       float dadx = a * setup->oneoverarea;
-       float dady = b * setup->oneoverarea;
- 
-       /*
-       debug_printf("tri persp %d,%d: %f %f %f\n", vertSlot, i,
-                    setup->vmin[vertSlot][i],
-                    setup->vmid[vertSlot][i],
-                    setup->vmax[vertSlot][i]
-              );
-       */
-       assert(i <= 3);
- 
-       setup->coef.dadx[1 + attrib][i] = dadx;
-       setup->coef.dady[1 + attrib][i] = dady;
-       setup->coef.a0[1 + attrib][i] = (mina -
-                      (dadx * (setup->vmin[0][0] - setup->pixel_offset) +
-                       dady * (setup->vmin[0][1] - setup->pixel_offset)));
+       set_scene_state( setup, SETUP_CLEARED );
+ 
+       setup->clear.flags |= flags;
      }
   }
   
   
   /**
-  * Special coefficient setup for gl_FragCoord.
-  * X and Y are trivial, though Y has to be inverted for OpenGL.
-  * Z and W are copied from posCoef which should have already been computed.
-  * We could do a bit less work if we'd examine gl_FragCoord's swizzle mask.
+  * Emit a fence.
+  *
+  * Creates an lp_fence whose rank is the current scene's bin count,
+  * makes the scene SETUP_ACTIVE, bins an lp_rast_fence command
+  * everywhere and returns the fence cast to a pipe_fence_handle.
+  *
+  * NOTE(review): the lp_fence_create() result is used without a NULL
+  * check, and the rank line carries an "xxx" marker from the author --
+  * confirm both are intended.
    */
- static void
- setup_fragcoord_coeff(struct setup_context *setup, uint slot)
+ struct pipe_fence_handle *
+ lp_setup_fence( struct setup_context *setup )
   {
-    /*X*/
-    setup->coef.a0[1 + slot][0] = 0;
-    setup->coef.dadx[1 + slot][0] = 1.0;
-    setup->coef.dady[1 + slot][0] = 0.0;
-    /*Y*/
-    setup->coef.a0[1 + slot][1] = 0.0;
-    setup->coef.dadx[1 + slot][1] = 0.0;
-    setup->coef.dady[1 + slot][1] = 1.0;
-    /*Z*/
-    setup->coef.a0[1 + slot][2] = setup->coef.a0[0][2];
-    setup->coef.dadx[1 + slot][2] = setup->coef.dadx[0][2];
-    setup->coef.dady[1 + slot][2] = setup->coef.dady[0][2];
-    /*W*/
-    setup->coef.a0[1 + slot][3] = setup->coef.a0[0][3];
-    setup->coef.dadx[1 + slot][3] = setup->coef.dadx[0][3];
-    setup->coef.dady[1 + slot][3] = setup->coef.dady[0][3];
- }
+    struct lp_scene *scene = lp_setup_get_current_scene(setup);
+    const unsigned rank = lp_scene_get_num_bins( scene ); /* xxx */
+    struct lp_fence *fence = lp_fence_create(rank);
  
+    LP_DBG(DEBUG_SETUP, "%s rank %u\n", __FUNCTION__, rank);
  
+    set_scene_state( setup, SETUP_ACTIVE );
  
- /**
-  * Compute the setup->coef[] array dadx, dady, a0 values.
-  * Must be called after setup->vmin,vmid,vmax,vprovoke are initialized.
-  */
- static void setup_tri_coefficients( struct setup_context *setup )
- {
-    struct llvmpipe_context *llvmpipe = setup->llvmpipe;
-    const struct lp_fragment_shader *lpfs = llvmpipe->fs;
-    const struct vertex_info *vinfo = llvmpipe_get_vertex_info(llvmpipe);
-    uint fragSlot;
+    /* insert the fence into all command bins */
+    lp_scene_bin_everywhere( scene,
+                           lp_rast_fence,
+                           lp_rast_arg_fence(fence) );
  
-    /* z and w are done by linear interpolation:
-     */
-    tri_pos_coeff(setup, 0, 2);
-    tri_pos_coeff(setup, 0, 3);
+    return (struct pipe_fence_handle *) fence;
+ }
   
-    /* setup interpolation for all the remaining attributes:
-     */
-    for (fragSlot = 0; fragSlot < lpfs->info.num_inputs; fragSlot++) {
-       const uint vertSlot = vinfo->attrib[fragSlot].src_index;
   
-       switch (vinfo->attrib[fragSlot].interp_mode) {
-       case INTERP_CONSTANT:
-          const_coeff(setup, fragSlot, vertSlot);
-          break;
-       case INTERP_LINEAR:
-          tri_linear_coeff(setup, fragSlot, vertSlot);
-          break;
-       case INTERP_PERSPECTIVE:
-          tri_persp_coeff(setup, fragSlot, vertSlot);
-          break;
-       case INTERP_POS:
-          setup_fragcoord_coeff(setup, fragSlot);
-          break;
-       default:
-          assert(0);
-       }
+ void 
+ lp_setup_set_triangle_state( struct setup_context *setup,
+                              unsigned cull_mode,
+                              boolean ccw_is_frontface,
+                              boolean scissor )
+ { /* Store the triangle-related rasterizer settings and reinstall
+    * first_triangle as the triangle hook.
+    */
+    LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__);
  
-       if (lpfs->info.input_semantic_name[fragSlot] == TGSI_SEMANTIC_FACE) {
-          setup->coef.a0[1 + fragSlot][0] = 1.0f - setup->facing;
-          setup->coef.dadx[1 + fragSlot][0] = 0.0;
-          setup->coef.dady[1 + fragSlot][0] = 0.0;
-       }
-    }
+    setup->ccw_is_frontface = ccw_is_frontface;
+    setup->cullmode = cull_mode;
+    setup->triangle = first_triangle;   /* presumably so the triangle function is re-chosen -- confirm */
+    setup->scissor_test = scissor;
   }
   
   
   
- static void setup_tri_edges( struct setup_context *setup )
+ void
+ lp_setup_set_fs_inputs( struct setup_context *setup,
+                         const struct lp_shader_input *input,
+                         unsigned nr )
   {
-    float vmin_x = setup->vmin[0][0] + setup->pixel_offset;
-    float vmid_x = setup->vmid[0][0] + setup->pixel_offset;
- 
-    float vmin_y = setup->vmin[0][1] - setup->pixel_offset;
-    float vmid_y = setup->vmid[0][1] - setup->pixel_offset;
-    float vmax_y = setup->vmax[0][1] - setup->pixel_offset;
- 
-    setup->emaj.sy = ceilf(vmin_y);
-    setup->emaj.lines = (int) ceilf(vmax_y - setup->emaj.sy);
-    setup->emaj.dxdy = setup->emaj.dx / setup->emaj.dy;
-    setup->emaj.sx = vmin_x + (setup->emaj.sy - vmin_y) * setup->emaj.dxdy;
- 
-    setup->etop.sy = ceilf(vmid_y);
-    setup->etop.lines = (int) ceilf(vmax_y - setup->etop.sy);
-    setup->etop.dxdy = setup->etop.dx / setup->etop.dy;
-    setup->etop.sx = vmid_x + (setup->etop.sy - vmid_y) * setup->etop.dxdy;
- 
-    setup->ebot.sy = ceilf(vmin_y);
-    setup->ebot.lines = (int) ceilf(vmid_y - setup->ebot.sy);
-    setup->ebot.dxdy = setup->ebot.dx / setup->ebot.dy;
-    setup->ebot.sx = vmin_x + (setup->ebot.sy - vmin_y) * setup->ebot.dxdy;
- }
+    LP_DBG(DEBUG_SETUP, "%s %p %u\n", __FUNCTION__, (void *) input, nr);
  
+    memcpy( setup->fs.input, input, nr * sizeof input[0] );   /* NOTE(review): nr is not checked against the size of setup->fs.input -- confirm callers bound it */
+    setup->fs.nr_inputs = nr;
+ }
   
- /**
-  * Render the upper or lower half of a triangle.
-  * Scissoring/cliprect is applied here too.
-  */
- static void subtriangle( struct setup_context *setup,
-                        struct edge *eleft,
-                        struct edge *eright,
-                        unsigned lines )
+ void
+ lp_setup_set_fs_functions( struct setup_context *setup,
+                            lp_jit_frag_func jit_function0,
+                            lp_jit_frag_func jit_function1,
+                            boolean opaque )
   {
-    const struct pipe_scissor_state *cliprect = &setup->llvmpipe->cliprect;
-    const int minx = (int) cliprect->minx;
-    const int maxx = (int) cliprect->maxx;
-    const int miny = (int) cliprect->miny;
-    const int maxy = (int) cliprect->maxy;
-    int y, start_y, finish_y;
-    int sy = (int)eleft->sy;
- 
-    assert((int)eleft->sy == (int) eright->sy);
- 
-    /* clip top/bottom */
-    start_y = sy;
-    if (start_y < miny)
-       start_y = miny;
- 
-    finish_y = sy + lines;
-    if (finish_y > maxy)
-       finish_y = maxy;
- 
-    start_y -= sy;
-    finish_y -= sy;
- 
-    /*
-    debug_printf("%s %d %d\n", __FUNCTION__, start_y, finish_y);
-    */
- 
-    for (y = start_y; y < finish_y; y++) {
- 
-       /* avoid accumulating adds as floats don't have the precision to
-        * accurately iterate large triangle edges that way.  luckily we
-        * can just multiply these days.
-        *
-        * this is all drowned out by the attribute interpolation anyway.
-        */
-       int left = (int)(eleft->sx + y * eleft->dxdy);
-       int right = (int)(eright->sx + y * eright->dxdy);
- 
-       /* clip left/right */
-       if (left < minx)
-          left = minx;
-       if (right > maxx)
-          right = maxx;
- 
-       if (left < right) {
-          int _y = sy + y;
-          if (block(_y) != setup->span.y) {
-             flush_spans(setup);
-             setup->span.y = block(_y);
-          }
+    LP_DBG(DEBUG_SETUP, "%s %p\n", __FUNCTION__, (void *) jit_function0);
+    /* FIXME: reference count
+     *
+     * The two function pointers and the opaque flag are stored as-is in
+     * setup->fs.current; the fragment-shader state is then flagged dirty.
+     */
  
-          setup->span.left[_y&1] = left;
-          setup->span.right[_y&1] = right;
-       }
-    }
- 
- 
-    /* save the values so that emaj can be restarted:
-     */
-    eleft->sx += lines * eleft->dxdy;
-    eright->sx += lines * eright->dxdy;
-    eleft->sy += lines;
-    eright->sy += lines;
+    setup->fs.current.jit_function[0] = jit_function0;
+    setup->fs.current.jit_function[1] = jit_function1;
+    setup->fs.current.opaque = opaque;
+    setup->dirty |= LP_SETUP_NEW_FS;
   }
   
- 
- /**
-  * Recalculate prim's determinant.  This is needed as we don't have
-  * get this information through the vbuf_render interface & we must
-  * calculate it here.
-  */
- static float
- calc_det( const float (*v0)[4],
-           const float (*v1)[4],
-           const float (*v2)[4] )
+ void
+ lp_setup_set_fs_constants(struct setup_context *setup,
+                           struct pipe_buffer *buffer)
   {
-    /* edge vectors e = v0 - v2, f = v1 - v2 */
-    const float ex = v0[0][0] - v2[0][0];
-    const float ey = v0[0][1] - v2[0][1];
-    const float fx = v1[0][0] - v2[0][0];
-    const float fy = v1[0][1] - v2[0][1];
- 
-    /* det = cross(e,f).z */
-    return ex * fy - ey * fx;
+    LP_DBG(DEBUG_SETUP, "%s %p\n", __FUNCTION__, (void *) buffer);
+    /* Make setup->constants.current refer to \p buffer (via
+     * pipe_buffer_reference) and flag the constants as dirty.
+     */
+    pipe_buffer_reference(&setup->constants.current, buffer);
+ 
+    setup->dirty |= LP_SETUP_NEW_CONSTANTS;
   }
   
   
- /**
-  * Do setup for triangle rasterization, then render the triangle.
-  */
- void llvmpipe_setup_tri( struct setup_context *setup,
-                 const float (*v0)[4],
-                 const float (*v1)[4],
-                 const float (*v2)[4] )
+ void
+ lp_setup_set_alpha_ref_value( struct setup_context *setup,
+                               float alpha_ref_value )
   {
-    float det;
- 
- #if DEBUG_VERTS
-    debug_printf("Setup triangle:\n");
-    print_vertex(setup, v0);
-    print_vertex(setup, v1);
-    print_vertex(setup, v2);
- #endif
+    LP_DBG(DEBUG_SETUP, "%s %f\n", __FUNCTION__, alpha_ref_value);
  
-    if (setup->llvmpipe->no_rast)
-       return;
-    
-    det = calc_det(v0, v1, v2);
-    /*
-    debug_printf("%s\n", __FUNCTION__ );
-    */
+    if(setup->fs.current.jit_context.alpha_ref_value != alpha_ref_value) {   /* only dirty the FS state on an actual change */
+       setup->fs.current.jit_context.alpha_ref_value = alpha_ref_value;
+       setup->dirty |= LP_SETUP_NEW_FS;
+    }
+ }
   
- #if DEBUG_FRAGS
-    setup->numFragsEmitted = 0;
-    setup->numFragsWritten = 0;
- #endif
+ void
+ lp_setup_set_blend_color( struct setup_context *setup,
+                           const struct pipe_blend_color *blend_color )
+ {
+    LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__);
   
-    if (cull_tri( setup, det ))
-       return;
+    assert(blend_color);
   
-    if (!setup_sort_vertices( setup, det, v0, v1, v2 ))
-       return;
-    setup_tri_coefficients( setup );
-    setup_tri_edges( setup );
+    if(memcmp(&setup->blend_color.current, blend_color, sizeof *blend_color) != 0) {
+       memcpy(&setup->blend_color.current, blend_color, sizeof *blend_color);
+       setup->dirty |= LP_SETUP_NEW_BLEND_COLOR;
+    }
+ }
   
-    assert(setup->llvmpipe->reduced_prim == PIPE_PRIM_TRIANGLES);
   
-    setup->span.y = 0;
-    setup->span.right[0] = 0;
-    setup->span.right[1] = 0;
-    /*   setup->span.z_mode = tri_z_mode( setup->ctx ); */
+ void
+ lp_setup_set_scissor( struct setup_context *setup,
+                       const struct pipe_scissor_state *scissor )
+ {
+    LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__);
   
-    /*   init_constant_attribs( setup ); */
+    assert(scissor);
   
-    if (setup->oneoverarea < 0.0) {
-       /* emaj on left:
-        */
-       subtriangle( setup, &setup->emaj, &setup->ebot, setup->ebot.lines );
-       subtriangle( setup, &setup->emaj, &setup->etop, setup->etop.lines );
-    }
-    else {
-       /* emaj on right:
-        */
-       subtriangle( setup, &setup->ebot, &setup->emaj, setup->ebot.lines );
-       subtriangle( setup, &setup->etop, &setup->emaj, setup->etop.lines );
+    if (memcmp(&setup->scissor.current, scissor, sizeof(*scissor)) != 0) {
+       setup->scissor.current = *scissor; /* struct copy */
+       setup->dirty |= LP_SETUP_NEW_SCISSOR;
      }
- 
-    flush_spans( setup );
- 
- #if DEBUG_FRAGS
-    printf("Tri: %u frags emitted, %u written\n",
-           setup->numFragsEmitted,
-           setup->numFragsWritten);
- #endif
   }
   
   
- 
- /**
-  * Compute a0, dadx and dady for a linearly interpolated coefficient,
-  * for a line.
-  */
- static void
- linear_pos_coeff(struct setup_context *setup,
-                  uint vertSlot, uint i)
+ void 
+ lp_setup_set_flatshade_first( struct setup_context *setup,
+                               boolean flatshade_first )
   {
-    const float da = setup->vmax[vertSlot][i] - setup->vmin[vertSlot][i];
-    const float dadx = da * setup->emaj.dx * setup->oneoverarea;
-    const float dady = da * setup->emaj.dy * setup->oneoverarea;
-    setup->coef.dadx[0][i] = dadx;
-    setup->coef.dady[0][i] = dady;
-    setup->coef.a0[0][i] = (setup->vmin[vertSlot][i] -
-                            (dadx * (setup->vmin[0][0] - setup->pixel_offset) +
-                             dady * (setup->vmin[0][1] - setup->pixel_offset)));
+    setup->flatshade_first = flatshade_first;
   }
   
   
- /**
-  * Compute a0, dadx and dady for a linearly interpolated coefficient,
-  * for a line.
-  */
- static void
- line_linear_coeff(struct setup_context *setup,
-                   unsigned attrib,
-                   uint vertSlot)
+ void 
+ lp_setup_set_vertex_info( struct setup_context *setup,
+                           struct vertex_info *vertex_info )
   {
-    unsigned i;
-    for (i = 0; i < NUM_CHANNELS; ++i) {
-       const float da = setup->vmax[vertSlot][i] - setup->vmin[vertSlot][i];
-       const float dadx = da * setup->emaj.dx * setup->oneoverarea;
-       const float dady = da * setup->emaj.dy * setup->oneoverarea;
-       setup->coef.dadx[1 + attrib][i] = dadx;
-       setup->coef.dady[1 + attrib][i] = dady;
-       setup->coef.a0[1 + attrib][i] = (setup->vmin[vertSlot][i] -
-                      (dadx * (setup->vmin[0][0] - setup->pixel_offset) +
-                       dady * (setup->vmin[0][1] - setup->pixel_offset)));
-    }
+    /* XXX: just silently holding onto the pointer:
+     */
+    setup->vertex_info = vertex_info;
   }
   
   
   /**
-  * Compute a0, dadx and dady for a perspective-corrected interpolant,
-  * for a line.
+  * Called during state validation when LP_NEW_TEXTURE is set.
    */
- static void
- line_persp_coeff(struct setup_context *setup,
-                  unsigned attrib,
-                  uint vertSlot)
+ void
+ lp_setup_set_sampler_textures( struct setup_context *setup,
+                                unsigned num, struct pipe_texture **texture)
   {
      unsigned i;
-    for (i = 0; i < NUM_CHANNELS; ++i) {
-       /* XXX double-check/verify this arithmetic */
-       const float a0 = setup->vmin[vertSlot][i] * setup->vmin[0][3];
-       const float a1 = setup->vmax[vertSlot][i] * setup->vmax[0][3];
-       const float da = a1 - a0;
-       const float dadx = da * setup->emaj.dx * setup->oneoverarea;
-       const float dady = da * setup->emaj.dy * setup->oneoverarea;
-       setup->coef.dadx[1 + attrib][i] = dadx;
-       setup->coef.dady[1 + attrib][i] = dady;
-       setup->coef.a0[1 + attrib][i] = (setup->vmin[vertSlot][i] -
-                      (dadx * (setup->vmin[0][0] - setup->pixel_offset) +
-                       dady * (setup->vmin[0][1] - setup->pixel_offset)));
+ 
+    LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__);
+ 
+    assert(num <= PIPE_MAX_SAMPLERS);
+ 
+    for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
+       struct pipe_texture *tex = i < num ? texture[i] : NULL;
+ 
+       if(tex) {
+          struct llvmpipe_texture *lp_tex = llvmpipe_texture(tex);
+          struct lp_jit_texture *jit_tex;
+          jit_tex = &setup->fs.current.jit_context.textures[i];
+          jit_tex->width = tex->width0;
+          jit_tex->height = tex->height0;
+          jit_tex->stride = lp_tex->stride[0];
+          if(!lp_tex->dt)
+             jit_tex->data = lp_tex->data;
+          else
+             /* FIXME: map the rendertarget */
+             assert(0);
+ 
+          /* the scene references this texture */
+          {
+             struct lp_scene *scene = lp_setup_get_current_scene(setup);
+             lp_scene_texture_reference(scene, tex);
+          }
+       }
      }
+ 
+    setup->dirty |= LP_SETUP_NEW_FS;
   }
   
   
   /**
-  * Compute the setup->coef[] array dadx, dady, a0 values.
-  * Must be called after setup->vmin,vmax are initialized.
+  * Is the given texture referenced by any scene?
+  * Note: we have to check all scenes including any scenes currently
+  * being rendered and the current scene being built.
    */
- static INLINE boolean
- setup_line_coefficients(struct setup_context *setup,
-                         const float (*v0)[4],
-                         const float (*v1)[4])
+ unsigned
+ lp_setup_is_texture_referenced( const struct setup_context *setup,
+                                 const struct pipe_texture *texture )
   {
-    struct llvmpipe_context *llvmpipe = setup->llvmpipe;
-    const struct lp_fragment_shader *lpfs = llvmpipe->fs;
-    const struct vertex_info *vinfo = llvmpipe_get_vertex_info(llvmpipe);
-    uint fragSlot;
-    float area;
- 
-    /* use setup->vmin, vmax to point to vertices */
-    if (llvmpipe->rasterizer->flatshade_first)
-       setup->vprovoke = v0;
-    else
-       setup->vprovoke = v1;
-    setup->vmin = v0;
-    setup->vmax = v1;
- 
-    setup->emaj.dx = setup->vmax[0][0] - setup->vmin[0][0];
-    setup->emaj.dy = setup->vmax[0][1] - setup->vmin[0][1];
- 
-    /* NOTE: this is not really area but something proportional to it */
-    area = setup->emaj.dx * setup->emaj.dx + setup->emaj.dy * setup->emaj.dy;
-    if (area == 0.0f || util_is_inf_or_nan(area))
-       return FALSE;
-    setup->oneoverarea = 1.0f / area;
- 
-    /* z and w are done by linear interpolation:
-     */
-    linear_pos_coeff(setup, 0, 2);
-    linear_pos_coeff(setup, 0, 3);
- 
-    /* setup interpolation for all the remaining attributes:
-     */
-    for (fragSlot = 0; fragSlot < lpfs->info.num_inputs; fragSlot++) {
-       const uint vertSlot = vinfo->attrib[fragSlot].src_index;
- 
-       switch (vinfo->attrib[fragSlot].interp_mode) {
-       case INTERP_CONSTANT:
-          const_coeff(setup, fragSlot, vertSlot);
-          break;
-       case INTERP_LINEAR:
-          line_linear_coeff(setup, fragSlot, vertSlot);
-          break;
-       case INTERP_PERSPECTIVE:
-          line_persp_coeff(setup, fragSlot, vertSlot);
-          break;
-       case INTERP_POS:
-          setup_fragcoord_coeff(setup, fragSlot);
-          break;
-       default:
-          assert(0);
-       }
+    unsigned i;
   
-       if (lpfs->info.input_semantic_name[fragSlot] == TGSI_SEMANTIC_FACE) {
-          setup->coef.a0[1 + fragSlot][0] = 1.0f - setup->facing;
-          setup->coef.dadx[1 + fragSlot][0] = 0.0;
-          setup->coef.dady[1 + fragSlot][0] = 0.0;
-       }
+    /* check the render targets */
+    for (i = 0; i < setup->fb.nr_cbufs; i++) {
+       if (setup->fb.cbufs[i]->texture == texture)
+          return PIPE_REFERENCED_FOR_READ | PIPE_REFERENCED_FOR_WRITE;
+    }
+    if (setup->fb.zsbuf && setup->fb.zsbuf->texture == texture) {
+       return PIPE_REFERENCED_FOR_READ | PIPE_REFERENCED_FOR_WRITE;
      }
-    return TRUE;
- }
- 
   
- /**
-  * Plot a pixel in a line segment.
-  */
- static INLINE void
- plot(struct setup_context *setup, int x, int y)
- {
-    const int iy = y & 1;
-    const int ix = x & 1;
-    const int quadX = x - ix;
-    const int quadY = y - iy;
-    const int mask = (1 << ix) << (2 * iy);
- 
-    if (quadX != setup->quad[0].input.x0 ||
-        quadY != setup->quad[0].input.y0)
-    {
-       /* flush prev quad, start new quad */
- 
-       if (setup->quad[0].input.x0 != -1)
-          clip_emit_quad( setup, &setup->quad[0] );
- 
-       setup->quad[0].input.x0 = quadX;
-       setup->quad[0].input.y0 = quadY;
-       setup->quad[0].inout.mask = 0x0;
+    /* check textures referenced by the scene */
+    for (i = 0; i < Elements(setup->scenes); i++) {
+       if (lp_scene_is_textured_referenced(setup->scenes[i], texture)) {
+          return PIPE_REFERENCED_FOR_READ;
+       }
      }
   
-    setup->quad[0].inout.mask |= mask;
+    return PIPE_UNREFERENCED;
   }
   
   
   /**
-  * Do setup for line rasterization, then render the line.
-  * Single-pixel width, no stipple, etc.  We rely on the 'draw' module
-  * to handle stippling and wide lines.
+  * Called by vbuf code when we're about to draw something.
    */
   void
- llvmpipe_setup_line(struct setup_context *setup,
-            const float (*v0)[4],
-            const float (*v1)[4])
+ lp_setup_update_state( struct setup_context *setup )
   {
-    int x0 = (int) v0[0][0];
-    int x1 = (int) v1[0][0];
-    int y0 = (int) v0[0][1];
-    int y1 = (int) v1[0][1];
-    int dx = x1 - x0;
-    int dy = y1 - y0;
-    int xstep, ystep;
- 
- #if DEBUG_VERTS
-    debug_printf("Setup line:\n");
-    print_vertex(setup, v0);
-    print_vertex(setup, v1);
- #endif
- 
-    if (setup->llvmpipe->no_rast)
-       return;
- 
-    if (dx == 0 && dy == 0)
-       return;
+    struct lp_scene *scene = lp_setup_get_current_scene(setup);
   
-    if (!setup_line_coefficients(setup, v0, v1))
-       return;
- 
-    assert(v0[0][0] < 1.0e9);
-    assert(v0[0][1] < 1.0e9);
-    assert(v1[0][0] < 1.0e9);
-    assert(v1[0][1] < 1.0e9);
- 
-    if (dx < 0) {
-       dx = -dx;   /* make positive */
-       xstep = -1;
-    }
-    else {
-       xstep = 1;
-    }
+    LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__);
   
-    if (dy < 0) {
-       dy = -dy;   /* make positive */
-       ystep = -1;
-    }
-    else {
-       ystep = 1;
-    }
+    assert(setup->fs.current.jit_function);
   
-    assert(dx >= 0);
-    assert(dy >= 0);
-    assert(setup->llvmpipe->reduced_prim == PIPE_PRIM_LINES);
+    if(setup->dirty & LP_SETUP_NEW_BLEND_COLOR) {
+       uint8_t *stored;
+       unsigned i, j;
   
-    setup->quad[0].input.x0 = setup->quad[0].input.y0 = -1;
-    setup->quad[0].inout.mask = 0x0;
+       stored = lp_scene_alloc_aligned(scene, 4 * 16, 16);
   
-    /* XXX temporary: set coverage to 1.0 so the line appears
-     * if AA mode happens to be enabled.
-     */
-    setup->quad[0].input.coverage[0] =
-    setup->quad[0].input.coverage[1] =
-    setup->quad[0].input.coverage[2] =
-    setup->quad[0].input.coverage[3] = 1.0;
- 
-    if (dx > dy) {
-       /*** X-major line ***/
-       int i;
-       const int errorInc = dy + dy;
-       int error = errorInc - dx;
-       const int errorDec = error - dx;
- 
-       for (i = 0; i < dx; i++) {
-          plot(setup, x0, y0);
- 
-          x0 += xstep;
-          if (error < 0) {
-             error += errorInc;
-          }
-          else {
-             error += errorDec;
-             y0 += ystep;
-          }
-       }
-    }
-    else {
-       /*** Y-major line ***/
-       int i;
-       const int errorInc = dx + dx;
-       int error = errorInc - dy;
-       const int errorDec = error - dy;
- 
-       for (i = 0; i < dy; i++) {
-          plot(setup, x0, y0);
- 
-          y0 += ystep;
-          if (error < 0) {
-             error += errorInc;
-          }
-          else {
-             error += errorDec;
-             x0 += xstep;
-          }
+       /* smear each blend color component across 16 ubyte elements */
+       for (i = 0; i < 4; ++i) {
+          uint8_t c = float_to_ubyte(setup->blend_color.current.color[i]);
+          for (j = 0; j < 16; ++j)
+             stored[i*16 + j] = c;
         }
-    }
   
-    /* draw final quad */
-    if (setup->quad[0].inout.mask) {
-       clip_emit_quad( setup, &setup->quad[0] );
+       setup->blend_color.stored = stored;
+ 
+       setup->fs.current.jit_context.blend_color = setup->blend_color.stored;
+       setup->dirty |= LP_SETUP_NEW_FS;
      }
- }
   
+    if (setup->dirty & LP_SETUP_NEW_SCISSOR) {
+       float *stored;
   
- static void
- point_persp_coeff(struct setup_context *setup,
-                   const float (*vert)[4],
-                   unsigned attrib,
-                   uint vertSlot)
- {
-    unsigned i;
-    for(i = 0; i < NUM_CHANNELS; ++i) {
-       setup->coef.dadx[1 + attrib][i] = 0.0F;
-       setup->coef.dady[1 + attrib][i] = 0.0F;
-       setup->coef.a0[1 + attrib][i] = vert[vertSlot][i] * vert[0][3];
-    }
- }
+       stored = lp_scene_alloc_aligned(scene, 4 * sizeof(int32_t), 16);
   
+       stored[0] = (float) setup->scissor.current.minx;
+       stored[1] = (float) setup->scissor.current.miny;
+       stored[2] = (float) setup->scissor.current.maxx;
+       stored[3] = (float) setup->scissor.current.maxy;
   
- /**
-  * Do setup for point rasterization, then render the point.
-  * Round or square points...
-  * XXX could optimize a lot for 1-pixel points.
-  */
- void
- llvmpipe_setup_point( struct setup_context *setup,
-              const float (*v0)[4] )
- {
-    struct llvmpipe_context *llvmpipe = setup->llvmpipe;
-    const struct lp_fragment_shader *lpfs = llvmpipe->fs;
-    const int sizeAttr = setup->llvmpipe->psize_slot;
-    const float size
-       = sizeAttr > 0 ? v0[sizeAttr][0]
-       : setup->llvmpipe->rasterizer->point_size;
-    const float halfSize = 0.5F * size;
-    const boolean round = (boolean) setup->llvmpipe->rasterizer->point_smooth;
-    const float x = v0[0][0];  /* Note: data[0] is always position */
-    const float y = v0[0][1];
-    const struct vertex_info *vinfo = llvmpipe_get_vertex_info(llvmpipe);
-    uint fragSlot;
- 
- #if DEBUG_VERTS
-    debug_printf("Setup point:\n");
-    print_vertex(setup, v0);
- #endif
- 
-    if (llvmpipe->no_rast)
-       return;
+       setup->scissor.stored = stored;
   
-    assert(setup->llvmpipe->reduced_prim == PIPE_PRIM_POINTS);
- 
-    /* For points, all interpolants are constant-valued.
-     * However, for point sprites, we'll need to setup texcoords appropriately.
-     * XXX: which coefficients are the texcoords???
-     * We may do point sprites as textured quads...
-     *
-     * KW: We don't know which coefficients are texcoords - ultimately
-     * the choice of what interpolation mode to use for each attribute
-     * should be determined by the fragment program, using
-     * per-attribute declaration statements that include interpolation
-     * mode as a parameter.  So either the fragment program will have
-     * to be adjusted for pointsprite vs normal point behaviour, or
-     * otherwise a special interpolation mode will have to be defined
-     * which matches the required behaviour for point sprites.  But -
-     * the latter is not a feature of normal hardware, and as such
-     * probably should be ruled out on that basis.
-     */
-    setup->vprovoke = v0;
+       setup->fs.current.jit_context.scissor_xmin = stored[0];
+       setup->fs.current.jit_context.scissor_ymin = stored[1];
+       setup->fs.current.jit_context.scissor_xmax = stored[2];
+       setup->fs.current.jit_context.scissor_ymax = stored[3];
   
-    /* setup Z, W */
-    const_pos_coeff(setup, 0, 2);
-    const_pos_coeff(setup, 0, 3);
+       setup->dirty |= LP_SETUP_NEW_FS;
+    }
   
-    for (fragSlot = 0; fragSlot < lpfs->info.num_inputs; fragSlot++) {
-       const uint vertSlot = vinfo->attrib[fragSlot].src_index;
+    if(setup->dirty & LP_SETUP_NEW_CONSTANTS) {
+       struct pipe_buffer *buffer = setup->constants.current;
   
-       switch (vinfo->attrib[fragSlot].interp_mode) {
-       case INTERP_CONSTANT:
-          /* fall-through */
-       case INTERP_LINEAR:
-          const_coeff(setup, fragSlot, vertSlot);
-          break;
-       case INTERP_PERSPECTIVE:
-          point_persp_coeff(setup, setup->vprovoke, fragSlot, vertSlot);
-          break;
-       case INTERP_POS:
-          setup_fragcoord_coeff(setup, fragSlot);
-          break;
-       default:
-          assert(0);
-       }
+       if(buffer) {
+          unsigned current_size = buffer->size;
+          const void *current_data = llvmpipe_buffer(buffer)->data;
   
-       if (lpfs->info.input_semantic_name[fragSlot] == TGSI_SEMANTIC_FACE) {
-          setup->coef.a0[1 + fragSlot][0] = 1.0f - setup->facing;
-          setup->coef.dadx[1 + fragSlot][0] = 0.0;
-          setup->coef.dady[1 + fragSlot][0] = 0.0;
-       }
-    }
+          /* TODO: copy only the actually used constants? */
   
+          if(setup->constants.stored_size != current_size ||
+             !setup->constants.stored_data ||
+             memcmp(setup->constants.stored_data,
+                    current_data,
+                    current_size) != 0) {
+             void *stored;
   
-    if (halfSize <= 0.5 && !round) {
-       /* special case for 1-pixel points */
-       const int ix = ((int) x) & 1;
-       const int iy = ((int) y) & 1;
-       setup->quad[0].input.x0 = (int) x - ix;
-       setup->quad[0].input.y0 = (int) y - iy;
-       setup->quad[0].inout.mask = (1 << ix) << (2 * iy);
-       clip_emit_quad( setup, &setup->quad[0] );
-    }
-    else {
-       if (round) {
-          /* rounded points */
-          const int ixmin = block((int) (x - halfSize));
-          const int ixmax = block((int) (x + halfSize));
-          const int iymin = block((int) (y - halfSize));
-          const int iymax = block((int) (y + halfSize));
-          const float rmin = halfSize - 0.7071F;  /* 0.7071 = sqrt(2)/2 */
-          const float rmax = halfSize + 0.7071F;
-          const float rmin2 = MAX2(0.0F, rmin * rmin);
-          const float rmax2 = rmax * rmax;
-          const float cscale = 1.0F / (rmax2 - rmin2);
-          int ix, iy;
- 
-          for (iy = iymin; iy <= iymax; iy += 2) {
-             for (ix = ixmin; ix <= ixmax; ix += 2) {
-                float dx, dy, dist2, cover;
- 
-                setup->quad[0].inout.mask = 0x0;
- 
-                dx = (ix + 0.5f) - x;
-                dy = (iy + 0.5f) - y;
-                dist2 = dx * dx + dy * dy;
-                if (dist2 <= rmax2) {
-                   cover = 1.0F - (dist2 - rmin2) * cscale;
-                   setup->quad[0].input.coverage[QUAD_TOP_LEFT] = MIN2(cover, 1.0f);
-                   setup->quad[0].inout.mask |= MASK_TOP_LEFT;
-                }
- 
-                dx = (ix + 1.5f) - x;
-                dy = (iy + 0.5f) - y;
-                dist2 = dx * dx + dy * dy;
-                if (dist2 <= rmax2) {
-                   cover = 1.0F - (dist2 - rmin2) * cscale;
-                   setup->quad[0].input.coverage[QUAD_TOP_RIGHT] = MIN2(cover, 1.0f);
-                   setup->quad[0].inout.mask |= MASK_TOP_RIGHT;
-                }
- 
-                dx = (ix + 0.5f) - x;
-                dy = (iy + 1.5f) - y;
-                dist2 = dx * dx + dy * dy;
-                if (dist2 <= rmax2) {
-                   cover = 1.0F - (dist2 - rmin2) * cscale;
-                   setup->quad[0].input.coverage[QUAD_BOTTOM_LEFT] = MIN2(cover, 1.0f);
-                   setup->quad[0].inout.mask |= MASK_BOTTOM_LEFT;
-                }
- 
-                dx = (ix + 1.5f) - x;
-                dy = (iy + 1.5f) - y;
-                dist2 = dx * dx + dy * dy;
-                if (dist2 <= rmax2) {
-                   cover = 1.0F - (dist2 - rmin2) * cscale;
-                   setup->quad[0].input.coverage[QUAD_BOTTOM_RIGHT] = MIN2(cover, 1.0f);
-                   setup->quad[0].inout.mask |= MASK_BOTTOM_RIGHT;
-                }
- 
-                if (setup->quad[0].inout.mask) {
-                   setup->quad[0].input.x0 = ix;
-                   setup->quad[0].input.y0 = iy;
-                   clip_emit_quad( setup, &setup->quad[0] );
-                }
+             stored = lp_scene_alloc(scene, current_size);
+             if(stored) {
+                memcpy(stored,
+                       current_data,
+                       current_size);
+                setup->constants.stored_size = current_size;
+                setup->constants.stored_data = stored;
               }
            }
         }
         else {
-          /* square points */
-          const int xmin = (int) (x + 0.75 - halfSize);
-          const int ymin = (int) (y + 0.25 - halfSize);
-          const int xmax = xmin + (int) size;
-          const int ymax = ymin + (int) size;
-          /* XXX could apply scissor to xmin,ymin,xmax,ymax now */
-          const int ixmin = block(xmin);
-          const int ixmax = block(xmax - 1);
-          const int iymin = block(ymin);
-          const int iymax = block(ymax - 1);
-          int ix, iy;
- 
-          /*
-          debug_printf("(%f, %f) -> X:%d..%d Y:%d..%d\n", x, y, xmin, xmax,ymin,ymax);
-          */
-          for (iy = iymin; iy <= iymax; iy += 2) {
-             uint rowMask = 0xf;
-             if (iy < ymin) {
-                /* above the top edge */
-                rowMask &= (MASK_BOTTOM_LEFT | MASK_BOTTOM_RIGHT);
-             }
-             if (iy + 1 >= ymax) {
-                /* below the bottom edge */
-                rowMask &= (MASK_TOP_LEFT | MASK_TOP_RIGHT);
-             }
+          setup->constants.stored_size = 0;
+          setup->constants.stored_data = NULL;
+       }
   
-             for (ix = ixmin; ix <= ixmax; ix += 2) {
-                uint mask = rowMask;
- 
-                if (ix < xmin) {
-                   /* fragment is past left edge of point, turn off left bits */
-                   mask &= (MASK_BOTTOM_RIGHT | MASK_TOP_RIGHT);
-                }
-                if (ix + 1 >= xmax) {
-                   /* past the right edge */
-                   mask &= (MASK_BOTTOM_LEFT | MASK_TOP_LEFT);
-                }
- 
-                setup->quad[0].inout.mask = mask;
-                setup->quad[0].input.x0 = ix;
-                setup->quad[0].input.y0 = iy;
-                clip_emit_quad( setup, &setup->quad[0] );
-             }
+       setup->fs.current.jit_context.constants = setup->constants.stored_data;
+       setup->dirty |= LP_SETUP_NEW_FS;
+    }
+ 
+ 
+    if(setup->dirty & LP_SETUP_NEW_FS) {
+       if(!setup->fs.stored ||
+          memcmp(setup->fs.stored,
+                 &setup->fs.current,
+                 sizeof setup->fs.current) != 0) {
+          /* The fs state that's been stored in the scene is different from
+           * the new, current state.  So allocate a new lp_rast_state object
+           * and append it to the bin's setup data buffer.
+           */
+          struct lp_rast_state *stored =
+             (struct lp_rast_state *) lp_scene_alloc(scene, sizeof *stored);
+          if(stored) {
+             memcpy(stored,
+                    &setup->fs.current,
+                    sizeof setup->fs.current);
+             setup->fs.stored = stored;
+ 
+             /* put the state-set command into all bins */
+             lp_scene_bin_state_command( scene,
+                                       lp_rast_set_state, 
+                                       lp_rast_arg_state(setup->fs.stored) );
            }
         }
      }
+ 
+    setup->dirty = 0;
+ 
+    assert(setup->fs.stored);
   }
   
- void llvmpipe_setup_prepare( struct setup_context *setup )
+ 
+ 
+ /* Only caller is lp_setup_vbuf_destroy()
+  */
+ void 
+ lp_setup_destroy( struct setup_context *setup )
   {
-    struct llvmpipe_context *lp = setup->llvmpipe;
+    reset_context( setup );
   
-    if (lp->dirty) {
-       llvmpipe_update_derived(lp);
-    }
+    pipe_buffer_reference(&setup->constants.current, NULL);
   
-    if (lp->reduced_api_prim == PIPE_PRIM_TRIANGLES &&
-        lp->rasterizer->fill_cw == PIPE_POLYGON_MODE_FILL &&
-        lp->rasterizer->fill_ccw == PIPE_POLYGON_MODE_FILL) {
-       /* we'll do culling */
-       setup->winding = lp->rasterizer->cull_mode;
-    }
-    else {
-       /* 'draw' will do culling */
-       setup->winding = PIPE_WINDING_NONE;
+    /* free the scenes in the 'empty' queue */
+    while (1) {
+       struct lp_scene *scene = lp_scene_dequeue(setup->empty_scenes, FALSE);
+       if (!scene)
+          break;
+       lp_scene_destroy(scene);
      }
- }
- 
   
+    lp_rast_destroy( setup->rast );
   
- void llvmpipe_setup_destroy_context( struct setup_context *setup )
- {
-    align_free( setup );
+    FREE( setup );
   }
   
   
   /**
-  * Create a new primitive setup/render stage.
+  * Create a new primitive tiling engine.  Plug it into the backend of
+  * the draw module.  Currently also creates a rasterizer to use with
+  * it.
    */
- struct setup_context *llvmpipe_setup_create_context( struct llvmpipe_context *llvmpipe )
+ struct setup_context *
+ lp_setup_create( struct pipe_screen *screen,
+                  struct draw_context *draw )
   {
-    struct setup_context *setup;
      unsigned i;
+    struct setup_context *setup = CALLOC_STRUCT(setup_context);
   
-    setup = align_malloc(sizeof(struct setup_context), 16);
      if (!setup)
         return NULL;
   
-    memset(setup, 0, sizeof *setup);
-    setup->llvmpipe = llvmpipe;
+    lp_setup_init_vbuf(setup);
+ 
+    setup->empty_scenes = lp_scene_queue_create();
+    if (!setup->empty_scenes)
+       goto fail;
   
-    for (i = 0; i < MAX_QUADS; i++) {
-       setup->quad[i].coef = &setup->coef;
+    setup->rast = lp_rast_create( screen, setup->empty_scenes );
+    if (!setup->rast) 
+       goto fail;
+ 
+    setup->vbuf = draw_vbuf_stage(draw, &setup->base);
+    if (!setup->vbuf)
+       goto fail;
+ 
+    draw_set_rasterize_stage(draw, setup->vbuf);
+    draw_set_render(draw, &setup->base);
+ 
+    /* create some empty scenes */
+    for (i = 0; i < MAX_SCENES; i++) {
+       setup->scenes[i] = lp_scene_create();
+       lp_scene_enqueue(setup->empty_scenes, setup->scenes[i]);
      }
   
-    setup->span.left[0] = 1000000;     /* greater than right[0] */
-    setup->span.left[1] = 1000000;     /* greater than right[1] */
+    setup->triangle = first_triangle;
+    setup->line     = first_line;
+    setup->point    = first_point;
+    
+    setup->dirty = ~0;
   
      return setup;
+ 
+ fail:
+    if (setup->rast)
+       lp_rast_destroy( setup->rast );
+    
+    if (setup->vbuf)
+       ;
+ 
+    if (setup->empty_scenes)
+       lp_scene_queue_destroy(setup->empty_scenes);
+ 
+    FREE(setup);
+    return NULL;
   }
   
diff --combined src/gallium/drivers/llvmpipe/lp_state_fs.c

index f3263cf37700610e438b12a45389021b0c7239cb,0602e940d925994033aca025b4a0c0eb59f9e6b7..c5f6df23a1693b6a133eacfe07158ccb791d93e6
--- 1/src/gallium/drivers/llvmpipe/lp_state_fs.c
--- 2/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@@ -31,6 -31,8 +31,8 @@@
    * Code generate the whole fragment pipeline.
    *
    * The fragment pipeline consists of the following stages:
+  * - triangle edge in/out testing
+  * - scissor test
    * - stipple (TBI)
    * - early depth test
    * - fragment shader
@@@ -58,11 -60,12 +60,13 @@@
    * @author Jose Fonseca <jfonseca@vmware.com>
    */
   
+ #include <limits.h>
   #include "pipe/p_defines.h"
+ +#include "util/u_inlines.h"
   #include "util/u_memory.h"
   #include "util/u_format.h"
   #include "util/u_debug_dump.h"
+ #include "util/u_time.h"
   #include "pipe/p_shader_tokens.h"
   #include "draw/draw_context.h"
   #include "tgsi/tgsi_dump.h"
@@@ -81,12 -84,14 +85,14 @@@
   #include "lp_bld_swizzle.h"
   #include "lp_bld_flow.h"
   #include "lp_bld_debug.h"
- #include "lp_screen.h"
- #include "lp_context.h"
   #include "lp_buffer.h"
+ #include "lp_context.h"
+ #include "lp_debug.h"
+ #include "lp_perf.h"
+ #include "lp_screen.h"
+ #include "lp_setup.h"
   #include "lp_state.h"
   #include "lp_tex_sample.h"
- #include "lp_debug.h"
   
   
   static const unsigned char quad_offset_x[4] = {0, 1, 0, 1};
@@@ -185,8 -190,188 +191,188 @@@ generate_depth(LLVMBuilderRef builder
   }
   
   
+ /**
+  * Generate the code to do inside/outside triangle testing for the
+  * four pixels in a 2x2 quad.  Stores in *mask the AND of the three edge
+  * compares step{0,1,2}_ptr[i] > c{0,1,2}; each element is 0 or ~0.
+  * \param i  which quad of the quad group to test, in [0,3]
+  */
+ static void
+ generate_tri_edge_mask(LLVMBuilderRef builder,
+                        unsigned i,
+                        LLVMValueRef *mask,      /* ivec4, out */
+                        LLVMValueRef c0,         /* int32 */
+                        LLVMValueRef c1,         /* int32 */
+                        LLVMValueRef c2,         /* int32 */
+                        LLVMValueRef step0_ptr,  /* ptr to ivec4s, indexed by i */
+                        LLVMValueRef step1_ptr,  /* ptr to ivec4s, indexed by i */
+                        LLVMValueRef step2_ptr)  /* ptr to ivec4s, indexed by i */
+ {
+ #define OPTIMIZE_IN_OUT_TEST 0
+ #if OPTIMIZE_IN_OUT_TEST
+    struct lp_build_if_state ifctx;
+    LLVMValueRef not_draw_all;
+ #endif
+    struct lp_build_flow_context *flow;
+    struct lp_type i32_type;
+    LLVMTypeRef i32vec4_type, mask_type;
+    LLVMValueRef c0_vec, c1_vec, c2_vec;
+    LLVMValueRef in_out_mask;
+ 
+    assert(i < 4);
+    
+    /* int32 vector type */
+    memset(&i32_type, 0, sizeof i32_type);
+    i32_type.floating = FALSE; /* values are integers */
+    i32_type.sign = TRUE;      /* values are signed */
+    i32_type.norm = FALSE;     /* values are not normalized */
+    i32_type.width = 32;       /* 32-bit int values */
+    i32_type.length = 4;       /* 4 elements per vector */
+ 
+    i32vec4_type = lp_build_int32_vec4_type();
+ 
+    mask_type = LLVMIntType(32 * 4); /* NOTE(review): assigned but never used in this function */
+ 
+    /*
+     * Detailed pixel in/out testing.  The conditional that skips it when
+     * c0 == INT_MIN is only emitted if OPTIMIZE_IN_OUT_TEST is nonzero.
+     */
+    flow = lp_build_flow_create(builder);
+    lp_build_flow_scope_begin(flow);
+ 
+    {
+ #if OPTIMIZE_IN_OUT_TEST
+       /* not_draw_all = (c0 != INT_MIN) */
+       not_draw_all = LLVMBuildICmp(builder,
+                                    LLVMIntNE,
+                                    c0,
+                                    LLVMConstInt(LLVMInt32Type(), INT_MIN, 0),
+                                    "");
+ 
+       in_out_mask = lp_build_int_const_scalar(i32_type, ~0);
+ 
+ 
+       lp_build_flow_scope_declare(flow, &in_out_mask);
+ 
+       /* if (not_draw_all) {... */
+       lp_build_if(&ifctx, flow, builder, not_draw_all);
+ #endif
+       {
+          LLVMValueRef step0_vec, step1_vec, step2_vec;
+          LLVMValueRef m0_vec, m1_vec, m2_vec;
+          LLVMValueRef index, m;
+ 
+          /* c0_vec = {c0, c0, c0, c0}
+           * Note that we emit this code four times but LLVM optimizes away
+           * three instances of it.
+           */
+          c0_vec = lp_build_broadcast(builder, i32vec4_type, c0);
+          c1_vec = lp_build_broadcast(builder, i32vec4_type, c1);
+          c2_vec = lp_build_broadcast(builder, i32vec4_type, c2);
+          lp_build_name(c0_vec, "edgeconst0vec");
+          lp_build_name(c1_vec, "edgeconst1vec");
+          lp_build_name(c2_vec, "edgeconst2vec");
+ 
+          /* load element i of each step array: step0_vec, step1_vec, step2_vec */
+          index = LLVMConstInt(LLVMInt32Type(), i, 0);
+          step0_vec = LLVMBuildLoad(builder, LLVMBuildGEP(builder, step0_ptr, &index, 1, ""), "");
+          step1_vec = LLVMBuildLoad(builder, LLVMBuildGEP(builder, step1_ptr, &index, 1, ""), "");
+          step2_vec = LLVMBuildLoad(builder, LLVMBuildGEP(builder, step2_ptr, &index, 1, ""), "");
+          lp_build_name(step0_vec, "step0vec");
+          lp_build_name(step1_vec, "step1vec");
+          lp_build_name(step2_vec, "step2vec");
+ 
+          /* m0_vec = step0_ptr[i] > c0_vec  (likewise m1_vec, m2_vec) */
+          m0_vec = lp_build_compare(builder, i32_type, PIPE_FUNC_GREATER, step0_vec, c0_vec);
+          m1_vec = lp_build_compare(builder, i32_type, PIPE_FUNC_GREATER, step1_vec, c1_vec);
+          m2_vec = lp_build_compare(builder, i32_type, PIPE_FUNC_GREATER, step2_vec, c2_vec);
+ 
+          /* in_out_mask = m0_vec & m1_vec & m2_vec */
+          m = LLVMBuildAnd(builder, m0_vec, m1_vec, "");
+          in_out_mask = LLVMBuildAnd(builder, m, m2_vec, "");
+          lp_build_name(in_out_mask, "inoutmaskvec");
+       }
+ #if OPTIMIZE_IN_OUT_TEST
+       lp_build_endif(&ifctx);
+ #endif
+ 
+    }
+    lp_build_flow_scope_end(flow);
+    lp_build_flow_destroy(flow);
+ 
+    /* This is the initial alive/dead pixel mask for a quad of four pixels.
+     * It's an int[4] vector with each word set to 0 or ~0.
+     * Words will get cleared when pixels fail the Z test, etc.
+     */
+    *mask = in_out_mask;
+ }
+ 
+ 
+ static LLVMValueRef
+ generate_scissor_test(LLVMBuilderRef builder,
+                       LLVMValueRef context_ptr,
+                       const struct lp_build_interp_soa_context *interp,
+                       struct lp_type type)
+ {
+    LLVMTypeRef vec_type = lp_build_vec_type(type);
+    LLVMValueRef xpos = interp->pos[0], ypos = interp->pos[1];
+    LLVMValueRef xmin, ymin, xmax, ymax;
+    LLVMValueRef m0, m1, m2, m3, m;
+ 
+    /* Builds the scissor mask; xpos, ypos hold the window coords of the quad's four pixels */
+    assert(xpos);
+    assert(ypos);
+ 
+    /* fetch each scissor bound through context_ptr and broadcast it to vec_type */
+    xmin = lp_jit_context_scissor_xmin_value(builder, context_ptr);
+    xmin = lp_build_broadcast(builder, vec_type, xmin);
+ 
+    ymin = lp_jit_context_scissor_ymin_value(builder, context_ptr);
+    ymin = lp_build_broadcast(builder, vec_type, ymin);
+ 
+    xmax = lp_jit_context_scissor_xmax_value(builder, context_ptr);
+    xmax = lp_build_broadcast(builder, vec_type, xmax);
+ 
+    ymax = lp_jit_context_scissor_ymax_value(builder, context_ptr);
+    ymax = lp_build_broadcast(builder, vec_type, ymax);
+ 
+    /* compare position against the scissor bounds: min inclusive (>=), max exclusive (<) */
+    m0 = lp_build_compare(builder, type, PIPE_FUNC_GEQUAL, xpos, xmin);
+    m1 = lp_build_compare(builder, type, PIPE_FUNC_GEQUAL, ypos, ymin);
+    m2 = lp_build_compare(builder, type, PIPE_FUNC_LESS, xpos, xmax);
+    m3 = lp_build_compare(builder, type, PIPE_FUNC_LESS, ypos, ymax);
+ 
+    /* AND all the masks together: a pixel passes only if all four tests pass */
+    m = LLVMBuildAnd(builder, m0, m1, "");
+    m = LLVMBuildAnd(builder, m, m2, "");
+    m = LLVMBuildAnd(builder, m, m3, "");
+ 
+    lp_build_name(m, "scissormask");
+ 
+    return m;
+ }
+ 
+ 
+ static LLVMValueRef
+ build_int32_vec_const(int value)
+ {
+    struct lp_type i32_type;
+ 
+    memset(&i32_type, 0, sizeof i32_type); /* same int32 x 4 lp_type setup as in generate_tri_edge_mask */
+    i32_type.floating = FALSE; /* values are integers */
+    i32_type.sign = TRUE;      /* values are signed */
+    i32_type.norm = FALSE;     /* values are not normalized */
+    i32_type.width = 32;       /* 32-bit int values */
+    i32_type.length = 4;       /* 4 elements per vector */
+    return lp_build_int_const_scalar(i32_type, value); /* NOTE(review): presumably 'value' in all 4 elements -- confirm */
+ }
+ 
+ 
+ 
   /**
    * Generate the fragment shader, depth/stencil test, and alpha tests.
+  * \param i  which quad in the tile, in range [0,3]
+  * \param do_tri_test  if 1, do triangle edge in/out testing
    */
   static void
   generate_fs(struct llvmpipe_context *lp,
@@@ -199,8 -384,15 +385,15 @@@
               const struct lp_build_interp_soa_context *interp,
               struct lp_build_sampler_soa *sampler,
               LLVMValueRef *pmask,
-             LLVMValueRef *color,
-             LLVMValueRef depth_ptr)
+             LLVMValueRef (*color)[4],
+             LLVMValueRef depth_ptr,
+             unsigned do_tri_test,
+             LLVMValueRef c0,
+             LLVMValueRef c1,
+             LLVMValueRef c2,
+             LLVMValueRef step0_ptr,
+             LLVMValueRef step1_ptr,
+             LLVMValueRef step2_ptr)
   {
      const struct tgsi_token *tokens = shader->base.tokens;
      LLVMTypeRef elem_type;
@@@ -214,6 -406,9 +407,9 @@@
      boolean early_depth_test;
      unsigned attrib;
      unsigned chan;
+    unsigned cbuf;
+ 
+    assert(i < 4);
   
      elem_type = lp_build_elem_type(type);
      vec_type = lp_build_vec_type(type);
@@@ -228,14 -423,32 +424,32 @@@
      lp_build_flow_scope_begin(flow);
   
      /* Declare the color and z variables */
-    for(chan = 0; chan < NUM_CHANNELS; ++chan) {
-       color[chan] = LLVMGetUndef(vec_type);
-       lp_build_flow_scope_declare(flow, &color[chan]);
+    for(cbuf = 0; cbuf < key->nr_cbufs; cbuf++) {
+       for(chan = 0; chan < NUM_CHANNELS; ++chan) {
+        color[cbuf][chan] = LLVMGetUndef(vec_type);
+        lp_build_flow_scope_declare(flow, &color[cbuf][chan]);
+       }
      }
      lp_build_flow_scope_declare(flow, &z);
   
+    /* do triangle edge testing */
+    if (do_tri_test) {
+       generate_tri_edge_mask(builder, i, pmask,
+                              c0, c1, c2, step0_ptr, step1_ptr, step2_ptr);
+    }
+    else {
+       *pmask = build_int32_vec_const(~0);
+    }
+ 
+    /* 'mask' will control execution based on quad's pixel alive/killed state */
      lp_build_mask_begin(&mask, flow, type, *pmask);
   
+    if (key->scissor) {
+       LLVMValueRef smask =
+          generate_scissor_test(builder, context_ptr, interp, type);
+       lp_build_mask_update(&mask, smask);
+    }
+ 
      early_depth_test =
         key->depth.enabled &&
         !key->alpha.enabled &&
@@@ -265,6 -478,7 +479,7 @@@
   
                     /* Alpha test */
                     /* XXX: should the alpha reference value be passed separately? */
+                 /* XXX: should only test the final assignment to alpha */
                     if(cbuf == 0 && chan == 3) {
                        LLVMValueRef alpha = outputs[attrib][chan];
                        LLVMValueRef alpha_ref_value;
@@@ -274,9 -488,7 +489,7 @@@
                                            &mask, alpha, alpha_ref_value);
                     }
   
-                   if(cbuf == 0)
-                      color[chan] = outputs[attrib][chan];
- 
+                 color[cbuf][chan] = outputs[attrib][chan];
                     break;
                  }
   
@@@ -331,6 -543,8 +544,8 @@@ generate_blend(const struct pipe_blend_
      lp_build_context_init(&bld, builder, type);
   
      flow = lp_build_flow_create(builder);
+ 
+    /* we'll use this mask context to skip blending if all pixels are dead */
      lp_build_mask_begin(&mask_ctx, flow, type, mask);
   
      vec_type = lp_build_vec_type(type);
@@@ -368,14 -582,18 +583,18 @@@
   
   /**
    * Generate the runtime callable function for the whole fragment pipeline.
+  * Note that the function which we generate operates on a block of 16
+  * pixels at a time.  The block contains 2x2 quads.  Each quad contains
+  * 2x2 pixels.
    */
- static struct lp_fragment_shader_variant *
+ static void
   generate_fragment(struct llvmpipe_context *lp,
                     struct lp_fragment_shader *shader,
-                   const struct lp_fragment_shader_variant_key *key)
+                   struct lp_fragment_shader_variant *variant,
+                   unsigned do_tri_test)
   {
      struct llvmpipe_screen *screen = llvmpipe_screen(lp->pipe.screen);
-    struct lp_fragment_shader_variant *variant;
+    const struct lp_fragment_shader_variant_key *key = &variant->key;
      struct lp_type fs_type;
      struct lp_type blend_type;
      LLVMTypeRef fs_elem_type;
@@@ -383,17 -601,18 +602,18 @@@
      LLVMTypeRef fs_int_vec_type;
      LLVMTypeRef blend_vec_type;
      LLVMTypeRef blend_int_vec_type;
-    LLVMTypeRef arg_types[9];
+    LLVMTypeRef arg_types[14];
      LLVMTypeRef func_type;
+    LLVMTypeRef int32_vec4_type = lp_build_int32_vec4_type();
      LLVMValueRef context_ptr;
      LLVMValueRef x;
      LLVMValueRef y;
      LLVMValueRef a0_ptr;
      LLVMValueRef dadx_ptr;
      LLVMValueRef dady_ptr;
-    LLVMValueRef mask_ptr;
-    LLVMValueRef color_ptr;
+    LLVMValueRef color_ptr_ptr;
      LLVMValueRef depth_ptr;
+    LLVMValueRef c0, c1, c2, step0_ptr, step1_ptr, step2_ptr;
      LLVMBasicBlockRef block;
      LLVMBuilderRef builder;
      LLVMValueRef x0;
@@@ -401,71 -620,15 +621,15 @@@
      struct lp_build_sampler_soa *sampler;
      struct lp_build_interp_soa_context interp;
      LLVMValueRef fs_mask[LP_MAX_VECTOR_LENGTH];
-    LLVMValueRef fs_out_color[NUM_CHANNELS][LP_MAX_VECTOR_LENGTH];
+    LLVMValueRef fs_out_color[PIPE_MAX_COLOR_BUFS][NUM_CHANNELS][LP_MAX_VECTOR_LENGTH];
      LLVMValueRef blend_mask;
      LLVMValueRef blend_in_color[NUM_CHANNELS];
+    LLVMValueRef function;
      unsigned num_fs;
      unsigned i;
      unsigned chan;
+    unsigned cbuf;
   
-    if (LP_DEBUG & DEBUG_JIT) {
-       tgsi_dump(shader->base.tokens, 0);
-       if(key->depth.enabled) {
-          debug_printf("depth.format = %s\n", pf_name(key->zsbuf_format));
-          debug_printf("depth.func = %s\n", debug_dump_func(key->depth.func, TRUE));
-          debug_printf("depth.writemask = %u\n", key->depth.writemask);
-       }
-       if(key->alpha.enabled) {
-          debug_printf("alpha.func = %s\n", debug_dump_func(key->alpha.func, TRUE));
-          debug_printf("alpha.ref_value = %f\n", key->alpha.ref_value);
-       }
-       if(key->blend.logicop_enable) {
-          debug_printf("blend.logicop_func = %u\n", key->blend.logicop_func);
-       }
-       else if(key->blend.rt[0].blend_enable) {
-          debug_printf("blend.rgb_func = %s\n",   debug_dump_blend_func  (key->blend.rt[0].rgb_func, TRUE));
-          debug_printf("rgb_src_factor = %s\n",   debug_dump_blend_factor(key->blend.rt[0].rgb_src_factor, TRUE));
-          debug_printf("rgb_dst_factor = %s\n",   debug_dump_blend_factor(key->blend.rt[0].rgb_dst_factor, TRUE));
-          debug_printf("alpha_func = %s\n",       debug_dump_blend_func  (key->blend.rt[0].alpha_func, TRUE));
-          debug_printf("alpha_src_factor = %s\n", debug_dump_blend_factor(key->blend.rt[0].alpha_src_factor, TRUE));
-          debug_printf("alpha_dst_factor = %s\n", debug_dump_blend_factor(key->blend.rt[0].alpha_dst_factor, TRUE));
-       }
-       debug_printf("blend.colormask = 0x%x\n", key->blend.rt[0].colormask);
-       for(i = 0; i < PIPE_MAX_SAMPLERS; ++i) {
-          if(key->sampler[i].format) {
-             debug_printf("sampler[%u] = \n", i);
-             debug_printf("  .format = %s\n",
-                          pf_name(key->sampler[i].format));
-             debug_printf("  .target = %s\n",
-                          debug_dump_tex_target(key->sampler[i].target, TRUE));
-             debug_printf("  .pot = %u %u %u\n",
-                          key->sampler[i].pot_width,
-                          key->sampler[i].pot_height,
-                          key->sampler[i].pot_depth);
-             debug_printf("  .wrap = %s %s %s\n",
-                          debug_dump_tex_wrap(key->sampler[i].wrap_s, TRUE),
-                          debug_dump_tex_wrap(key->sampler[i].wrap_t, TRUE),
-                          debug_dump_tex_wrap(key->sampler[i].wrap_r, TRUE));
-             debug_printf("  .min_img_filter = %s\n",
-                          debug_dump_tex_filter(key->sampler[i].min_img_filter, TRUE));
-             debug_printf("  .min_mip_filter = %s\n",
-                          debug_dump_tex_mipfilter(key->sampler[i].min_mip_filter, TRUE));
-             debug_printf("  .mag_img_filter = %s\n",
-                          debug_dump_tex_filter(key->sampler[i].mag_img_filter, TRUE));
-             if(key->sampler[i].compare_mode != PIPE_TEX_COMPARE_NONE)
-                debug_printf("  .compare_func = %s\n", debug_dump_func(key->sampler[i].compare_func, TRUE));
-             debug_printf("  .normalized_coords = %u\n", key->sampler[i].normalized_coords);
-             debug_printf("  .prefilter = %u\n", key->sampler[i].prefilter);
-          }
-       }
-    }
- 
-    variant = CALLOC_STRUCT(lp_fragment_shader_variant);
-    if(!variant)
-       return NULL;
- 
-    variant->shader = shader;
-    memcpy(&variant->key, key, sizeof *key);
   
      /* TODO: actually pick these based on the fs and color buffer
       * characteristics. */
@@@ -475,8 -638,8 +639,8 @@@
      fs_type.sign = TRUE;     /* values are signed */
      fs_type.norm = FALSE;    /* values are not limited to [0,1] or [-1,1] */
      fs_type.width = 32;      /* 32-bit float */
-    fs_type.length = 4;      /* 4 element per vector */
-    num_fs = 4;
+    fs_type.length = 4;      /* 4 elements per vector */
+    num_fs = 4;              /* number of quads per block */
   
      memset(&blend_type, 0, sizeof blend_type);
      blend_type.floating = FALSE; /* values are integers */
@@@ -503,27 -666,47 +667,47 @@@
      arg_types[3] = LLVMPointerType(fs_elem_type, 0);    /* a0 */
      arg_types[4] = LLVMPointerType(fs_elem_type, 0);    /* dadx */
      arg_types[5] = LLVMPointerType(fs_elem_type, 0);    /* dady */
-    arg_types[6] = LLVMPointerType(fs_int_vec_type, 0); /* mask */
-    arg_types[7] = LLVMPointerType(blend_vec_type, 0);  /* color */
-    arg_types[8] = LLVMPointerType(fs_int_vec_type, 0); /* depth */
+    arg_types[6] = LLVMPointerType(LLVMPointerType(blend_vec_type, 0), 0);  /* color */
+    arg_types[7] = LLVMPointerType(fs_int_vec_type, 0); /* depth */
+    arg_types[8] = LLVMInt32Type();                     /* c0 */
+    arg_types[9] = LLVMInt32Type();                     /* c1 */
+    arg_types[10] = LLVMInt32Type();                    /* c2 */
+    /* Note: the step arrays are built as int32[16] but we interpret
+     * them here as int32_vec4[4].
+     */
+    arg_types[11] = LLVMPointerType(int32_vec4_type, 0);/* step0 */
+    arg_types[12] = LLVMPointerType(int32_vec4_type, 0);/* step1 */
+    arg_types[13] = LLVMPointerType(int32_vec4_type, 0);/* step2 */
   
      func_type = LLVMFunctionType(LLVMVoidType(), arg_types, Elements(arg_types), 0);
   
-    variant->function = LLVMAddFunction(screen->module, "shader", func_type);
-    LLVMSetFunctionCallConv(variant->function, LLVMCCallConv);
+    function = LLVMAddFunction(screen->module, "shader", func_type);
+    LLVMSetFunctionCallConv(function, LLVMCCallConv);
+ 
+    variant->function[do_tri_test] = function;
+ 
+ 
+    /* XXX: need to propagate noalias down into color param now we are
+     * passing a pointer-to-pointer?
+     */
      for(i = 0; i < Elements(arg_types); ++i)
         if(LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind)
-          LLVMAddAttribute(LLVMGetParam(variant->function, i), LLVMNoAliasAttribute);
- 
-    context_ptr  = LLVMGetParam(variant->function, 0);
-    x            = LLVMGetParam(variant->function, 1);
-    y            = LLVMGetParam(variant->function, 2);
-    a0_ptr       = LLVMGetParam(variant->function, 3);
-    dadx_ptr     = LLVMGetParam(variant->function, 4);
-    dady_ptr     = LLVMGetParam(variant->function, 5);
-    mask_ptr     = LLVMGetParam(variant->function, 6);
-    color_ptr    = LLVMGetParam(variant->function, 7);
-    depth_ptr    = LLVMGetParam(variant->function, 8);
+          LLVMAddAttribute(LLVMGetParam(function, i), LLVMNoAliasAttribute);
+ 
+    context_ptr  = LLVMGetParam(function, 0);
+    x            = LLVMGetParam(function, 1);
+    y            = LLVMGetParam(function, 2);
+    a0_ptr       = LLVMGetParam(function, 3);
+    dadx_ptr     = LLVMGetParam(function, 4);
+    dady_ptr     = LLVMGetParam(function, 5);
+    color_ptr_ptr = LLVMGetParam(function, 6);
+    depth_ptr    = LLVMGetParam(function, 7);
+    c0           = LLVMGetParam(function, 8);
+    c1           = LLVMGetParam(function, 9);
+    c2           = LLVMGetParam(function, 10);
+    step0_ptr    = LLVMGetParam(function, 11);
+    step1_ptr    = LLVMGetParam(function, 12);
+    step2_ptr    = LLVMGetParam(function, 13);
   
      lp_build_name(context_ptr, "context");
      lp_build_name(x, "x");
@@@ -531,36 -714,45 +715,45 @@@
      lp_build_name(a0_ptr, "a0");
      lp_build_name(dadx_ptr, "dadx");
      lp_build_name(dady_ptr, "dady");
-    lp_build_name(mask_ptr, "mask");
-    lp_build_name(color_ptr, "color");
+    lp_build_name(color_ptr_ptr, "color_ptr");
      lp_build_name(depth_ptr, "depth");
+    lp_build_name(c0, "c0");
+    lp_build_name(c1, "c1");
+    lp_build_name(c2, "c2");
+    lp_build_name(step0_ptr, "step0");
+    lp_build_name(step1_ptr, "step1");
+    lp_build_name(step2_ptr, "step2");
   
      /*
       * Function body
       */
   
-    block = LLVMAppendBasicBlock(variant->function, "entry");
+    block = LLVMAppendBasicBlock(function, "entry");
      builder = LLVMCreateBuilder();
      LLVMPositionBuilderAtEnd(builder, block);
   
      generate_pos0(builder, x, y, &x0, &y0);
   
-    lp_build_interp_soa_init(&interp, shader->base.tokens, builder, fs_type,
+    lp_build_interp_soa_init(&interp, 
+                             shader->base.tokens,
+                             key->flatshade,
+                             builder, fs_type,
                               a0_ptr, dadx_ptr, dady_ptr,
-                             x0, y0, 2, 0);
+                             x0, y0);
   
      /* code generated texture sampling */
      sampler = lp_llvm_sampler_soa_create(key->sampler, context_ptr);
   
+    /* loop over quads in the block */
      for(i = 0; i < num_fs; ++i) {
         LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
-       LLVMValueRef out_color[NUM_CHANNELS];
+       LLVMValueRef out_color[PIPE_MAX_COLOR_BUFS][NUM_CHANNELS];
         LLVMValueRef depth_ptr_i;
+       int cbuf;
   
         if(i != 0)
-          lp_build_interp_soa_update(&interp);
+          lp_build_interp_soa_update(&interp, i);
   
-       fs_mask[i] = LLVMBuildLoad(builder, LLVMBuildGEP(builder, mask_ptr, &index, 1, ""), "");
         depth_ptr_i = LLVMBuildGEP(builder, depth_ptr, &index, 1, "");
   
         generate_fs(lp, shader, key,
@@@ -570,71 -762,163 +763,163 @@@
                     i,
                     &interp,
                     sampler,
-                   &fs_mask[i],
+                   &fs_mask[i], /* output */
                     out_color,
-                   depth_ptr_i);
- 
-       for(chan = 0; chan < NUM_CHANNELS; ++chan)
-          fs_out_color[chan][i] = out_color[chan];
+                   depth_ptr_i,
+                   do_tri_test,
+                   c0, c1, c2,
+                   step0_ptr, step1_ptr, step2_ptr);
+ 
+       for(cbuf = 0; cbuf < key->nr_cbufs; cbuf++)
+        for(chan = 0; chan < NUM_CHANNELS; ++chan)
+           fs_out_color[cbuf][chan][i] = out_color[cbuf][chan];
      }
   
      sampler->destroy(sampler);
   
-    /* 
-     * Convert the fs's output color and mask to fit to the blending type. 
+    /* Loop over color outputs / color buffers to do blending.
       */
+    for(cbuf = 0; cbuf < key->nr_cbufs; cbuf++) {
+       LLVMValueRef color_ptr;
+       LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), cbuf, 0);
   
-    for(chan = 0; chan < NUM_CHANNELS; ++chan) {
-       lp_build_conv(builder, fs_type, blend_type,
-                     fs_out_color[chan], num_fs,
-                     &blend_in_color[chan], 1);
-       lp_build_name(blend_in_color[chan], "color.%c", "rgba"[chan]);
+       /* 
+        * Convert the fs's output color and mask to fit to the blending type. 
+        */
+       for(chan = 0; chan < NUM_CHANNELS; ++chan) {
+        lp_build_conv(builder, fs_type, blend_type,
+                      fs_out_color[cbuf][chan], num_fs,
+                      &blend_in_color[chan], 1);
+        lp_build_name(blend_in_color[chan], "color%d.%c", cbuf, "rgba"[chan]);
+       }
   
+       lp_build_conv_mask(builder, fs_type, blend_type,
+                        fs_mask, num_fs,
+                        &blend_mask, 1);
+ 
+       color_ptr = LLVMBuildLoad(builder, 
+                               LLVMBuildGEP(builder, color_ptr_ptr, &index, 1, ""),
+                               "");
+       lp_build_name(color_ptr, "color_ptr%d", cbuf);
+ 
+       /*
+        * Blending.
+        */
+       generate_blend(&key->blend,
+                    builder,
+                    blend_type,
+                    context_ptr,
+                    blend_mask,
+                    blend_in_color,
+                    color_ptr);
      }
   
-    lp_build_conv_mask(builder, fs_type, blend_type,
-                       fs_mask, num_fs,
-                       &blend_mask, 1);
- 
-    /*
-     * Blending.
-     */
- 
-    generate_blend(&key->blend,
-                   builder,
-                   blend_type,
-                   context_ptr,
-                   blend_mask,
-                   blend_in_color,
-                   color_ptr);
- 
      LLVMBuildRetVoid(builder);
   
      LLVMDisposeBuilder(builder);
   
-    /*
-     * Translate the LLVM IR into machine code.
-     */
   
+    /* Verify the LLVM IR.  If invalid, dump and abort */
   #ifdef DEBUG
-    if(LLVMVerifyFunction(variant->function, LLVMPrintMessageAction)) {
-       LLVMDumpValue(variant->function);
-       assert(0);
+    if(LLVMVerifyFunction(function, LLVMPrintMessageAction)) {
+       if (1)
+          LLVMDumpValue(function);
+       abort();
      }
   #endif
   
-    LLVMRunFunctionPassManager(screen->pass, variant->function);
+    /* Apply optimizations to LLVM IR */
+    if (1)
+       LLVMRunFunctionPassManager(screen->pass, function);
   
      if (LP_DEBUG & DEBUG_JIT) {
-       LLVMDumpValue(variant->function);
+       /* Print the LLVM IR to stderr */
+       LLVMDumpValue(function);
         debug_printf("\n");
      }
   
-    variant->jit_function = (lp_jit_frag_func)LLVMGetPointerToGlobal(screen->engine, variant->function);
+    /*
+     * Translate the LLVM IR into machine code.
+     */
+    variant->jit_function[do_tri_test] = (lp_jit_frag_func)LLVMGetPointerToGlobal(screen->engine, function);
   
      if (LP_DEBUG & DEBUG_ASM)
-       lp_disassemble(variant->jit_function);
+       lp_disassemble(variant->jit_function[do_tri_test]);
+ }
+ 
+ 
+ static struct lp_fragment_shader_variant *
+ generate_variant(struct llvmpipe_context *lp,
+                  struct lp_fragment_shader *shader,
+                  const struct lp_fragment_shader_variant_key *key)
+ {
+    struct lp_fragment_shader_variant *variant;
+ 
+    if (LP_DEBUG & DEBUG_JIT) {
+       unsigned i;
+ 
+       tgsi_dump(shader->base.tokens, 0);
+       if(key->depth.enabled) {
+          debug_printf("depth.format = %s\n", pf_name(key->zsbuf_format));
+          debug_printf("depth.func = %s\n", debug_dump_func(key->depth.func, TRUE));
+          debug_printf("depth.writemask = %u\n", key->depth.writemask);
+       }
+       if(key->alpha.enabled) {
+          debug_printf("alpha.func = %s\n", debug_dump_func(key->alpha.func, TRUE));
+          debug_printf("alpha.ref_value = %f\n", key->alpha.ref_value);
+       }
+       if(key->blend.logicop_enable) {
+          debug_printf("blend.logicop_func = %u\n", key->blend.logicop_func);
+       }
+       else if(key->blend.rt[0].blend_enable) {
+          debug_printf("blend.rgb_func = %s\n",   debug_dump_blend_func  (key->blend.rt[0].rgb_func, TRUE));
+          debug_printf("rgb_src_factor = %s\n",   debug_dump_blend_factor(key->blend.rt[0].rgb_src_factor, TRUE));
+          debug_printf("rgb_dst_factor = %s\n",   debug_dump_blend_factor(key->blend.rt[0].rgb_dst_factor, TRUE));
+          debug_printf("alpha_func = %s\n",       debug_dump_blend_func  (key->blend.rt[0].alpha_func, TRUE));
+          debug_printf("alpha_src_factor = %s\n", debug_dump_blend_factor(key->blend.rt[0].alpha_src_factor, TRUE));
+          debug_printf("alpha_dst_factor = %s\n", debug_dump_blend_factor(key->blend.rt[0].alpha_dst_factor, TRUE));
+       }
+       debug_printf("blend.colormask = 0x%x\n", key->blend.rt[0].colormask);
+       for(i = 0; i < PIPE_MAX_SAMPLERS; ++i) {
+          if(key->sampler[i].format) {
+             debug_printf("sampler[%u] = \n", i);
+             debug_printf("  .format = %s\n",
+                          pf_name(key->sampler[i].format));
+             debug_printf("  .target = %s\n",
+                          debug_dump_tex_target(key->sampler[i].target, TRUE));
+             debug_printf("  .pot = %u %u %u\n",
+                          key->sampler[i].pot_width,
+                          key->sampler[i].pot_height,
+                          key->sampler[i].pot_depth);
+             debug_printf("  .wrap = %s %s %s\n",
+                          debug_dump_tex_wrap(key->sampler[i].wrap_s, TRUE),
+                          debug_dump_tex_wrap(key->sampler[i].wrap_t, TRUE),
+                          debug_dump_tex_wrap(key->sampler[i].wrap_r, TRUE));
+             debug_printf("  .min_img_filter = %s\n",
+                          debug_dump_tex_filter(key->sampler[i].min_img_filter, TRUE));
+             debug_printf("  .min_mip_filter = %s\n",
+                          debug_dump_tex_mipfilter(key->sampler[i].min_mip_filter, TRUE));
+             debug_printf("  .mag_img_filter = %s\n",
+                          debug_dump_tex_filter(key->sampler[i].mag_img_filter, TRUE));
+             if(key->sampler[i].compare_mode != PIPE_TEX_COMPARE_NONE)
+                debug_printf("  .compare_func = %s\n", debug_dump_func(key->sampler[i].compare_func, TRUE));
+             debug_printf("  .normalized_coords = %u\n", key->sampler[i].normalized_coords);
+             debug_printf("  .prefilter = %u\n", key->sampler[i].prefilter);
+          }
+       }
+    }
+ 
+    variant = CALLOC_STRUCT(lp_fragment_shader_variant);
+    if(!variant)
+       return NULL;
+ 
+    variant->shader = shader;
+    memcpy(&variant->key, key, sizeof *key);
   
+    generate_fragment(lp, shader, variant, 0);
+    generate_fragment(lp, shader, variant, 1);
+ 
+    /* insert new variant into linked list */
      variant->next = shader->variants;
      shader->variants = variant;
   
@@@ -692,11 -976,15 +977,15 @@@ llvmpipe_delete_fs_state(struct pipe_co
      variant = shader->variants;
      while(variant) {
         struct lp_fragment_shader_variant *next = variant->next;
- 
-       if(variant->function) {
-          if(variant->jit_function)
-             LLVMFreeMachineCodeForFunction(screen->engine, variant->function);
-          LLVMDeleteFunction(variant->function);
+       unsigned i;
+ 
+       for (i = 0; i < Elements(variant->function); i++) {
+          if (variant->function[i]) {
+             if (variant->jit_function[i])
+                LLVMFreeMachineCodeForFunction(screen->engine,
+                                               variant->function[i]);
+             LLVMDeleteFunction(variant->function[i]);
+          }
         }
   
         FREE(variant);
@@@ -722,15 -1010,14 +1011,14 @@@ llvmpipe_set_constant_buffer(struct pip
      assert(shader < PIPE_SHADER_TYPES);
      assert(index == 0);
   
+    if(llvmpipe->constants[shader] == constants)
+       return;
+ 
      draw_flush(llvmpipe->draw);
   
      /* note: reference counting */
      pipe_buffer_reference(&llvmpipe->constants[shader], constants);
   
-    if(shader == PIPE_SHADER_FRAGMENT) {
-       llvmpipe->jit_context.constants = data;
-    }
- 
      if(shader == PIPE_SHADER_VERTEX) {
         draw_set_mapped_constant_buffer(llvmpipe->draw, PIPE_SHADER_VERTEX, 0,
                                         data, size);
@@@ -767,21 -1054,30 +1055,30 @@@ make_variant_key(struct llvmpipe_contex
         key->alpha.func = lp->depth_stencil->alpha.func;
      /* alpha.ref_value is passed in jit_context */
   
-    if(lp->framebuffer.cbufs[0]) {
-       const struct util_format_description *format_desc;
-       unsigned chan;
+    key->flatshade = lp->rasterizer->flatshade;
+    key->scissor = lp->rasterizer->scissor;
   
+    if (lp->framebuffer.nr_cbufs) {
         memcpy(&key->blend, lp->blend, sizeof key->blend);
+    }
   
-       format_desc = util_format_description(lp->framebuffer.cbufs[0]->format);
+    key->nr_cbufs = lp->framebuffer.nr_cbufs;
+    for (i = 0; i < lp->framebuffer.nr_cbufs; i++) {
+       const struct util_format_description *format_desc;
+       unsigned chan;
+ 
+       format_desc = util_format_description(lp->framebuffer.cbufs[i]->format);
         assert(format_desc->layout == UTIL_FORMAT_COLORSPACE_RGB ||
                format_desc->layout == UTIL_FORMAT_COLORSPACE_SRGB);
   
-       /* mask out color channels not present in the color buffer */
+       /* mask out color channels not present in the color buffer.
+        * Should be simple to incorporate per-cbuf writemasks:
+        */
         for(chan = 0; chan < 4; ++chan) {
            enum util_format_swizzle swizzle = format_desc->swizzle[chan];
-          if(swizzle > 4)
-             key->blend.rt[0].colormask &= ~(1 << chan);
+ 
+          if(swizzle <= UTIL_FORMAT_SWIZZLE_W)
+             key->blend.rt[0].colormask |= (1 << chan);
         }
      }
   
@@@ -791,12 -1087,17 +1088,17 @@@
   }
   
   
+ /**
+  * Update fragment state.  This is called just prior to drawing
+  * something when some fragment-related state has changed.
+  */
   void 
   llvmpipe_update_fs(struct llvmpipe_context *lp)
   {
      struct lp_fragment_shader *shader = lp->fs;
      struct lp_fragment_shader_variant_key key;
      struct lp_fragment_shader_variant *variant;
+    boolean opaque;
   
      make_variant_key(lp, shader, &key);
   
@@@ -808,8 -1109,34 +1110,34 @@@
         variant = variant->next;
      }
   
-    if(!variant)
-       variant = generate_fragment(lp, shader, &key);
+    if (!variant) {
+       struct util_time t0, t1;
+       int64_t dt;
+       util_time_get(&t0);
+ 
+       variant = generate_variant(lp, shader, &key);
+ 
+       util_time_get(&t1);
+       dt = util_time_diff(&t0, &t1);
+       LP_COUNT_ADD(llvm_compile_time, dt);
+       LP_COUNT_ADD(nr_llvm_compiles, 2);  /* emit vs. omit in/out test */
+    }
   
      shader->current = variant;
+ 
+    /* TODO: put this in the variant */
+    /* TODO: most of these can be relaxed, in particular the colormask */
+    opaque = !key.blend.logicop_enable &&
+             !key.blend.rt[0].blend_enable &&
+             key.blend.rt[0].colormask == 0xf &&
+             !key.alpha.enabled &&
+             !key.depth.enabled &&
+             !key.scissor &&
+             !shader->info.uses_kill
+             ? TRUE : FALSE;
+ 
+    lp_setup_set_fs_functions(lp->setup, 
+                              shader->current->jit_function[0],
+                              shader->current->jit_function[1],
+                              opaque);
   }
diff --combined src/gallium/drivers/llvmpipe/lp_state_sampler.c

index bda9c138d52291e21975271154f3ee992bffa07b,976f81113fd1e4bea5176cb00ae2942168815c85..43649febf27d40b553a80b9daaeb3863fb7990e8
--- 1/src/gallium/drivers/llvmpipe/lp_state_sampler.c
--- 2/src/gallium/drivers/llvmpipe/lp_state_sampler.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
@@@ -29,7 -29,6 +29,7 @@@
    *  Brian Paul
    */
   
+ +#include "util/u_inlines.h"
   #include "util/u_memory.h"
   
   #include "draw/draw_context.h"
@@@ -38,7 -37,6 +38,6 @@@
   #include "lp_context.h"
   #include "lp_state.h"
   #include "lp_texture.h"
- #include "lp_tex_cache.h"
   #include "draw/draw_context.h"
   
   
@@@ -126,17 -124,6 +125,6 @@@ llvmpipe_set_sampler_textures(struct pi
         struct pipe_texture *tex = i < num ? texture[i] : NULL;
   
         pipe_texture_reference(&llvmpipe->texture[i], tex);
-       lp_tex_tile_cache_set_texture(llvmpipe->tex_cache[i], tex);
- 
-       if(tex) {
-          struct llvmpipe_texture *lp_tex = llvmpipe_texture(tex);
-          struct lp_jit_texture *jit_tex = &llvmpipe->jit_context.textures[i];
-          jit_tex->width = tex->width0;
-          jit_tex->height = tex->height0;
-          jit_tex->stride = lp_tex->stride[0];
-          if(!lp_tex->dt)
-             jit_tex->data = lp_tex->data;
-       }
      }
   
      llvmpipe->num_textures = num;
@@@ -167,7 -154,6 +155,6 @@@ llvmpipe_set_vertex_sampler_textures(st
         struct pipe_texture *tex = i < num_textures ? textures[i] : NULL;
   
         pipe_texture_reference(&llvmpipe->vertex_textures[i], tex);
-       lp_tex_tile_cache_set_texture(llvmpipe->vertex_tex_cache[i], tex);
      }
   
      llvmpipe->num_vertex_textures = num_textures;
diff --combined src/gallium/drivers/llvmpipe/lp_state_surface.c

index 0afa49d0b758edb25fa03027c982673b3f779e7f,aa4241a80db46e630679a88edd08739e66ef5907..048ac5b968b315e4d7692602b40beab37aa59150
--- 1/src/gallium/drivers/llvmpipe/lp_state_surface.c
--- 2/src/gallium/drivers/llvmpipe/lp_state_surface.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_surface.c
@@@ -28,11 -28,11 +28,12 @@@
   /* Authors:  Keith Whitwell <keith@tungstengraphics.com>
    */
   
- 
+ #include "pipe/p_state.h"
+ +#include "util/u_inlines.h"
+ #include "util/u_surface.h"
   #include "lp_context.h"
   #include "lp_state.h"
- #include "lp_tile_cache.h"
+ #include "lp_setup.h"
   
   #include "draw/draw_context.h"
   
@@@ -40,54 -40,19 +41,19 @@@
   
   
   /**
-  * XXX this might get moved someday
    * Set the framebuffer surface info: color buffers, zbuffer, stencil buffer.
-  * Here, we flush the old surfaces and update the tile cache to point to the new
-  * surfaces.
    */
   void
   llvmpipe_set_framebuffer_state(struct pipe_context *pipe,
                                  const struct pipe_framebuffer_state *fb)
   {
      struct llvmpipe_context *lp = llvmpipe_context(pipe);
-    uint i;
- 
-    draw_flush(lp->draw);
- 
-    for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
-       /* check if changing cbuf */
-       if (lp->framebuffer.cbufs[i] != fb->cbufs[i]) {
-          /* flush old */
-          lp_tile_cache_map_transfers(lp->cbuf_cache[i]);
-          lp_flush_tile_cache(lp->cbuf_cache[i]);
- 
-          /* assign new */
-          pipe_surface_reference(&lp->framebuffer.cbufs[i], fb->cbufs[i]);
- 
-          /* update cache */
-          lp_tile_cache_set_surface(lp->cbuf_cache[i], fb->cbufs[i]);
-       }
-    }
   
-    lp->framebuffer.nr_cbufs = fb->nr_cbufs;
+    boolean changed = !util_framebuffer_state_equal(&lp->framebuffer, fb);
   
-    /* zbuf changing? */
-    if (lp->framebuffer.zsbuf != fb->zsbuf) {
+    if (changed) {
   
-       if(lp->zsbuf_transfer) {
-          struct pipe_screen *screen = pipe->screen;
- 
-          if(lp->zsbuf_map) {
-             screen->transfer_unmap(screen, lp->zsbuf_transfer);
-             lp->zsbuf_map = NULL;
-          }
- 
-          screen->tex_transfer_destroy(lp->zsbuf_transfer);
-          lp->zsbuf_transfer = NULL;
-       }
- 
-       /* assign new */
-       pipe_surface_reference(&lp->framebuffer.zsbuf, fb->zsbuf);
+       util_copy_framebuffer_state(&lp->framebuffer, fb);
   
         /* Tell draw module how deep the Z/depth buffer is */
         if (lp->framebuffer.zsbuf) {
@@@ -104,10 -69,9 +70,9 @@@
            }
            draw_set_mrd(lp->draw, mrd);
         }
-    }
   
-    lp->framebuffer.width = fb->width;
-    lp->framebuffer.height = fb->height;
+       lp_setup_bind_framebuffer( lp->setup, &lp->framebuffer );
   
-    lp->dirty |= LP_NEW_FRAMEBUFFER;
+       lp->dirty |= LP_NEW_FRAMEBUFFER;
+    }
   }
diff --combined src/gallium/drivers/llvmpipe/lp_texture.c

index 8c20625430c61c3b3f3e2b6ef0e1a52c56c92813,c9b6eb180ff81db61cb17155d3da3ab4aee0b6eb..605200396b225e59c4117780effc5b87e1e6f14b
--- 1/src/gallium/drivers/llvmpipe/lp_texture.c
--- 2/src/gallium/drivers/llvmpipe/lp_texture.c
+++ b/src/gallium/drivers/llvmpipe/lp_texture.c
@@@ -32,43 -32,43 +32,43 @@@
   
   #include "pipe/p_context.h"
   #include "pipe/p_defines.h"
- -#include "pipe/p_inlines.h"
+ +#include "util/u_inlines.h"
   
   #include "util/u_format.h"
   #include "util/u_math.h"
   #include "util/u_memory.h"
   
   #include "lp_context.h"
+ #include "lp_screen.h"
   #include "lp_state.h"
   #include "lp_texture.h"
- #include "lp_screen.h"
+ #include "lp_tile_size.h"
   #include "lp_winsys.h"
   
   
- /* Simple, maximally packed layout.
-  */
- 
- /* Conventional allocation path for non-display textures:
+ /**
+  * Conventional allocation path for non-display textures:
+  * Simple, maximally packed layout.
    */
   static boolean
   llvmpipe_texture_layout(struct llvmpipe_screen *screen,
-                         struct llvmpipe_texture * lpt)
+                         struct llvmpipe_texture *lpt)
   {
      struct pipe_texture *pt = &lpt->base;
      unsigned level;
      unsigned width = pt->width0;
      unsigned height = pt->height0;
      unsigned depth = pt->depth0;
- 
      unsigned buffer_size = 0;
   
      for (level = 0; level <= pt->last_level; level++) {
         unsigned nblocksx, nblocksy;
   
         /* Allocate storage for whole quads. This is particularly important
-        * for depth surfaces, which are currently stored in a swizzled format. */
-       nblocksx = util_format_get_nblocksx(pt->format, align(width, 2));
-       nblocksy = util_format_get_nblocksy(pt->format, align(height, 2));
+        * for depth surfaces, which are currently stored in a swizzled format.
+        */
+       nblocksx = util_format_get_nblocksx(pt->format, align(width, TILE_SIZE));
+       nblocksy = util_format_get_nblocksy(pt->format, align(height, TILE_SIZE));
   
         lpt->stride[level] = align(nblocksx * util_format_get_blocksize(pt->format), 16);
   
@@@ -78,7 -78,7 +78,7 @@@
                         ((pt->target == PIPE_TEXTURE_CUBE) ? 6 : depth) *
                         lpt->stride[level]);
   
-       width  = u_minify(width, 1);
+       width = u_minify(width, 1);
         height = u_minify(height, 1);
         depth = u_minify(depth, 1);
      }
@@@ -88,16 -88,23 +88,23 @@@
      return lpt->data != NULL;
   }
   
+ 
+ 
   static boolean
   llvmpipe_displaytarget_layout(struct llvmpipe_screen *screen,
-                               struct llvmpipe_texture * lpt)
+                               struct llvmpipe_texture *lpt)
   {
      struct llvmpipe_winsys *winsys = screen->winsys;
   
+    /* Round up the surface size to a multiple of the tile size to
+     * avoid tile clipping.
+     */
+    unsigned width = align(lpt->base.width0, TILE_SIZE);
+    unsigned height = align(lpt->base.height0, TILE_SIZE);
+ 
      lpt->dt = winsys->displaytarget_create(winsys,
                                             lpt->base.format,
-                                           lpt->base.width0,
-                                           lpt->base.height0,
+                                           width, height,
                                             16,
                                             &lpt->stride[0] );
   
@@@ -105,9 -112,6 +112,6 @@@
   }
   
   
- 
- 
- 
   static struct pipe_texture *
   llvmpipe_texture_create(struct pipe_screen *_screen,
                           const struct pipe_texture *templat)
@@@ -124,7 -128,7 +128,7 @@@
      /* XXX: The xlib state tracker is brain-dead and will request
       * PIPE_FORMAT_Z16_UNORM no matter how much we tell it we don't support it.
       */
-    if(lpt->base.format == PIPE_FORMAT_Z16_UNORM)
+    if (lpt->base.format == PIPE_FORMAT_Z16_UNORM)
         lpt->base.format = PIPE_FORMAT_Z32_UNORM;
   
      if (lpt->base.tex_usage & (PIPE_TEXTURE_USAGE_DISPLAY_TARGET |
@@@ -176,6 -180,7 +180,7 @@@ llvmpipe_texture_blanket(struct pipe_sc
   
      return &lpt->base;
   #else
+    debug_printf("llvmpipe_texture_blanket() not implemented!");
      return NULL;
   #endif
   }
@@@ -187,12 -192,15 +192,15 @@@ llvmpipe_texture_destroy(struct pipe_te
      struct llvmpipe_screen *screen = llvmpipe_screen(pt->screen);
      struct llvmpipe_texture *lpt = llvmpipe_texture(pt);
   
-    if(lpt->dt) {
+    if (lpt->dt) {
+       /* display target */
         struct llvmpipe_winsys *winsys = screen->winsys;
         winsys->displaytarget_destroy(winsys, lpt->dt);
      }
-    else
+    else {
+       /* regular texture */
         align_free(lpt->data);
+    }
   
      FREE(lpt);
   }
@@@ -234,7 -242,7 +242,7 @@@ llvmpipe_get_tex_surface(struct pipe_sc
   
         if (ps->usage & (PIPE_BUFFER_USAGE_CPU_WRITE |
                          PIPE_BUFFER_USAGE_GPU_WRITE)) {
-          /* Mark the surface as dirty.  The tile cache will look for this. */
+          /* Mark the surface as dirty. */
            lpt->timestamp++;
            llvmpipe_screen(screen)->timestamp++;
         }
@@@ -296,8 -304,8 +304,8 @@@ llvmpipe_get_tex_transfer(struct pipe_s
         pipe_texture_reference(&pt->texture, texture);
         pt->x = x;
         pt->y = y;
-       pt->width = w;
-       pt->height = h;
+       pt->width = align(w, TILE_SIZE);
+       pt->height = align(h, TILE_SIZE);
         pt->stride = lptex->stride[level];
         pt->usage = usage;
         pt->face = face;
@@@ -354,7 -362,8 +362,8 @@@ llvmpipe_transfer_map( struct pipe_scre
      lpt = llvmpipe_texture(transfer->texture);
      format = lpt->base.format;
   
-    if(lpt->dt) {
+    if (lpt->dt) {
+       /* display target */
         struct llvmpipe_winsys *winsys = screen->winsys;
   
         map = winsys->displaytarget_map(winsys, lpt->dt,
@@@ -362,16 -371,16 +371,16 @@@
         if (map == NULL)
            return NULL;
      }
-    else
+    else {
+       /* regular texture */
         map = lpt->data;
+    }
   
      /* May want to different things here depending on read/write nature
       * of the map:
       */
-    if (transfer->texture && (transfer->usage & PIPE_TRANSFER_WRITE))
-    {
+    if (transfer->texture && (transfer->usage & PIPE_TRANSFER_WRITE)) {
         /* Do something to notify sharing contexts of a texture change.
-        * In llvmpipe, that would mean flushing the texture cache.
          */
         screen->timestamp++;
      }
@@@ -385,28 -394,23 +394,23 @@@
   
   
   static void
- llvmpipe_transfer_unmap(struct pipe_screen *_screen,
+ llvmpipe_transfer_unmap(struct pipe_screen *screen,
                          struct pipe_transfer *transfer)
   {
-    struct llvmpipe_screen *screen = llvmpipe_screen(_screen);
+    struct llvmpipe_screen *lp_screen = llvmpipe_screen(screen);
      struct llvmpipe_texture *lpt;
   
      assert(transfer->texture);
      lpt = llvmpipe_texture(transfer->texture);
   
-    if(lpt->dt) {
-       struct llvmpipe_winsys *winsys = screen->winsys;
+    if (lpt->dt) {
+       /* display target */
+       struct llvmpipe_winsys *winsys = lp_screen->winsys;
         winsys->displaytarget_unmap(winsys, lpt->dt);
      }
   }
   
   
- void
- llvmpipe_init_texture_funcs(struct llvmpipe_context *lp)
- {
- }
- 
- 
   void
   llvmpipe_init_screen_texture_funcs(struct pipe_screen *screen)
   {
author	José Fonseca <jfonseca@vmware.com>
	Fri, 5 Feb 2010 13:48:35 +0000 (13:48 +0000)
committer	José Fonseca <jfonseca@vmware.com>
	Fri, 5 Feb 2010 13:48:35 +0000 (13:48 +0000)
		1	2
src/gallium/auxiliary/os/os_thread.h	patch \|	diff1 \|	\|	blob \| history
src/gallium/auxiliary/util/u_debug.c	patch \|	diff1 \|	diff2 \|	blob \| history
src/gallium/auxiliary/util/u_debug.h	patch \|	diff1 \|	diff2 \|	blob \| history
src/gallium/auxiliary/util/u_ringbuffer.c	patch \|	diff1 \|	diff2 \|	blob \| history
src/gallium/auxiliary/util/u_surface.c	patch \|	diff1 \|	diff2 \|	blob \| history
src/gallium/auxiliary/util/u_time.h	patch \|	diff1 \|	diff2 \|	blob \| history
src/gallium/drivers/llvmpipe/SConscript	patch \|	diff1 \|	diff2 \|	blob \| history
src/gallium/drivers/llvmpipe/lp_bld_logic.c	patch \|	diff1 \|	diff2 \|	blob \| history
src/gallium/drivers/llvmpipe/lp_buffer.c	patch \|	diff1 \|	diff2 \|	blob \| history
src/gallium/drivers/llvmpipe/lp_context.c	patch \|	diff1 \|	diff2 \|	blob \| history
src/gallium/drivers/llvmpipe/lp_fence.c	patch \|	\|	diff2 \|	blob \| history
src/gallium/drivers/llvmpipe/lp_fence.h	patch \|	\|	diff2 \|	blob \| history
src/gallium/drivers/llvmpipe/lp_rast.c	patch \|	\|	diff2 \|	blob \| history
src/gallium/drivers/llvmpipe/lp_rast_priv.h	patch \|	\|	diff2 \|	blob \| history
src/gallium/drivers/llvmpipe/lp_scene.c	patch \|	\|	diff2 \|	blob \| history
src/gallium/drivers/llvmpipe/lp_scene.h	patch \|	\|	diff2 \|	blob \| history
src/gallium/drivers/llvmpipe/lp_setup.c	patch \|	diff1 \|	diff2 \|	blob \| history
src/gallium/drivers/llvmpipe/lp_state_fs.c	patch \|	diff1 \|	diff2 \|	blob \| history
src/gallium/drivers/llvmpipe/lp_state_sampler.c	patch \|	diff1 \|	diff2 \|	blob \| history
src/gallium/drivers/llvmpipe/lp_state_surface.c	patch \|	diff1 \|	diff2 \|	blob \| history
src/gallium/drivers/llvmpipe/lp_texture.c	patch \|	diff1 \|	diff2 \|	blob \| history