From: Marek Olšák <maraeo@gmail.com>
Date: Fri, 11 May 2012 14:38:13 +0000 (+0200)
Subject: Merge branch 'gallium-userbuf'
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=bb4c5d72d7c7cb1d9e7016e2c07c36875f30011a;hp=8dd3e341b337ca2d22bcc0e7548a78a6c36ca77d;p=mesa.git

Merge branch 'gallium-userbuf'

Conflicts:
	src/gallium/docs/source/screen.rst
	src/gallium/drivers/nv50/nv50_state.c
	src/gallium/include/pipe/p_defines.h
	src/mesa/state_tracker/st_draw.c
---

diff --git a/bin/.gitignore b/bin/.gitignore
index 04c0a1c16bb..2ee67a666a1 100644
--- a/bin/.gitignore
+++ b/bin/.gitignore
@@ -4,3 +4,4 @@ install-sh
 /depcomp
 /missing
 ylwrap
+compile
diff --git a/configs/autoconf.in b/configs/autoconf.in
index eb6713ddf08..3c8f4c1eef9 100644
--- a/configs/autoconf.in
+++ b/configs/autoconf.in
@@ -219,3 +219,6 @@ MESA_LLVM = @MESA_LLVM@
 LLVM_VERSION = @LLVM_VERSION@
 
 HAVE_XF86VIDMODE = @HAVE_XF86VIDMODE@
+
+GALLIUM_PIPE_LOADER_DEFINES = @GALLIUM_PIPE_LOADER_DEFINES@
+GALLIUM_PIPE_LOADER_LIBS = @GALLIUM_PIPE_LOADER_LIBS@
diff --git a/configs/default b/configs/default
index cdaeec8b478..40fa5e31ea2 100644
--- a/configs/default
+++ b/configs/default
@@ -9,7 +9,7 @@ CONFIG_NAME = default
 
 # Version info
 MESA_MAJOR=8
-MESA_MINOR=0
+MESA_MINOR=1
 MESA_TINY=0
 MESA_VERSION = $(MESA_MAJOR).$(MESA_MINOR).$(MESA_TINY)
 
diff --git a/configure.ac b/configure.ac
index 3bc59ca3e78..2d1265a7b63 100644
--- a/configure.ac
+++ b/configure.ac
@@ -109,11 +109,11 @@ if test "x$GCC" = xyes -a "x$acv_mesa_CLANG" = xno; then
 
     GCC_VERSION=`$CC -dumpversion`
     if test $? -eq 0; then
-        major=`echo $GCC_VERSION | cut -d. -f1`
-        minor=`echo $GCC_VERSION | cut -d. -f2`
+        GCC_VERSION_MAJOR=`echo $GCC_VERSION | cut -d. -f1`
+        GCC_VERSION_MINOR=`echo $GCC_VERSION | cut -d. -f2`
     fi
 
-    if test $major -lt 3 -o $major -eq 3 -a $minor -lt 3 ; then
+    if test $GCC_VERSION_MAJOR -lt 3 -o $GCC_VERSION_MAJOR -eq 3 -a $GCC_VERSION_MINOR -lt 3 ; then
         AC_MSG_RESULT([no])
         AC_MSG_ERROR([If using GCC, version 3.3.0 or later is required.])
     else
@@ -616,7 +616,11 @@ AC_ARG_ENABLE([va],
          [enable va library @<:@default=auto@:>@])],
    [enable_va="$enableval"],
    [enable_va=auto])
-
+AC_ARG_ENABLE([opencl],
+   [AS_HELP_STRING([--enable-opencl],
+         [enable OpenCL library @<:@default=no@:>@])],
+   [enable_opencl="$enableval"],
+   [enable_opencl=no])
 AC_ARG_ENABLE([xlib_glx],
     [AS_HELP_STRING([--enable-xlib-glx],
         [make GLX library Xlib-based instead of DRI-based @<:@default=disable@:>@])],
@@ -643,6 +647,12 @@ AC_ARG_ENABLE([r600-llvm-compiler],
     [enable_r600_llvm="$enableval"],
     [enable_r600_llvm=no])
 
+AC_ARG_ENABLE([gallium_tests],
+    [AS_HELP_STRING([--enable-gallium-tests],
+        [Enable optional Gallium tests @<:@default=disable@:>@])],
+    [enable_gallium_tests="$enableval"],
+    [enable_gallium_tests=no])
+
 # Option for Gallium drivers
 GALLIUM_DRIVERS_DEFAULT="r300,r600,svga,swrast"
 
@@ -670,7 +680,8 @@ if test "x$enable_opengl" = xno -a \
         "x$enable_d3d1x" = xno -a \
         "x$enable_xvmc" = xno -a \
         "x$enable_vdpau" = xno -a \
-        "x$enable_va" = xno; then
+        "x$enable_va" = xno -a \
+        "x$enable_opencl" = xno; then
     AC_MSG_ERROR([at least one API should be enabled])
 fi
 
@@ -1477,6 +1488,7 @@ if test "x$enable_gallium_gbm" = xyes; then
     GALLIUM_STATE_TRACKERS_DIRS="gbm $GALLIUM_STATE_TRACKERS_DIRS"
     GALLIUM_TARGET_DIRS="$GALLIUM_TARGET_DIRS gbm"
     HAVE_ST_GBM="yes"
+    enable_gallium_loader=yes
 fi
 
 dnl
@@ -1596,6 +1608,24 @@ if test "x$enable_va" = xyes; then
     HAVE_ST_VA="yes"
 fi
 
+dnl
+dnl OpenCL configuration
+dnl
+
+if test "x$enable_opencl" = xyes; then
+    if test "x$with_gallium_drivers" = x; then
+        AC_MSG_ERROR([cannot enable OpenCL without Gallium])
+    fi
+
+    if test $GCC_VERSION_MAJOR -lt 4 -o $GCC_VERSION_MAJOR -eq 4 -a $GCC_VERSION_MINOR -lt 6; then
+        AC_MSG_ERROR([gcc >= 4.6 is required to build clover])
+    fi
+
+    GALLIUM_STATE_TRACKERS_DIRS="$GALLIUM_STATE_TRACKERS_DIRS clover"
+    GALLIUM_TARGET_DIRS="$GALLIUM_TARGET_DIRS opencl"
+    enable_gallium_loader=yes
+fi
+
 dnl
 dnl GLU configuration
 dnl
@@ -1828,6 +1858,14 @@ AC_ARG_WITH([xvmc-libdir],
     [XVMC_LIB_INSTALL_DIR='${libdir}'])
 AC_SUBST([XVMC_LIB_INSTALL_DIR])
 
+dnl
+dnl Gallium Tests
+dnl
+if test "x$enable_gallium_tests" = xyes; then
+    SRC_DIRS="$SRC_DIRS gallium/tests/trivial"
+    enable_gallium_loader=yes
+fi
+
 dnl Directory for VDPAU libs
 AC_ARG_WITH([vdpau-libdir],
     [AS_HELP_STRING([--with-vdpau-libdir=DIR],
@@ -1844,6 +1882,14 @@ AC_ARG_WITH([va-libdir],
     [VA_LIB_INSTALL_DIR='${libdir}/va'])
 AC_SUBST([VA_LIB_INSTALL_DIR])
 
+dnl Directory for OpenCL libs
+AC_ARG_WITH([opencl-libdir],
+    [AS_HELP_STRING([--with-opencl-libdir=DIR],
+        [directory for the OpenCL libraries @<:@default=${libdir}/opencl@:>@])],
+    [OPENCL_LIB_INSTALL_DIR="$withval"],
+    [OPENCL_LIB_INSTALL_DIR='${libdir}/opencl'])
+AC_SUBST([OPENCL_LIB_INSTALL_DIR])
+
 dnl
 dnl Gallium helper functions
 dnl
@@ -1884,6 +1930,17 @@ gallium_require_llvm() {
     fi
 }
 
+gallium_require_drm_loader() {  # check DRM pipe-loader prerequisites; does nothing unless enable_gallium_loader=yes
+    if test "x$enable_gallium_loader" = xyes; then
+        PKG_CHECK_MODULES([LIBUDEV], [libudev], [],
+                          [AC_MSG_ERROR([Gallium drm loader requires libudev])])
+        if test "x$have_libdrm" != xyes; then
+            AC_MSG_ERROR([Gallium drm loader requires libdrm >= $LIBDRM_REQUIRED])
+        fi
+        enable_gallium_drm_loader=yes  # tested later to add -DHAVE_PIPE_LOADER_DRM and HAVE_DRM_LOADER_GALLIUM
+    fi
+}
+
 dnl Gallium drivers
 dnl Duplicates in GALLIUM_DRIVERS_DIRS are removed by sorting it after this block
 if test "x$with_gallium_drivers" != x; then
@@ -1911,6 +1968,7 @@ if test "x$with_gallium_drivers" != x; then
             ;;
         xr600)
             PKG_CHECK_MODULES([RADEON], [libdrm_radeon >= $LIBDRM_RADEON_REQUIRED])
+            gallium_require_drm_loader
             GALLIUM_DRIVERS_DIRS="$GALLIUM_DRIVERS_DIRS r600"
             if test "x$enable_r600_llvm" = xyes; then
                 if test "x$LLVM_VERSION" != "x3.1"; then
@@ -1931,6 +1989,7 @@ if test "x$with_gallium_drivers" != x; then
             ;;
         xnouveau)
             PKG_CHECK_MODULES([NOUVEAU], [libdrm_nouveau >= $LIBDRM_NOUVEAU_REQUIRED])
+            gallium_require_drm_loader
             GALLIUM_DRIVERS_DIRS="$GALLIUM_DRIVERS_DIRS nouveau nv30 nv50 nvc0"
             gallium_check_st "nouveau/drm" "dri-nouveau" "xorg-nouveau" "" "xvmc-nouveau" "vdpau-nouveau"
             ;;
@@ -1967,6 +2026,25 @@ if test "x$with_gallium_drivers" != x; then
     done
 fi
 
+if test "x$enable_gallium_loader" = xyes; then
+    GALLIUM_WINSYS_DIRS="$GALLIUM_WINSYS_DIRS sw/null"
+    GALLIUM_PIPE_LOADER_DEFINES="-DHAVE_PIPE_LOADER_SW"
+    GALLIUM_PIPE_LOADER_LIBS="\$(TOP)/src/gallium/auxiliary/pipe-loader/libpipe_loader.a"
+    GALLIUM_PIPE_LOADER_LIBS="$GALLIUM_PIPE_LOADER_LIBS \$(TOP)/src/gallium/winsys/sw/null/libws_null.a"
+
+    if test "x$HAVE_WINSYS_XLIB" = xyes; then
+        GALLIUM_PIPE_LOADER_DEFINES="$GALLIUM_PIPE_LOADER_DEFINES -DHAVE_PIPE_LOADER_XLIB"
+        GALLIUM_PIPE_LOADER_LIBS="$GALLIUM_PIPE_LOADER_LIBS \$(TOP)/src/gallium/winsys/sw/xlib/libws_xlib.a"
+    fi
+
+    if test "x$enable_gallium_drm_loader" = xyes; then
+        GALLIUM_PIPE_LOADER_DEFINES="$GALLIUM_PIPE_LOADER_DEFINES -DHAVE_PIPE_LOADER_DRM"
+    fi
+
+    AC_SUBST([GALLIUM_PIPE_LOADER_DEFINES])
+    AC_SUBST([GALLIUM_PIPE_LOADER_LIBS])
+fi
+
 dnl Tell Automake which drivers to build
 for driver in $GALLIUM_DRIVERS_DIRS; do
     case "x$driver" in
@@ -1990,6 +2068,8 @@ AM_CONDITIONAL(HAVE_IDENTITY_GALLIUM, test x$HAVE_IDENTITY_GALLIUM = xyes)
 AM_CONDITIONAL(HAVE_NOOP_GALLIUM, test x$HAVE_NOOP_GALLIUM = xyes)
 AM_CONDITIONAL(NEED_RADEON_GALLIUM, test x$NEED_RADEON_GALLIUM = xyes)
 AM_CONDITIONAL(USE_R600_LLVM_COMPILER, test x$USE_R600_LLVM_COMPILER = xyes)
+AM_CONDITIONAL(HAVE_LOADER_GALLIUM, test x$enable_gallium_loader = xyes)
+AM_CONDITIONAL(HAVE_DRM_LOADER_GALLIUM, test x$enable_gallium_drm_loader = xyes)
 AC_SUBST([GALLIUM_MAKE_DIRS])
 
 dnl prepend CORE_DIRS to SRC_DIRS
@@ -2005,9 +2085,12 @@ CXXFLAGS="$CXXFLAGS $USER_CXXFLAGS"
 
 dnl Substitute the config
 AC_CONFIG_FILES([configs/autoconf
+		src/gallium/auxiliary/pipe-loader/Makefile
+		src/gallium/state_trackers/clover/Makefile
 		src/gallium/drivers/Makefile
 		src/gallium/drivers/r300/Makefile
 		src/gallium/drivers/r600/Makefile
+		src/gallium/targets/opencl/Makefile
 		src/gbm/Makefile
 		src/gbm/main/gbm.pc
 		src/egl/drivers/Makefile
diff --git a/docs/GL3.txt b/docs/GL3.txt
index 0a860bb3c1d..5fc6c69ba4b 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -88,7 +88,7 @@ GL 4.0:
 
 GLSL 4.0                                             not started
 GL_ARB_texture_query_lod                             not started
-GL_ARB_draw_buffers_blend                            DONE (r600, softpipe)
+GL_ARB_draw_buffers_blend                            DONE (i965, r600, softpipe)
 GL_ARB_draw_indirect                                 not started
 GL_ARB_gpu_shader_fp64                               not started
 GL_ARB_sample_shading                                not started
diff --git a/docs/viewperf.html b/docs/viewperf.html
index 27962504620..af351bc4225 100644
--- a/docs/viewperf.html
+++ b/docs/viewperf.html
@@ -84,6 +84,11 @@ If the Mesa driver doesn't support this extension the rendering will
 be incorrect and the test will fail.
 </p>
 
+<p>
+Also, the line drawings in test 2 seem to appear in a random color.
+This is probably due to some uninitialized state somewhere.
+</p>
+
 
 
 <h2>sw-02 test 6</h2>
diff --git a/include/CL/cl.h b/include/CL/cl.h
new file mode 100644
index 00000000000..4f21afe55bb
--- /dev/null
+++ b/include/CL/cl.h
@@ -0,0 +1,998 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2010 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+/* $Revision: 11985 $ on $Date: 2010-07-15 11:16:06 -0700 (Thu, 15 Jul 2010) $ */
+
+#ifndef __OPENCL_CL_H
+#define __OPENCL_CL_H
+
+#ifdef __APPLE__
+#include <OpenCL/cl_platform.h>
+#else
+#include <CL/cl_platform.h>
+#endif	
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/******************************************************************************/
+
+typedef struct _cl_platform_id *    cl_platform_id;
+typedef struct _cl_device_id *      cl_device_id;
+typedef struct _cl_context *        cl_context;
+typedef struct _cl_command_queue *  cl_command_queue;
+typedef struct _cl_mem *            cl_mem;
+typedef struct _cl_program *        cl_program;
+typedef struct _cl_kernel *         cl_kernel;
+typedef struct _cl_event *          cl_event;
+typedef struct _cl_sampler *        cl_sampler;
+
+typedef cl_uint             cl_bool;                     /* WARNING!  Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. */ 
+typedef cl_ulong            cl_bitfield;
+typedef cl_bitfield         cl_device_type;
+typedef cl_uint             cl_platform_info;
+typedef cl_uint             cl_device_info;
+typedef cl_bitfield         cl_device_fp_config;
+typedef cl_uint             cl_device_mem_cache_type;
+typedef cl_uint             cl_device_local_mem_type;
+typedef cl_bitfield         cl_device_exec_capabilities;
+typedef cl_bitfield         cl_command_queue_properties;
+
+typedef intptr_t			cl_context_properties;
+typedef cl_uint             cl_context_info;
+typedef cl_uint             cl_command_queue_info;
+typedef cl_uint             cl_channel_order;
+typedef cl_uint             cl_channel_type;
+typedef cl_bitfield         cl_mem_flags;
+typedef cl_uint             cl_mem_object_type;
+typedef cl_uint             cl_mem_info;
+typedef cl_uint             cl_image_info;
+typedef cl_uint             cl_buffer_create_type;
+typedef cl_uint             cl_addressing_mode;
+typedef cl_uint             cl_filter_mode;
+typedef cl_uint             cl_sampler_info;
+typedef cl_bitfield         cl_map_flags;
+typedef cl_uint             cl_program_info;
+typedef cl_uint             cl_program_build_info;
+typedef cl_int              cl_build_status;
+typedef cl_uint             cl_kernel_info;
+typedef cl_uint             cl_kernel_work_group_info;
+typedef cl_uint             cl_event_info;
+typedef cl_uint             cl_command_type;
+typedef cl_uint             cl_profiling_info;
+
+typedef struct _cl_image_format {
+    cl_channel_order        image_channel_order;
+    cl_channel_type         image_channel_data_type;
+} cl_image_format;
+
+
+typedef struct _cl_buffer_region {
+    size_t                  origin;
+    size_t                  size;
+} cl_buffer_region;
+
+/******************************************************************************/
+
+/* Error Codes */
+#define CL_SUCCESS                                  0
+#define CL_DEVICE_NOT_FOUND                         -1
+#define CL_DEVICE_NOT_AVAILABLE                     -2
+#define CL_COMPILER_NOT_AVAILABLE                   -3
+#define CL_MEM_OBJECT_ALLOCATION_FAILURE            -4
+#define CL_OUT_OF_RESOURCES                         -5
+#define CL_OUT_OF_HOST_MEMORY                       -6
+#define CL_PROFILING_INFO_NOT_AVAILABLE             -7
+#define CL_MEM_COPY_OVERLAP                         -8
+#define CL_IMAGE_FORMAT_MISMATCH                    -9
+#define CL_IMAGE_FORMAT_NOT_SUPPORTED               -10
+#define CL_BUILD_PROGRAM_FAILURE                    -11
+#define CL_MAP_FAILURE                              -12
+#define CL_MISALIGNED_SUB_BUFFER_OFFSET             -13
+#define CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST -14
+
+#define CL_INVALID_VALUE                            -30
+#define CL_INVALID_DEVICE_TYPE                      -31
+#define CL_INVALID_PLATFORM                         -32
+#define CL_INVALID_DEVICE                           -33
+#define CL_INVALID_CONTEXT                          -34
+#define CL_INVALID_QUEUE_PROPERTIES                 -35
+#define CL_INVALID_COMMAND_QUEUE                    -36
+#define CL_INVALID_HOST_PTR                         -37
+#define CL_INVALID_MEM_OBJECT                       -38
+#define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR          -39
+#define CL_INVALID_IMAGE_SIZE                       -40
+#define CL_INVALID_SAMPLER                          -41
+#define CL_INVALID_BINARY                           -42
+#define CL_INVALID_BUILD_OPTIONS                    -43
+#define CL_INVALID_PROGRAM                          -44
+#define CL_INVALID_PROGRAM_EXECUTABLE               -45
+#define CL_INVALID_KERNEL_NAME                      -46
+#define CL_INVALID_KERNEL_DEFINITION                -47
+#define CL_INVALID_KERNEL                           -48
+#define CL_INVALID_ARG_INDEX                        -49
+#define CL_INVALID_ARG_VALUE                        -50
+#define CL_INVALID_ARG_SIZE                         -51
+#define CL_INVALID_KERNEL_ARGS                      -52
+#define CL_INVALID_WORK_DIMENSION                   -53
+#define CL_INVALID_WORK_GROUP_SIZE                  -54
+#define CL_INVALID_WORK_ITEM_SIZE                   -55
+#define CL_INVALID_GLOBAL_OFFSET                    -56
+#define CL_INVALID_EVENT_WAIT_LIST                  -57
+#define CL_INVALID_EVENT                            -58
+#define CL_INVALID_OPERATION                        -59
+#define CL_INVALID_GL_OBJECT                        -60
+#define CL_INVALID_BUFFER_SIZE                      -61
+#define CL_INVALID_MIP_LEVEL                        -62
+#define CL_INVALID_GLOBAL_WORK_SIZE                 -63
+#define CL_INVALID_PROPERTY                         -64
+
+/* OpenCL Version */
+#define CL_VERSION_1_0                              1
+#define CL_VERSION_1_1                              1
+
+/* cl_bool */
+#define CL_FALSE                                    0
+#define CL_TRUE                                     1
+
+/* cl_platform_info */
+#define CL_PLATFORM_PROFILE                         0x0900
+#define CL_PLATFORM_VERSION                         0x0901
+#define CL_PLATFORM_NAME                            0x0902
+#define CL_PLATFORM_VENDOR                          0x0903
+#define CL_PLATFORM_EXTENSIONS                      0x0904
+
+/* cl_device_type - bitfield */
+#define CL_DEVICE_TYPE_DEFAULT                      (1 << 0)
+#define CL_DEVICE_TYPE_CPU                          (1 << 1)
+#define CL_DEVICE_TYPE_GPU                          (1 << 2)
+#define CL_DEVICE_TYPE_ACCELERATOR                  (1 << 3)
+#define CL_DEVICE_TYPE_ALL                          0xFFFFFFFF
+
+/* cl_device_info */
+#define CL_DEVICE_TYPE                              0x1000
+#define CL_DEVICE_VENDOR_ID                         0x1001
+#define CL_DEVICE_MAX_COMPUTE_UNITS                 0x1002
+#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS          0x1003
+#define CL_DEVICE_MAX_WORK_GROUP_SIZE               0x1004
+#define CL_DEVICE_MAX_WORK_ITEM_SIZES               0x1005
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR       0x1006
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT      0x1007
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT        0x1008
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG       0x1009
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT      0x100A
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE     0x100B
+#define CL_DEVICE_MAX_CLOCK_FREQUENCY               0x100C
+#define CL_DEVICE_ADDRESS_BITS                      0x100D
+#define CL_DEVICE_MAX_READ_IMAGE_ARGS               0x100E
+#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS              0x100F
+#define CL_DEVICE_MAX_MEM_ALLOC_SIZE                0x1010
+#define CL_DEVICE_IMAGE2D_MAX_WIDTH                 0x1011
+#define CL_DEVICE_IMAGE2D_MAX_HEIGHT                0x1012
+#define CL_DEVICE_IMAGE3D_MAX_WIDTH                 0x1013
+#define CL_DEVICE_IMAGE3D_MAX_HEIGHT                0x1014
+#define CL_DEVICE_IMAGE3D_MAX_DEPTH                 0x1015
+#define CL_DEVICE_IMAGE_SUPPORT                     0x1016
+#define CL_DEVICE_MAX_PARAMETER_SIZE                0x1017
+#define CL_DEVICE_MAX_SAMPLERS                      0x1018
+#define CL_DEVICE_MEM_BASE_ADDR_ALIGN               0x1019
+#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE          0x101A
+#define CL_DEVICE_SINGLE_FP_CONFIG                  0x101B
+#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE             0x101C
+#define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE         0x101D
+#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE             0x101E
+#define CL_DEVICE_GLOBAL_MEM_SIZE                   0x101F
+#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE          0x1020
+#define CL_DEVICE_MAX_CONSTANT_ARGS                 0x1021
+#define CL_DEVICE_LOCAL_MEM_TYPE                    0x1022
+#define CL_DEVICE_LOCAL_MEM_SIZE                    0x1023
+#define CL_DEVICE_ERROR_CORRECTION_SUPPORT          0x1024
+#define CL_DEVICE_PROFILING_TIMER_RESOLUTION        0x1025
+#define CL_DEVICE_ENDIAN_LITTLE                     0x1026
+#define CL_DEVICE_AVAILABLE                         0x1027
+#define CL_DEVICE_COMPILER_AVAILABLE                0x1028
+#define CL_DEVICE_EXECUTION_CAPABILITIES            0x1029
+#define CL_DEVICE_QUEUE_PROPERTIES                  0x102A
+#define CL_DEVICE_NAME                              0x102B
+#define CL_DEVICE_VENDOR                            0x102C
+#define CL_DRIVER_VERSION                           0x102D
+#define CL_DEVICE_PROFILE                           0x102E
+#define CL_DEVICE_VERSION                           0x102F
+#define CL_DEVICE_EXTENSIONS                        0x1030
+#define CL_DEVICE_PLATFORM                          0x1031
+/* 0x1032 reserved for CL_DEVICE_DOUBLE_FP_CONFIG */
+/* 0x1033 reserved for CL_DEVICE_HALF_FP_CONFIG */
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF       0x1034
+#define CL_DEVICE_HOST_UNIFIED_MEMORY               0x1035
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR          0x1036
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT         0x1037
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT           0x1038
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG          0x1039
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT         0x103A
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE        0x103B
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF          0x103C
+#define CL_DEVICE_OPENCL_C_VERSION                  0x103D
+
+/* cl_device_fp_config - bitfield */
+#define CL_FP_DENORM                                (1 << 0)
+#define CL_FP_INF_NAN                               (1 << 1)
+#define CL_FP_ROUND_TO_NEAREST                      (1 << 2)
+#define CL_FP_ROUND_TO_ZERO                         (1 << 3)
+#define CL_FP_ROUND_TO_INF                          (1 << 4)
+#define CL_FP_FMA                                   (1 << 5)
+#define CL_FP_SOFT_FLOAT                            (1 << 6)
+
+/* cl_device_mem_cache_type */
+#define CL_NONE                                     0x0
+#define CL_READ_ONLY_CACHE                          0x1
+#define CL_READ_WRITE_CACHE                         0x2
+
+/* cl_device_local_mem_type */
+#define CL_LOCAL                                    0x1
+#define CL_GLOBAL                                   0x2
+
+/* cl_device_exec_capabilities - bitfield */
+#define CL_EXEC_KERNEL                              (1 << 0)
+#define CL_EXEC_NATIVE_KERNEL                       (1 << 1)
+
+/* cl_command_queue_properties - bitfield */
+#define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE      (1 << 0)
+#define CL_QUEUE_PROFILING_ENABLE                   (1 << 1)
+
+/* cl_context_info  */
+#define CL_CONTEXT_REFERENCE_COUNT                  0x1080
+#define CL_CONTEXT_DEVICES                          0x1081
+#define CL_CONTEXT_PROPERTIES                       0x1082
+#define CL_CONTEXT_NUM_DEVICES                      0x1083
+
+/* cl_context_info + cl_context_properties */
+#define CL_CONTEXT_PLATFORM                         0x1084
+
+/* cl_command_queue_info */
+#define CL_QUEUE_CONTEXT                            0x1090
+#define CL_QUEUE_DEVICE                             0x1091
+#define CL_QUEUE_REFERENCE_COUNT                    0x1092
+#define CL_QUEUE_PROPERTIES                         0x1093
+
+/* cl_mem_flags - bitfield */
+#define CL_MEM_READ_WRITE                           (1 << 0)
+#define CL_MEM_WRITE_ONLY                           (1 << 1)
+#define CL_MEM_READ_ONLY                            (1 << 2)
+#define CL_MEM_USE_HOST_PTR                         (1 << 3)
+#define CL_MEM_ALLOC_HOST_PTR                       (1 << 4)
+#define CL_MEM_COPY_HOST_PTR                        (1 << 5)
+
+/* cl_channel_order */
+#define CL_R                                        0x10B0
+#define CL_A                                        0x10B1
+#define CL_RG                                       0x10B2
+#define CL_RA                                       0x10B3
+#define CL_RGB                                      0x10B4
+#define CL_RGBA                                     0x10B5
+#define CL_BGRA                                     0x10B6
+#define CL_ARGB                                     0x10B7
+#define CL_INTENSITY                                0x10B8
+#define CL_LUMINANCE                                0x10B9
+#define CL_Rx                                       0x10BA
+#define CL_RGx                                      0x10BB
+#define CL_RGBx                                     0x10BC
+
+/* cl_channel_type */
+#define CL_SNORM_INT8                               0x10D0
+#define CL_SNORM_INT16                              0x10D1
+#define CL_UNORM_INT8                               0x10D2
+#define CL_UNORM_INT16                              0x10D3
+#define CL_UNORM_SHORT_565                          0x10D4
+#define CL_UNORM_SHORT_555                          0x10D5
+#define CL_UNORM_INT_101010                         0x10D6
+#define CL_SIGNED_INT8                              0x10D7
+#define CL_SIGNED_INT16                             0x10D8
+#define CL_SIGNED_INT32                             0x10D9
+#define CL_UNSIGNED_INT8                            0x10DA
+#define CL_UNSIGNED_INT16                           0x10DB
+#define CL_UNSIGNED_INT32                           0x10DC
+#define CL_HALF_FLOAT                               0x10DD
+#define CL_FLOAT                                    0x10DE
+
+/* cl_mem_object_type */
+#define CL_MEM_OBJECT_BUFFER                        0x10F0
+#define CL_MEM_OBJECT_IMAGE2D                       0x10F1
+#define CL_MEM_OBJECT_IMAGE3D                       0x10F2
+
+/* cl_mem_info */
+#define CL_MEM_TYPE                                 0x1100
+#define CL_MEM_FLAGS                                0x1101
+#define CL_MEM_SIZE                                 0x1102
+#define CL_MEM_HOST_PTR                             0x1103
+#define CL_MEM_MAP_COUNT                            0x1104
+#define CL_MEM_REFERENCE_COUNT                      0x1105
+#define CL_MEM_CONTEXT                              0x1106
+#define CL_MEM_ASSOCIATED_MEMOBJECT                 0x1107
+#define CL_MEM_OFFSET                               0x1108
+
+/* cl_image_info */
+#define CL_IMAGE_FORMAT                             0x1110
+#define CL_IMAGE_ELEMENT_SIZE                       0x1111
+#define CL_IMAGE_ROW_PITCH                          0x1112
+#define CL_IMAGE_SLICE_PITCH                        0x1113
+#define CL_IMAGE_WIDTH                              0x1114
+#define CL_IMAGE_HEIGHT                             0x1115
+#define CL_IMAGE_DEPTH                              0x1116
+
+/* cl_addressing_mode */
+#define CL_ADDRESS_NONE                             0x1130
+#define CL_ADDRESS_CLAMP_TO_EDGE                    0x1131
+#define CL_ADDRESS_CLAMP                            0x1132
+#define CL_ADDRESS_REPEAT                           0x1133
+#define CL_ADDRESS_MIRRORED_REPEAT                  0x1134
+
+/* cl_filter_mode */
+#define CL_FILTER_NEAREST                           0x1140
+#define CL_FILTER_LINEAR                            0x1141
+
+/* cl_sampler_info */
+#define CL_SAMPLER_REFERENCE_COUNT                  0x1150
+#define CL_SAMPLER_CONTEXT                          0x1151
+#define CL_SAMPLER_NORMALIZED_COORDS                0x1152
+#define CL_SAMPLER_ADDRESSING_MODE                  0x1153
+#define CL_SAMPLER_FILTER_MODE                      0x1154
+
+/* cl_map_flags - bitfield */
+#define CL_MAP_READ                                 (1 << 0)
+#define CL_MAP_WRITE                                (1 << 1)
+
+/* cl_program_info */
+#define CL_PROGRAM_REFERENCE_COUNT                  0x1160
+#define CL_PROGRAM_CONTEXT                          0x1161
+#define CL_PROGRAM_NUM_DEVICES                      0x1162
+#define CL_PROGRAM_DEVICES                          0x1163
+#define CL_PROGRAM_SOURCE                           0x1164
+#define CL_PROGRAM_BINARY_SIZES                     0x1165
+#define CL_PROGRAM_BINARIES                         0x1166
+
+/* cl_program_build_info */
+#define CL_PROGRAM_BUILD_STATUS                     0x1181
+#define CL_PROGRAM_BUILD_OPTIONS                    0x1182
+#define CL_PROGRAM_BUILD_LOG                        0x1183
+
+/* cl_build_status */
+#define CL_BUILD_SUCCESS                            0
+#define CL_BUILD_NONE                               -1
+#define CL_BUILD_ERROR                              -2
+#define CL_BUILD_IN_PROGRESS                        -3
+
+/* cl_kernel_info */
+#define CL_KERNEL_FUNCTION_NAME                     0x1190
+#define CL_KERNEL_NUM_ARGS                          0x1191
+#define CL_KERNEL_REFERENCE_COUNT                   0x1192
+#define CL_KERNEL_CONTEXT                           0x1193
+#define CL_KERNEL_PROGRAM                           0x1194
+
+/* cl_kernel_work_group_info */
+#define CL_KERNEL_WORK_GROUP_SIZE                   0x11B0
+#define CL_KERNEL_COMPILE_WORK_GROUP_SIZE           0x11B1
+#define CL_KERNEL_LOCAL_MEM_SIZE                    0x11B2
+#define CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3
+#define CL_KERNEL_PRIVATE_MEM_SIZE                  0x11B4
+
+/* cl_event_info  */
+#define CL_EVENT_COMMAND_QUEUE                      0x11D0
+#define CL_EVENT_COMMAND_TYPE                       0x11D1
+#define CL_EVENT_REFERENCE_COUNT                    0x11D2
+#define CL_EVENT_COMMAND_EXECUTION_STATUS           0x11D3
+#define CL_EVENT_CONTEXT                            0x11D4
+
+/* cl_command_type */
+#define CL_COMMAND_NDRANGE_KERNEL                   0x11F0
+#define CL_COMMAND_TASK                             0x11F1
+#define CL_COMMAND_NATIVE_KERNEL                    0x11F2
+#define CL_COMMAND_READ_BUFFER                      0x11F3
+#define CL_COMMAND_WRITE_BUFFER                     0x11F4
+#define CL_COMMAND_COPY_BUFFER                      0x11F5
+#define CL_COMMAND_READ_IMAGE                       0x11F6
+#define CL_COMMAND_WRITE_IMAGE                      0x11F7
+#define CL_COMMAND_COPY_IMAGE                       0x11F8
+#define CL_COMMAND_COPY_IMAGE_TO_BUFFER             0x11F9
+#define CL_COMMAND_COPY_BUFFER_TO_IMAGE             0x11FA
+#define CL_COMMAND_MAP_BUFFER                       0x11FB
+#define CL_COMMAND_MAP_IMAGE                        0x11FC
+#define CL_COMMAND_UNMAP_MEM_OBJECT                 0x11FD
+#define CL_COMMAND_MARKER                           0x11FE
+#define CL_COMMAND_ACQUIRE_GL_OBJECTS               0x11FF
+#define CL_COMMAND_RELEASE_GL_OBJECTS               0x1200
+#define CL_COMMAND_READ_BUFFER_RECT                 0x1201
+#define CL_COMMAND_WRITE_BUFFER_RECT                0x1202
+#define CL_COMMAND_COPY_BUFFER_RECT                 0x1203
+#define CL_COMMAND_USER                             0x1204
+
+/* command execution status */
+#define CL_COMPLETE                                 0x0
+#define CL_RUNNING                                  0x1
+#define CL_SUBMITTED                                0x2
+#define CL_QUEUED                                   0x3
+  
+/* cl_buffer_create_type  */
+#define CL_BUFFER_CREATE_TYPE_REGION                0x1220
+
+/* cl_profiling_info  */
+#define CL_PROFILING_COMMAND_QUEUED                 0x1280
+#define CL_PROFILING_COMMAND_SUBMIT                 0x1281
+#define CL_PROFILING_COMMAND_START                  0x1282
+#define CL_PROFILING_COMMAND_END                    0x1283
+
+/********************************************************************************************************/
+
+/* Platform API */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetPlatformIDs(cl_uint          /* num_entries */,
+                 cl_platform_id * /* platforms */,
+                 cl_uint *        /* num_platforms */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL 
+clGetPlatformInfo(cl_platform_id   /* platform */, 
+                  cl_platform_info /* param_name */,
+                  size_t           /* param_value_size */, 
+                  void *           /* param_value */,
+                  size_t *         /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Device APIs */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetDeviceIDs(cl_platform_id   /* platform */,
+               cl_device_type   /* device_type */, 
+               cl_uint          /* num_entries */, 
+               cl_device_id *   /* devices */, 
+               cl_uint *        /* num_devices */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetDeviceInfo(cl_device_id    /* device */,
+                cl_device_info  /* param_name */, 
+                size_t          /* param_value_size */, 
+                void *          /* param_value */,
+                size_t *        /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Context APIs  */
+extern CL_API_ENTRY cl_context CL_API_CALL
+clCreateContext(const cl_context_properties * /* properties */,
+                cl_uint                       /* num_devices */,
+                const cl_device_id *          /* devices */,
+                void (CL_CALLBACK * /* pfn_notify */)(const char *, const void *, size_t, void *),
+                void *                        /* user_data */,
+                cl_int *                      /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_context CL_API_CALL
+clCreateContextFromType(const cl_context_properties * /* properties */,
+                        cl_device_type                /* device_type */,
+                        void (CL_CALLBACK *     /* pfn_notify*/ )(const char *, const void *, size_t, void *),
+                        void *                        /* user_data */,
+                        cl_int *                      /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetContextInfo(cl_context         /* context */, 
+                 cl_context_info    /* param_name */, 
+                 size_t             /* param_value_size */, 
+                 void *             /* param_value */, 
+                 size_t *           /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Command Queue APIs */
+extern CL_API_ENTRY cl_command_queue CL_API_CALL
+clCreateCommandQueue(cl_context                     /* context */, 
+                     cl_device_id                   /* device */, 
+                     cl_command_queue_properties    /* properties */,
+                     cl_int *                       /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetCommandQueueInfo(cl_command_queue      /* command_queue */,
+                      cl_command_queue_info /* param_name */,
+                      size_t                /* param_value_size */,
+                      void *                /* param_value */,
+                      size_t *              /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+#ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS
+#warning CL_USE_DEPRECATED_OPENCL_1_0_APIS is defined. These APIs are unsupported and untested in OpenCL 1.1!
+/* 
+ *  WARNING:
+ *     This API introduces mutable state into the OpenCL implementation. It has been REMOVED
+ *  to better facilitate thread safety.  The 1.0 API is not thread safe. It is not tested by the
+ *  OpenCL 1.1 conformance test, and consequently may not work or may not work dependably.
+ *  It is likely to be non-performant. Use of this API is not advised. Use at your own risk.
+ *
+ *  Software developers previously relying on this API are instructed to set the command queue 
+ *  properties when creating the queue, instead. 
+ */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetCommandQueueProperty(cl_command_queue              /* command_queue */,
+                          cl_command_queue_properties   /* properties */, 
+                          cl_bool                        /* enable */,
+                          cl_command_queue_properties * /* old_properties */) CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED;
+#endif /* CL_USE_DEPRECATED_OPENCL_1_0_APIS */
+
+/* Memory Object APIs */
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateBuffer(cl_context   /* context */,
+               cl_mem_flags /* flags */,
+               size_t       /* size */,
+               void *       /* host_ptr */,
+               cl_int *     /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateSubBuffer(cl_mem                   /* buffer */,
+                  cl_mem_flags             /* flags */,
+                  cl_buffer_create_type    /* buffer_create_type */,
+                  const void *             /* buffer_create_info */,
+                  cl_int *                 /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1;
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateImage2D(cl_context              /* context */,
+                cl_mem_flags            /* flags */,
+                const cl_image_format * /* image_format */,
+                size_t                  /* image_width */,
+                size_t                  /* image_height */,
+                size_t                  /* image_row_pitch */, 
+                void *                  /* host_ptr */,
+                cl_int *                /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+                        
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateImage3D(cl_context              /* context */,
+                cl_mem_flags            /* flags */,
+                const cl_image_format * /* image_format */,
+                size_t                  /* image_width */, 
+                size_t                  /* image_height */,
+                size_t                  /* image_depth */, 
+                size_t                  /* image_row_pitch */, 
+                size_t                  /* image_slice_pitch */, 
+                void *                  /* host_ptr */,
+                cl_int *                /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+                        
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetSupportedImageFormats(cl_context           /* context */,
+                           cl_mem_flags         /* flags */,
+                           cl_mem_object_type   /* image_type */,
+                           cl_uint              /* num_entries */,
+                           cl_image_format *    /* image_formats */,
+                           cl_uint *            /* num_image_formats */) CL_API_SUFFIX__VERSION_1_0;
+                                    
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetMemObjectInfo(cl_mem           /* memobj */,
+                   cl_mem_info      /* param_name */, 
+                   size_t           /* param_value_size */,
+                   void *           /* param_value */,
+                   size_t *         /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetImageInfo(cl_mem           /* image */,
+               cl_image_info    /* param_name */, 
+               size_t           /* param_value_size */,
+               void *           /* param_value */,
+               size_t *         /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetMemObjectDestructorCallback(  cl_mem /* memobj */, 
+                                    void (CL_CALLBACK * /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/), 
+                                    void * /*user_data */ )             CL_API_SUFFIX__VERSION_1_1;  
+
+/* Sampler APIs  */
+extern CL_API_ENTRY cl_sampler CL_API_CALL
+clCreateSampler(cl_context          /* context */,
+                cl_bool             /* normalized_coords */, 
+                cl_addressing_mode  /* addressing_mode */, 
+                cl_filter_mode      /* filter_mode */,
+                cl_int *            /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetSamplerInfo(cl_sampler         /* sampler */,
+                 cl_sampler_info    /* param_name */,
+                 size_t             /* param_value_size */,
+                 void *             /* param_value */,
+                 size_t *           /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+                            
+/* Program Object APIs  */
+extern CL_API_ENTRY cl_program CL_API_CALL
+clCreateProgramWithSource(cl_context        /* context */,
+                          cl_uint           /* count */,
+                          const char **     /* strings */,
+                          const size_t *    /* lengths */,
+                          cl_int *          /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_program CL_API_CALL
+clCreateProgramWithBinary(cl_context                     /* context */,
+                          cl_uint                        /* num_devices */,
+                          const cl_device_id *           /* device_list */,
+                          const size_t *                 /* lengths */,
+                          const unsigned char **         /* binaries */,
+                          cl_int *                       /* binary_status */,
+                          cl_int *                       /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clBuildProgram(cl_program           /* program */,
+               cl_uint              /* num_devices */,
+               const cl_device_id * /* device_list */,
+               const char *         /* options */, 
+               void (CL_CALLBACK *  /* pfn_notify */)(cl_program /* program */, void * /* user_data */),
+               void *               /* user_data */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clUnloadCompiler(void) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetProgramInfo(cl_program         /* program */,
+                 cl_program_info    /* param_name */,
+                 size_t             /* param_value_size */,
+                 void *             /* param_value */,
+                 size_t *           /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetProgramBuildInfo(cl_program            /* program */,
+                      cl_device_id          /* device */,
+                      cl_program_build_info /* param_name */,
+                      size_t                /* param_value_size */,
+                      void *                /* param_value */,
+                      size_t *              /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+                            
+/* Kernel Object APIs */
+extern CL_API_ENTRY cl_kernel CL_API_CALL
+clCreateKernel(cl_program      /* program */,
+               const char *    /* kernel_name */,
+               cl_int *        /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clCreateKernelsInProgram(cl_program     /* program */,
+                         cl_uint        /* num_kernels */,
+                         cl_kernel *    /* kernels */,
+                         cl_uint *      /* num_kernels_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainKernel(cl_kernel    /* kernel */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseKernel(cl_kernel   /* kernel */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetKernelArg(cl_kernel    /* kernel */,
+               cl_uint      /* arg_index */,
+               size_t       /* arg_size */,
+               const void * /* arg_value */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetKernelInfo(cl_kernel       /* kernel */,
+                cl_kernel_info  /* param_name */,
+                size_t          /* param_value_size */,
+                void *          /* param_value */,
+                size_t *        /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetKernelWorkGroupInfo(cl_kernel                  /* kernel */,
+                         cl_device_id               /* device */,
+                         cl_kernel_work_group_info  /* param_name */,
+                         size_t                     /* param_value_size */,
+                         void *                     /* param_value */,
+                         size_t *                   /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Event Object APIs  */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clWaitForEvents(cl_uint             /* num_events */,
+                const cl_event *    /* event_list */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetEventInfo(cl_event         /* event */,
+               cl_event_info    /* param_name */,
+               size_t           /* param_value_size */,
+               void *           /* param_value */,
+               size_t *         /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+                            
+extern CL_API_ENTRY cl_event CL_API_CALL
+clCreateUserEvent(cl_context    /* context */,
+                  cl_int *      /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1;               
+                            
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetUserEventStatus(cl_event   /* event */,
+                     cl_int     /* execution_status */) CL_API_SUFFIX__VERSION_1_1;
+                     
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetEventCallback( cl_event    /* event */,
+                    cl_int      /* command_exec_callback_type */,
+                    void (CL_CALLBACK * /* pfn_notify */)(cl_event, cl_int, void *),
+                    void *      /* user_data */) CL_API_SUFFIX__VERSION_1_1;
+
+/* Profiling APIs  */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetEventProfilingInfo(cl_event            /* event */,
+                        cl_profiling_info   /* param_name */,
+                        size_t              /* param_value_size */,
+                        void *              /* param_value */,
+                        size_t *            /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+                                
+/* Flush and Finish APIs */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clFlush(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clFinish(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Enqueued Commands APIs */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReadBuffer(cl_command_queue    /* command_queue */,
+                    cl_mem              /* buffer */,
+                    cl_bool             /* blocking_read */,
+                    size_t              /* offset */,
+                    size_t              /* cb */, 
+                    void *              /* ptr */,
+                    cl_uint             /* num_events_in_wait_list */,
+                    const cl_event *    /* event_wait_list */,
+                    cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_0;
+                            
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReadBufferRect(cl_command_queue    /* command_queue */,
+                        cl_mem              /* buffer */,
+                        cl_bool             /* blocking_read */,
+                        const size_t *      /* buffer_origin */,
+                        const size_t *      /* host_origin */, 
+                        const size_t *      /* region */,
+                        size_t              /* buffer_row_pitch */,
+                        size_t              /* buffer_slice_pitch */,
+                        size_t              /* host_row_pitch */,
+                        size_t              /* host_slice_pitch */,                        
+                        void *              /* ptr */,
+                        cl_uint             /* num_events_in_wait_list */,
+                        const cl_event *    /* event_wait_list */,
+                        cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_1;
+                            
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueWriteBuffer(cl_command_queue   /* command_queue */, 
+                     cl_mem             /* buffer */, 
+                     cl_bool            /* blocking_write */, 
+                     size_t             /* offset */, 
+                     size_t             /* cb */, 
+                     const void *       /* ptr */, 
+                     cl_uint            /* num_events_in_wait_list */, 
+                     const cl_event *   /* event_wait_list */, 
+                     cl_event *         /* event */) CL_API_SUFFIX__VERSION_1_0;
+                            
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueWriteBufferRect(cl_command_queue    /* command_queue */,
+                         cl_mem              /* buffer */,
+                         cl_bool             /* blocking_write */,
+                         const size_t *      /* buffer_origin */,
+                         const size_t *      /* host_origin */, 
+                         const size_t *      /* region */,
+                         size_t              /* buffer_row_pitch */,
+                         size_t              /* buffer_slice_pitch */,
+                         size_t              /* host_row_pitch */,
+                         size_t              /* host_slice_pitch */,                        
+                         const void *        /* ptr */,
+                         cl_uint             /* num_events_in_wait_list */,
+                         const cl_event *    /* event_wait_list */,
+                         cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_1;
+                            
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyBuffer(cl_command_queue    /* command_queue */, 
+                    cl_mem              /* src_buffer */,
+                    cl_mem              /* dst_buffer */, 
+                    size_t              /* src_offset */,
+                    size_t              /* dst_offset */,
+                    size_t              /* cb */, 
+                    cl_uint             /* num_events_in_wait_list */,
+                    const cl_event *    /* event_wait_list */,
+                    cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_0;
+                            
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyBufferRect(cl_command_queue    /* command_queue */, 
+                        cl_mem              /* src_buffer */,
+                        cl_mem              /* dst_buffer */, 
+                        const size_t *      /* src_origin */,
+                        const size_t *      /* dst_origin */,
+                        const size_t *      /* region */, 
+                        size_t              /* src_row_pitch */,
+                        size_t              /* src_slice_pitch */,
+                        size_t              /* dst_row_pitch */,
+                        size_t              /* dst_slice_pitch */,
+                        cl_uint             /* num_events_in_wait_list */,
+                        const cl_event *    /* event_wait_list */,
+                        cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_1;
+                            
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReadImage(cl_command_queue     /* command_queue */,
+                   cl_mem               /* image */,
+                   cl_bool              /* blocking_read */, 
+                   const size_t *       /* origin[3] */,
+                   const size_t *       /* region[3] */,
+                   size_t               /* row_pitch */,
+                   size_t               /* slice_pitch */, 
+                   void *               /* ptr */,
+                   cl_uint              /* num_events_in_wait_list */,
+                   const cl_event *     /* event_wait_list */,
+                   cl_event *           /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueWriteImage(cl_command_queue    /* command_queue */,
+                    cl_mem              /* image */,
+                    cl_bool             /* blocking_write */, 
+                    const size_t *      /* origin[3] */,
+                    const size_t *      /* region[3] */,
+                    size_t              /* input_row_pitch */,
+                    size_t              /* input_slice_pitch */, 
+                    const void *        /* ptr */,
+                    cl_uint             /* num_events_in_wait_list */,
+                    const cl_event *    /* event_wait_list */,
+                    cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyImage(cl_command_queue     /* command_queue */,
+                   cl_mem               /* src_image */,
+                   cl_mem               /* dst_image */, 
+                   const size_t *       /* src_origin[3] */,
+                   const size_t *       /* dst_origin[3] */,
+                   const size_t *       /* region[3] */, 
+                   cl_uint              /* num_events_in_wait_list */,
+                   const cl_event *     /* event_wait_list */,
+                   cl_event *           /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyImageToBuffer(cl_command_queue /* command_queue */,
+                           cl_mem           /* src_image */,
+                           cl_mem           /* dst_buffer */, 
+                           const size_t *   /* src_origin[3] */,
+                           const size_t *   /* region[3] */, 
+                           size_t           /* dst_offset */,
+                           cl_uint          /* num_events_in_wait_list */,
+                           const cl_event * /* event_wait_list */,
+                           cl_event *       /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyBufferToImage(cl_command_queue /* command_queue */,
+                           cl_mem           /* src_buffer */,
+                           cl_mem           /* dst_image */, 
+                           size_t           /* src_offset */,
+                           const size_t *   /* dst_origin[3] */,
+                           const size_t *   /* region[3] */, 
+                           cl_uint          /* num_events_in_wait_list */,
+                           const cl_event * /* event_wait_list */,
+                           cl_event *       /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY void * CL_API_CALL
+clEnqueueMapBuffer(cl_command_queue /* command_queue */,
+                   cl_mem           /* buffer */,
+                   cl_bool          /* blocking_map */, 
+                   cl_map_flags     /* map_flags */,
+                   size_t           /* offset */,
+                   size_t           /* cb */,
+                   cl_uint          /* num_events_in_wait_list */,
+                   const cl_event * /* event_wait_list */,
+                   cl_event *       /* event */,
+                   cl_int *         /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY void * CL_API_CALL
+clEnqueueMapImage(cl_command_queue  /* command_queue */,
+                  cl_mem            /* image */, 
+                  cl_bool           /* blocking_map */, 
+                  cl_map_flags      /* map_flags */, 
+                  const size_t *    /* origin[3] */,
+                  const size_t *    /* region[3] */,
+                  size_t *          /* image_row_pitch */,
+                  size_t *          /* image_slice_pitch */,
+                  cl_uint           /* num_events_in_wait_list */,
+                  const cl_event *  /* event_wait_list */,
+                  cl_event *        /* event */,
+                  cl_int *          /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueUnmapMemObject(cl_command_queue /* command_queue */,
+                        cl_mem           /* memobj */,
+                        void *           /* mapped_ptr */,
+                        cl_uint          /* num_events_in_wait_list */,
+                        const cl_event *  /* event_wait_list */,
+                        cl_event *        /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueNDRangeKernel(cl_command_queue /* command_queue */,
+                       cl_kernel        /* kernel */,
+                       cl_uint          /* work_dim */,
+                       const size_t *   /* global_work_offset */,
+                       const size_t *   /* global_work_size */,
+                       const size_t *   /* local_work_size */,
+                       cl_uint          /* num_events_in_wait_list */,
+                       const cl_event * /* event_wait_list */,
+                       cl_event *       /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueTask(cl_command_queue  /* command_queue */,
+              cl_kernel         /* kernel */,
+              cl_uint           /* num_events_in_wait_list */,
+              const cl_event *  /* event_wait_list */,
+              cl_event *        /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueNativeKernel(cl_command_queue  /* command_queue */,
+                      void (CL_CALLBACK * /* user_func */)(void *),
+                      void *            /* args */,
+                      size_t            /* cb_args */,
+                      cl_uint           /* num_mem_objects */,
+                      const cl_mem *    /* mem_list */,
+                      const void **     /* args_mem_loc */,
+                      cl_uint           /* num_events_in_wait_list */,
+                      const cl_event *  /* event_wait_list */,
+                      cl_event *        /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueMarker(cl_command_queue    /* command_queue */,
+                cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueWaitForEvents(cl_command_queue /* command_queue */,
+                       cl_uint          /* num_events */,
+                       const cl_event * /* event_list */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueBarrier(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Extension function access
+ *
+ * Returns the extension function address for the given function name,
+ * or NULL if a valid function cannot be found.  The client must
+ * check that the address is not NULL before using or
+ * calling the returned function address.
+ */
+extern CL_API_ENTRY void * CL_API_CALL clGetExtensionFunctionAddress(const char * /* func_name */) CL_API_SUFFIX__VERSION_1_0;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  /* __OPENCL_CL_H */
+
diff --git a/include/CL/cl.hpp b/include/CL/cl.hpp
new file mode 100644
index 00000000000..99b86a66563
--- /dev/null
+++ b/include/CL/cl.hpp
@@ -0,0 +1,4011 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2010 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+/*! \file
+ *
+ *   \brief C++ bindings for OpenCL 1.0 (rev 48) and OpenCL 1.1 (rev 33)    
+ *   \author Benedict R. Gaster and Laurent Morichetti
+ *   
+ *   Additions and fixes from Brian Cole, March 3rd 2010.
+ *   
+ *   \version 1.1
+ *   \date June 2010
+ *
+ *   Optional extension support
+ *
+ *         cl
+ *         cl_ext_device_fission
+ *				#define USE_CL_DEVICE_FISSION
+ */
+
+/*! \mainpage
+ * \section intro Introduction
+ * For many large applications C++ is the language of choice and so it seems
+ * reasonable to define C++ bindings for OpenCL.
+ *
+ *
+ * The interface is contained within a single C++ header file \em cl.hpp and all
+ * definitions are contained within the namespace \em cl. There is no additional
+ * requirement to include \em cl.h and to use either the C++ or original C
+ * bindings it is enough to simply include \em cl.hpp.
+ *
+ * The bindings themselves are lightweight and correspond closely to the
+ * underlying C API. Using the C++ bindings introduces no additional execution
+ * overhead.
+ *
+ * For detailed documentation on the bindings see:
+ *
+ * The OpenCL C++ Wrapper API 1.1 (revision 04)
+ *  http://www.khronos.org/registry/cl/specs/opencl-cplusplus-1.1.pdf
+ *
+ * \section example Example
+ *
+ * The following example shows a general use case for the C++
+ * bindings, including support for the optional exception feature and
+ * also the supplied vector and string classes, see following sections for
+ * descriptions of these features.
+ *
+ * \code
+ * #define __CL_ENABLE_EXCEPTIONS
+ * 
+ * #if defined(__APPLE__) || defined(__MACOSX)
+ * #include <OpenCL/cl.hpp>
+ * #else
+ * #include <CL/cl.hpp>
+ * #endif
+ * #include <cstdio>
+ * #include <cstdlib>
+ * #include <iostream>
+ * 
+ *  const char * helloStr  = "__kernel void "
+ *                           "hello(void) "
+ *                           "{ "
+ *                           "  "
+ *                           "} ";
+ * 
+ *  int
+ *  main(void)
+ *  {
+ *     cl_int err = CL_SUCCESS;
+ *     try {
+ *
+ *       std::vector<cl::Platform> platforms;
+ *       cl::Platform::get(&platforms);
+ *       if (platforms.size() == 0) {
+ *           std::cout << "Platform size 0\n";
+ *           return -1;
+ *       }
+ *
+ *       cl_context_properties properties[] = 
+ *          { CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(), 0};
+ *       cl::Context context(CL_DEVICE_TYPE_CPU, properties); 
+ * 
+ *       std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
+ * 
+ *       cl::Program::Sources source(1,
+ *           std::make_pair(helloStr,strlen(helloStr)));
+ *       cl::Program program_ = cl::Program(context, source);
+ *       program_.build(devices);
+ * 
+ *       cl::Kernel kernel(program_, "hello", &err);
+ * 
+ *       cl::Event event;
+ *       cl::CommandQueue queue(context, devices[0], 0, &err);
+ *       queue.enqueueNDRangeKernel(
+ *           kernel, 
+ *           cl::NullRange, 
+ *           cl::NDRange(4,4),
+ *           cl::NullRange,
+ *           NULL,
+ *           &event); 
+ * 
+ *       event.wait();
+ *     }
+ *     catch (cl::Error err) {
+ *        std::cerr 
+ *           << "ERROR: "
+ *           << err.what()
+ *           << "("
+ *           << err.err()
+ *           << ")"
+ *           << std::endl;
+ *     }
+ * 
+ *    return EXIT_SUCCESS;
+ *  }
+ * 
+ * \endcode
+ *
+ */
+#ifndef CL_HPP_
+#define CL_HPP_
+
+#ifdef _WIN32
+#include <windows.h>
+#include <malloc.h>
+#if defined(USE_DX_INTEROP)
+#include <CL/cl_d3d10.h>
+#endif
+#endif // _WIN32
+
+// 
+#if defined(USE_CL_DEVICE_FISSION)
+#include <CL/cl_ext.h>
+#endif
+
+#if defined(__APPLE__) || defined(__MACOSX)
+#include <OpenGL/OpenGL.h>
+#include <OpenCL/opencl.h>
+#else
+#include <GL/gl.h>
+#include <CL/opencl.h>
+#endif // !__APPLE__
+
+#if !defined(CL_CALLBACK)
+#define CL_CALLBACK
+#endif //CL_CALLBACK
+
+#include <exception>
+#include <utility>
+#if !defined(__NO_STD_VECTOR)
+#include <vector>
+#endif
+
+#if !defined(__NO_STD_STRING)
+#include <string>
+#endif 
+
+#if defined(linux) || defined(__APPLE__) || defined(__MACOSX)
+# include <alloca.h>
+#endif // linux
+
+#include <cstring>
+
+/*! \namespace cl
+ *
+ * \brief The OpenCL C++ bindings are defined within this namespace.
+ *
+ */
+namespace cl {
+
+#define __INIT_CL_EXT_FCN_PTR(name) \
+    if(!pfn_##name) { /* resolve pfn_<name> only while it is still NULL */ \
+        pfn_##name = (PFN_##name) \
+            clGetExtensionFunctionAddress(#name); \
+        if(!pfn_##name) { /* lookup failed: nothing is done, pfn_<name> stays NULL */ \
+        } \
+    }
+
+class Program;
+class Device;
+class Context;
+class CommandQueue;
+class Memory;
+
+#if defined(__CL_ENABLE_EXCEPTIONS)
+// <exception> is included at file scope; including it here would nest std inside cl.
+/*! \class Error
+ * \brief Exception type carrying an OpenCL error code and an optional message.
+ */
+class Error : public std::exception
+{
+private:
+    cl_int err_;            //!< Error code supplied at construction.
+    const char * errStr_;   //!< Message pointer; stored as-is, never copied or freed.
+public:
+    /*! \brief Create a new CL error exception for a given error code
+     *  and corresponding message (only the pointer is kept, not a copy).
+     */
+    Error(cl_int err, const char * errStr = NULL) : err_(err), errStr_(errStr)
+    {}
+
+    ~Error() throw() {}
+
+    /*! \brief Get error string associated with exception
+     *
+     * \return The stored message, or the literal "empty" when none was given.
+     */
+    virtual const char * what() const throw ()
+    {
+        if (errStr_ == NULL) {
+            return "empty";
+        }
+        else {
+            return errStr_;
+        }
+    }
+
+    /*! \brief Get error code associated with exception
+     *
+     *  \return The error code.
+     */
+    cl_int err(void) const { return err_; }
+};
+
+#define __ERR_STR(x) #x
+#else
+#define __ERR_STR(x) NULL
+#endif // __CL_ENABLE_EXCEPTIONS
+
+//! \cond DOXYGEN_DETAIL
+#if !defined(__CL_USER_OVERRIDE_ERROR_STRINGS)
+#define __GET_DEVICE_INFO_ERR               __ERR_STR(clGetDeviceInfo)
+#define __GET_PLATFORM_INFO_ERR             __ERR_STR(clGetPlatformInfo)
+#define __GET_DEVICE_IDS_ERR                __ERR_STR(clGetDeviceIDs)
+#define __GET_PLATFORM_IDS_ERR              __ERR_STR(clGetPlatformIDs)
+#define __GET_CONTEXT_INFO_ERR              __ERR_STR(clGetContextInfo)
+#define __GET_EVENT_INFO_ERR                __ERR_STR(clGetEventInfo)
+#define __GET_EVENT_PROFILE_INFO_ERR        __ERR_STR(clGetEventProfileInfo)
+#define __GET_MEM_OBJECT_INFO_ERR           __ERR_STR(clGetMemObjectInfo)
+#define __GET_IMAGE_INFO_ERR                __ERR_STR(clGetImageInfo)
+#define __GET_SAMPLER_INFO_ERR              __ERR_STR(clGetSamplerInfo)
+#define __GET_KERNEL_INFO_ERR               __ERR_STR(clGetKernelInfo)
+#define __GET_KERNEL_WORK_GROUP_INFO_ERR    __ERR_STR(clGetKernelWorkGroupInfo)
+#define __GET_PROGRAM_INFO_ERR              __ERR_STR(clGetProgramInfo)
+#define __GET_PROGRAM_BUILD_INFO_ERR        __ERR_STR(clGetProgramBuildInfo)
+#define __GET_COMMAND_QUEUE_INFO_ERR        __ERR_STR(clGetCommandQueueInfo)
+
+#define __CREATE_CONTEXT_FROM_TYPE_ERR      __ERR_STR(clCreateContextFromType)
+#define __GET_SUPPORTED_IMAGE_FORMATS_ERR   __ERR_STR(clGetSupportedImageFormats)
+
+#define __CREATE_BUFFER_ERR                 __ERR_STR(clCreateBuffer)
+#define __CREATE_SUBBUFFER_ERR              __ERR_STR(clCreateSubBuffer)
+#define __CREATE_GL_BUFFER_ERR              __ERR_STR(clCreateFromGLBuffer)
+#define __GET_GL_OBJECT_INFO_ERR            __ERR_STR(clGetGLObjectInfo)
+#define __CREATE_IMAGE2D_ERR                __ERR_STR(clCreateImage2D)
+#define __CREATE_IMAGE3D_ERR                __ERR_STR(clCreateImage3D)
+#define __CREATE_SAMPLER_ERR                __ERR_STR(clCreateSampler)
+#define __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR __ERR_STR(clSetMemObjectDestructorCallback)
+
+#define __CREATE_USER_EVENT_ERR             __ERR_STR(clCreateUserEvent)
+#define __SET_USER_EVENT_STATUS_ERR         __ERR_STR(clSetUserEventStatus)
+#define __SET_EVENT_CALLBACK_ERR            __ERR_STR(clSetEventCallback)
+#define __WAIT_FOR_EVENTS_ERR               __ERR_STR(clWaitForEvents)
+
+#define __CREATE_KERNEL_ERR                 __ERR_STR(clCreateKernel)
+#define __SET_KERNEL_ARGS_ERR               __ERR_STR(clSetKernelArg)
+#define __CREATE_PROGRAM_WITH_SOURCE_ERR    __ERR_STR(clCreateProgramWithSource)
+#define __CREATE_PROGRAM_WITH_BINARY_ERR    __ERR_STR(clCreateProgramWithBinary)
+#define __BUILD_PROGRAM_ERR                 __ERR_STR(clBuildProgram)
+#define __CREATE_KERNELS_IN_PROGRAM_ERR     __ERR_STR(clCreateKernelsInProgram)
+
+#define __CREATE_COMMAND_QUEUE_ERR          __ERR_STR(clCreateCommandQueue)
+#define __SET_COMMAND_QUEUE_PROPERTY_ERR    __ERR_STR(clSetCommandQueueProperty)
+#define __ENQUEUE_READ_BUFFER_ERR           __ERR_STR(clEnqueueReadBuffer)
+#define __ENQUEUE_READ_BUFFER_RECT_ERR      __ERR_STR(clEnqueueReadBufferRect)
+#define __ENQUEUE_WRITE_BUFFER_ERR          __ERR_STR(clEnqueueWriteBuffer)
+#define __ENQUEUE_WRITE_BUFFER_RECT_ERR     __ERR_STR(clEnqueueWriteBufferRect)
+#define __ENQEUE_COPY_BUFFER_ERR            __ERR_STR(clEnqueueCopyBuffer)
+#define __ENQEUE_COPY_BUFFER_RECT_ERR       __ERR_STR(clEnqueueCopyBufferRect)
+#define __ENQUEUE_READ_IMAGE_ERR            __ERR_STR(clEnqueueReadImage)
+#define __ENQUEUE_WRITE_IMAGE_ERR           __ERR_STR(clEnqueueWriteImage)
+#define __ENQUEUE_COPY_IMAGE_ERR            __ERR_STR(clEnqueueCopyImage)
+#define __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR  __ERR_STR(clEnqueueCopyImageToBuffer)
+#define __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR  __ERR_STR(clEnqueueCopyBufferToImage)
+#define __ENQUEUE_MAP_BUFFER_ERR            __ERR_STR(clEnqueueMapBuffer)
+#define __ENQUEUE_MAP_IMAGE_ERR             __ERR_STR(clEnqueueMapImage)
+#define __ENQUEUE_UNMAP_MEM_OBJECT_ERR      __ERR_STR(clEnqueueUnmapMemObject)
+#define __ENQUEUE_NDRANGE_KERNEL_ERR        __ERR_STR(clEnqueueNDRangeKernel)
+#define __ENQUEUE_TASK_ERR                  __ERR_STR(clEnqueueTask)
+#define __ENQUEUE_NATIVE_KERNEL             __ERR_STR(clEnqueueNativeKernel)
+#define __ENQUEUE_MARKER_ERR                __ERR_STR(clEnqueueMarker)
+#define __ENQUEUE_WAIT_FOR_EVENTS_ERR       __ERR_STR(clEnqueueWaitForEvents)
+#define __ENQUEUE_BARRIER_ERR               __ERR_STR(clEnqueueBarrier)
+
+#define __ENQUEUE_ACQUIRE_GL_ERR            __ERR_STR(clEnqueueAcquireGLObjects)
+#define __ENQUEUE_RELEASE_GL_ERR            __ERR_STR(clEnqueueReleaseGLObjects)
+
+#define __UNLOAD_COMPILER_ERR               __ERR_STR(clUnloadCompiler)
+
+#define __FLUSH_ERR                         __ERR_STR(clFlush)
+#define __FINISH_ERR                        __ERR_STR(clFinish)
+
+#define __CREATE_SUB_DEVICES                __ERR_STR(clCreateSubDevicesEXT)
+#endif // __CL_USER_OVERRIDE_ERROR_STRINGS
+//! \endcond
+
+/*! \class string
+ * \brief Simple string class, that provides a limited subset of std::string
+ * functionality but avoids many of the issues that come with that class.
+ */
+class string
+{
+private:
+    ::size_t size_;   //!< Character count, excluding the terminating NUL.
+    char * str_;      //!< Owned NUL-terminated buffer, or NULL when empty.
+
+    /*! \brief Copy the first \a size characters of \a str into a fresh buffer.
+     * \return A new[]-allocated, NUL-terminated copy, or NULL when \a str is
+     *         NULL or a non-throwing allocator reports failure.
+     */
+    static char * duplicate(const char * str, ::size_t size)
+    {
+        char * copy = (str != NULL) ? new char[size + 1] : NULL;
+        if (copy != NULL) {
+            memcpy(copy, str, size * sizeof(char));
+            copy[size] = '\0';
+        }
+        return copy;
+    }
+public:
+    //! Construct an empty string.
+    string(void) : size_(0), str_(NULL)
+    {
+    }
+
+    //! Construct from the first \a size characters of \a str.
+    string(const char * str, ::size_t size) :
+        size_(size),
+        str_(duplicate(str, size))
+    {
+        if (str_ == NULL) {
+            size_ = 0;
+        }
+    }
+
+    //! Construct from a NUL-terminated C string; NULL yields an empty string.
+    string(const char * str) :
+        size_((str != NULL) ? ::strlen(str) : 0),
+        str_(duplicate(str, size_))
+    {
+        if (str_ == NULL) {
+            size_ = 0;
+        }
+    }
+
+    //! Copy-assign; the previously owned buffer is released, not leaked.
+    string& operator=(const string& rhs)
+    {
+        if (this != &rhs) {
+            // Copy first so a throwing allocation leaves *this untouched.
+            char * copy = duplicate(rhs.str_, rhs.size_);
+            delete[] str_;
+            str_  = copy;
+            size_ = (copy != NULL) ? rhs.size_ : 0;
+        }
+        return *this;
+    }
+
+    //! Copy-construct by initializing both members directly from \a rhs.
+    string(const string& rhs) :
+        size_(rhs.size_),
+        str_(duplicate(rhs.str_, rhs.size_))
+    {
+        if (str_ == NULL) {
+            size_ = 0;
+        }
+    }
+
+    ~string()
+    {
+        delete[] str_;   // delete[] of a NULL pointer is a no-op
+    }
+
+    ::size_t size(void) const   { return size_; }
+    ::size_t length(void) const { return size(); }
+
+    const char * c_str(void) const { return (str_) ? str_ : "";}
+};
+
+#if !defined(__USE_DEV_STRING) && !defined(__NO_STD_STRING)
+#include <string>
+typedef std::string STRING_CLASS;
+#elif !defined(__USE_DEV_STRING) 
+typedef cl::string STRING_CLASS;
+#endif
+
+#if !defined(__USE_DEV_VECTOR) && !defined(__NO_STD_VECTOR)
+#include <vector>
+#define VECTOR_CLASS std::vector
+#elif !defined(__USE_DEV_VECTOR) 
+#define VECTOR_CLASS cl::vector 
+#endif
+
+#if !defined(__MAX_DEFAULT_VECTOR_SIZE)
+#define __MAX_DEFAULT_VECTOR_SIZE 10
+#endif
+
+/*! \class vector
+ * \brief Fixed-size vector implementation that mirrors
+ * std::vector functionality.
+ */
+template <typename T, unsigned int N = __MAX_DEFAULT_VECTOR_SIZE>
+class vector
+{
+private:
+    T data_[N];
+    unsigned int size_;
+    bool empty_;
+public:
+    vector() : 
+        size_(-1),
+        empty_(true)
+    {}
+
+    ~vector() {}
+
+    unsigned int size(void) const
+    {
+        return size_ + 1;
+    }
+
+    void clear()
+    {
+        size_ = -1;
+        empty_ = true;
+    }
+
+    void push_back (const T& x)
+    { 
+        if (size() < N) {
+            size_++;  
+            data_[size_] = x;
+            empty_ = false;
+        }
+    }
+
+    void pop_back(void)
+    {
+        if (!empty_) {
+            data_[size_].~T();
+            size_--;
+            if (size_ == -1) {
+                empty_ = true;
+            }
+        }
+    }
+  
+    vector(const vector<T, N>& vec) : 
+        size_(vec.size_),
+        empty_(vec.empty_)
+    {
+        if (!empty_) {
+            memcpy(&data_[0], &vec.data_[0], size() * sizeof(T));
+        }
+    } 
+
+    vector(unsigned int size, const T& val = T()) :
+        size_(-1),
+        empty_(true)
+    {
+        for (unsigned int i = 0; i < size; i++) {
+            push_back(val);
+        }
+    }
+
+    vector<T, N>& operator=(const vector<T, N>& rhs)
+    {
+        if (this == &rhs) {
+            return *this;
+        }
+
+        size_  = rhs.size_;
+        empty_ = rhs.empty_;
+
+        if (!empty_) {	
+            memcpy(&data_[0], &rhs.data_[0], size() * sizeof(T));
+        }
+    
+        return *this;
+    }
+
+    bool operator==(vector<T,N> &vec)
+    {
+        if (empty_ && vec.empty_) {
+            return true;
+        }
+
+        if (size() != vec.size()) {
+            return false;
+        }
+
+        return memcmp(&data_[0], &vec.data_[0], size() * sizeof(T)) == 0 ? true : false;
+    }
+  
+    operator T* ()             { return data_; }
+    operator const T* () const { return data_; }
+   
+    bool empty (void) const
+    {
+        return empty_;
+    }
+  
+    unsigned int max_size (void) const
+    {
+        return N;
+    }
+
+    unsigned int capacity () const
+    {
+        return sizeof(T) * N;
+    }
+
+    T& operator[](int index)
+    {
+        return data_[index];
+    }
+  
+    T operator[](int index) const
+    {
+        return data_[index];
+    }
+  
+    template<class I>
+    void assign(I start, I end)
+    {
+        clear();   
+        while(start < end) {
+            push_back(*start);
+            start++;
+        }
+    }
+
+    /*! \class iterator
+     * \brief Iterator class for vectors
+     *
+     * Each iterator holds its own copy of the vector it was created from, so it
+     * walks a snapshot: later changes to the original vector are not seen.
+     */
+    class iterator
+    {
+    private:
+        vector<T,N> vec_;     //!< Copy of the vector taken by begin()/end().
+        int index_;           //!< Current position; -1 until bound to a non-empty vector.
+        bool initialized_;    //!< True once begin()/end() has filled in vec_.
+    public:
+        //! Create an unbound iterator (index -1, not initialized).
+        iterator(void) :
+            index_(-1),
+            initialized_(false)
+        {
+        }
+
+        ~iterator(void) {}
+
+        //! Iterator at the first element of \a vec (index stays -1 if empty).
+        static iterator begin(vector<T,N> &vec)
+        {
+            iterator i;
+
+            if (!vec.empty()) {
+                i.index_ = 0;
+            }
+
+            i.vec_ = vec;
+            i.initialized_ = true;
+            return i;
+        }
+
+        //! Iterator one past the last element of \a vec (index stays -1 if empty).
+        static iterator end(vector<T,N> &vec)
+        {
+            iterator i;
+
+            if (!vec.empty()) {
+                i.index_ = vec.size();
+            }
+            i.vec_ = vec;
+            i.initialized_ = true;
+            return i;
+        }
+
+        //! Equal when both iterators hold equal vector contents, the same index
+        //! and the same initialization state.
+        bool operator==(iterator i)
+        {
+            return ((vec_ == i.vec_) &&
+                    (index_ == i.index_) &&
+                    (initialized_ == i.initialized_));
+        }
+
+        bool operator!=(iterator i)
+        {
+            return (!(*this==i));
+        }
+
+        //! Prefix increment: advance to the next element.
+        void operator++() { index_++; }
+
+        //! Postfix increment: advance by one element. The int parameter is only
+        //! the postfix tag (0 in "it++"), so it must not be used as the step.
+        void operator++(int) { index_++; }
+
+        //! Prefix decrement: step back to the previous element.
+        void operator--() { index_--; }
+
+        //! Postfix decrement: step back by one element; the int is only a tag.
+        void operator--(int) { index_--; }
+
+        //! Return a copy of the element at the current position (no bounds check).
+        T operator *()
+        {
+            return vec_[index_];
+        }
+    };
+
+    iterator begin(void)
+    {
+        return iterator::begin(*this);
+    }
+
+    iterator end(void)
+    {
+        return iterator::end(*this);
+    }
+
+    T& front(void)
+    {
+        return data_[0];
+    }
+
+    T& back(void)
+    {
+        return data_[size_];
+    }
+
+    const T& front(void) const
+    {
+        return data_[0];
+    }
+
+    const T& back(void) const
+    {
+        return data_[size_];
+    }
+};  
+    
+/*!
+ * \brief size_t class used to interface between C++ and
+ * OpenCL C calls that require arrays of size_t values, whose
+ * size is known statically.
+ */
+template <int N>
+struct size_t : public cl::vector< ::size_t, N> { };
+
+namespace detail {
+
+// GetInfo help struct
+template <typename Functor, typename T>
+struct GetInfoHelper
+{
+    static cl_int
+    get(Functor f, cl_uint name, T* param)
+    {
+        return f(name, sizeof(T), param, NULL);
+    }
+};
+
+// Specialized GetInfoHelper for VECTOR_CLASS params
+template <typename Func, typename T>
+struct GetInfoHelper<Func, VECTOR_CLASS<T> >
+{
+    static cl_int get(Func f, cl_uint name, VECTOR_CLASS<T>* param)
+    {
+        ::size_t required;
+        cl_int err = f(name, 0, NULL, &required);
+        if (err != CL_SUCCESS) {
+            return err;
+        }
+
+        T* value = (T*) alloca(required);
+        err = f(name, required, value, NULL);
+        if (err != CL_SUCCESS) {
+            return err;
+        }
+
+        param->assign(&value[0], &value[required/sizeof(T)]);
+        return CL_SUCCESS;
+    }
+};
+
+// Specialized for getInfo<CL_PROGRAM_BINARIES>.
+// f is handed the param->size() pointer slots that *param already holds, so
+// the vector must be sized by the caller before this helper is invoked.
+template <typename Func>
+struct GetInfoHelper<Func, VECTOR_CLASS<char *> >
+{
+    static cl_int
+    get(Func f, cl_uint name, VECTOR_CLASS<char *>* param)
+    {
+        // Keep the status signed: OpenCL error codes are negative cl_int
+        // values, so storing them in a cl_uint mixes signedness for no reason.
+        cl_int err = f(name, param->size() * sizeof(char *), &(*param)[0], NULL);
+        return err;
+    }
+};
+
+// Specialized GetInfoHelper for STRING_CLASS params
+template <typename Func>
+struct GetInfoHelper<Func, STRING_CLASS>
+{
+    static cl_int get(Func f, cl_uint name, STRING_CLASS* param)
+    {
+        ::size_t required;
+        cl_int err = f(name, 0, NULL, &required);
+        if (err != CL_SUCCESS) {
+            return err;
+        }
+
+        char* value = (char*) alloca(required);
+        err = f(name, required, value, NULL);
+        if (err != CL_SUCCESS) {
+            return err;
+        }
+
+        *param = value;
+        return CL_SUCCESS;
+    }
+};
+
+#define __GET_INFO_HELPER_WITH_RETAIN(CPP_TYPE) \
+namespace detail { \
+template <typename Func> \
+struct GetInfoHelper<Func, CPP_TYPE> \
+{ \
+    static cl_int get(Func f, cl_uint name, CPP_TYPE* param) \
+    { \
+      cl_int err = f(name, sizeof(CPP_TYPE), param, NULL); /* signed: CL error codes are negative */ \
+      if (err != CL_SUCCESS) { \
+        return err; \
+      } \
+      /* On success, retain the object that (*param)() yields. */ \
+      return ReferenceHandler<CPP_TYPE::cl_type>::retain((*param)()); \
+    } \
+}; \
+}
+
+// X-macro table: F(info enum type, query token, C++ type used to receive the value).
+#define __PARAM_NAME_INFO_1_0(F) \
+    F(cl_platform_info, CL_PLATFORM_PROFILE, STRING_CLASS) \
+    F(cl_platform_info, CL_PLATFORM_VERSION, STRING_CLASS) \
+    F(cl_platform_info, CL_PLATFORM_NAME, STRING_CLASS) \
+    F(cl_platform_info, CL_PLATFORM_VENDOR, STRING_CLASS) \
+    F(cl_platform_info, CL_PLATFORM_EXTENSIONS, STRING_CLASS) \
+    \
+    F(cl_device_info, CL_DEVICE_TYPE, cl_device_type) \
+    F(cl_device_info, CL_DEVICE_VENDOR_ID, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_COMPUTE_UNITS, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_WORK_GROUP_SIZE, ::size_t) \
+    F(cl_device_info, CL_DEVICE_MAX_WORK_ITEM_SIZES, VECTOR_CLASS< ::size_t>) \
+    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, cl_uint) \
+    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, cl_uint) \
+    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, cl_uint) \
+    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, cl_uint) \
+    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, cl_uint) \
+    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_CLOCK_FREQUENCY, cl_uint) \
+    F(cl_device_info, CL_DEVICE_ADDRESS_BITS, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_READ_IMAGE_ARGS, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_MEM_ALLOC_SIZE, cl_ulong) \
+    F(cl_device_info, CL_DEVICE_IMAGE2D_MAX_WIDTH, ::size_t) \
+    F(cl_device_info, CL_DEVICE_IMAGE2D_MAX_HEIGHT, ::size_t) \
+    F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_WIDTH, ::size_t) \
+    F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_HEIGHT, ::size_t) \
+    F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_DEPTH, ::size_t) \
+    F(cl_device_info, CL_DEVICE_IMAGE_SUPPORT, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_PARAMETER_SIZE, ::size_t) \
+    F(cl_device_info, CL_DEVICE_MAX_SAMPLERS, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MEM_BASE_ADDR_ALIGN, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, cl_uint) \
+    F(cl_device_info, CL_DEVICE_SINGLE_FP_CONFIG, cl_device_fp_config) \
+    F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, cl_device_mem_cache_type) \
+    F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, cl_uint)\
+    F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, cl_ulong) \
+    F(cl_device_info, CL_DEVICE_GLOBAL_MEM_SIZE, cl_ulong) \
+    F(cl_device_info, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, cl_ulong) \
+    F(cl_device_info, CL_DEVICE_MAX_CONSTANT_ARGS, cl_uint) \
+    F(cl_device_info, CL_DEVICE_LOCAL_MEM_TYPE, cl_device_local_mem_type) \
+    F(cl_device_info, CL_DEVICE_LOCAL_MEM_SIZE, cl_ulong) \
+    F(cl_device_info, CL_DEVICE_ERROR_CORRECTION_SUPPORT, cl_bool) \
+    F(cl_device_info, CL_DEVICE_PROFILING_TIMER_RESOLUTION, ::size_t) \
+    F(cl_device_info, CL_DEVICE_ENDIAN_LITTLE, cl_bool) \
+    F(cl_device_info, CL_DEVICE_AVAILABLE, cl_bool) \
+    F(cl_device_info, CL_DEVICE_COMPILER_AVAILABLE, cl_bool) \
+    F(cl_device_info, CL_DEVICE_EXECUTION_CAPABILITIES, cl_device_exec_capabilities) \
+    F(cl_device_info, CL_DEVICE_QUEUE_PROPERTIES, cl_command_queue_properties) \
+    F(cl_device_info, CL_DEVICE_PLATFORM, cl_platform_id) \
+    F(cl_device_info, CL_DEVICE_NAME, STRING_CLASS) \
+    F(cl_device_info, CL_DEVICE_VENDOR, STRING_CLASS) \
+    F(cl_device_info, CL_DRIVER_VERSION, STRING_CLASS) \
+    F(cl_device_info, CL_DEVICE_PROFILE, STRING_CLASS) \
+    F(cl_device_info, CL_DEVICE_VERSION, STRING_CLASS) \
+    F(cl_device_info, CL_DEVICE_EXTENSIONS, STRING_CLASS) \
+    \
+    F(cl_context_info, CL_CONTEXT_REFERENCE_COUNT, cl_uint) \
+    F(cl_context_info, CL_CONTEXT_DEVICES, VECTOR_CLASS<Device>) \
+    F(cl_context_info, CL_CONTEXT_PROPERTIES, VECTOR_CLASS<cl_context_properties>) \
+    \
+    F(cl_event_info, CL_EVENT_COMMAND_QUEUE, cl::CommandQueue) \
+    F(cl_event_info, CL_EVENT_COMMAND_TYPE, cl_command_type) \
+    F(cl_event_info, CL_EVENT_REFERENCE_COUNT, cl_uint) \
+    F(cl_event_info, CL_EVENT_COMMAND_EXECUTION_STATUS, cl_uint) \
+    \
+    F(cl_profiling_info, CL_PROFILING_COMMAND_QUEUED, cl_ulong) \
+    F(cl_profiling_info, CL_PROFILING_COMMAND_SUBMIT, cl_ulong) \
+    F(cl_profiling_info, CL_PROFILING_COMMAND_START, cl_ulong) \
+    F(cl_profiling_info, CL_PROFILING_COMMAND_END, cl_ulong) \
+    \
+    F(cl_mem_info, CL_MEM_TYPE, cl_mem_object_type) \
+    F(cl_mem_info, CL_MEM_FLAGS, cl_mem_flags) \
+    F(cl_mem_info, CL_MEM_SIZE, ::size_t) \
+    F(cl_mem_info, CL_MEM_HOST_PTR, void*) \
+    F(cl_mem_info, CL_MEM_MAP_COUNT, cl_uint) \
+    F(cl_mem_info, CL_MEM_REFERENCE_COUNT, cl_uint) \
+    F(cl_mem_info, CL_MEM_CONTEXT, cl::Context) \
+    \
+    F(cl_image_info, CL_IMAGE_FORMAT, cl_image_format) \
+    F(cl_image_info, CL_IMAGE_ELEMENT_SIZE, ::size_t) \
+    F(cl_image_info, CL_IMAGE_ROW_PITCH, ::size_t) \
+    F(cl_image_info, CL_IMAGE_SLICE_PITCH, ::size_t) \
+    F(cl_image_info, CL_IMAGE_WIDTH, ::size_t) \
+    F(cl_image_info, CL_IMAGE_HEIGHT, ::size_t) \
+    F(cl_image_info, CL_IMAGE_DEPTH, ::size_t) \
+    \
+    F(cl_sampler_info, CL_SAMPLER_REFERENCE_COUNT, cl_uint) \
+    F(cl_sampler_info, CL_SAMPLER_CONTEXT, cl::Context) \
+    F(cl_sampler_info, CL_SAMPLER_NORMALIZED_COORDS, cl_bool) \
+    F(cl_sampler_info, CL_SAMPLER_ADDRESSING_MODE, cl_addressing_mode) \
+    F(cl_sampler_info, CL_SAMPLER_FILTER_MODE, cl_filter_mode) \
+    \
+    F(cl_program_info, CL_PROGRAM_REFERENCE_COUNT, cl_uint) \
+    F(cl_program_info, CL_PROGRAM_CONTEXT, cl::Context) \
+    F(cl_program_info, CL_PROGRAM_NUM_DEVICES, cl_uint) \
+    F(cl_program_info, CL_PROGRAM_DEVICES, VECTOR_CLASS<cl_device_id>) \
+    F(cl_program_info, CL_PROGRAM_SOURCE, STRING_CLASS) \
+    F(cl_program_info, CL_PROGRAM_BINARY_SIZES, VECTOR_CLASS< ::size_t>) \
+    F(cl_program_info, CL_PROGRAM_BINARIES, VECTOR_CLASS<char *>) \
+    \
+    F(cl_program_build_info, CL_PROGRAM_BUILD_STATUS, cl_build_status) \
+    F(cl_program_build_info, CL_PROGRAM_BUILD_OPTIONS, STRING_CLASS) \
+    F(cl_program_build_info, CL_PROGRAM_BUILD_LOG, STRING_CLASS) \
+    \
+    F(cl_kernel_info, CL_KERNEL_FUNCTION_NAME, STRING_CLASS) \
+    F(cl_kernel_info, CL_KERNEL_NUM_ARGS, cl_uint) \
+    F(cl_kernel_info, CL_KERNEL_REFERENCE_COUNT, cl_uint) \
+    F(cl_kernel_info, CL_KERNEL_CONTEXT, cl::Context) \
+    F(cl_kernel_info, CL_KERNEL_PROGRAM, cl::Program) \
+    \
+    F(cl_kernel_work_group_info, CL_KERNEL_WORK_GROUP_SIZE, ::size_t) \
+    F(cl_kernel_work_group_info, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, cl::size_t<3>) \
+    F(cl_kernel_work_group_info, CL_KERNEL_LOCAL_MEM_SIZE, cl_ulong) \
+    \
+    F(cl_command_queue_info, CL_QUEUE_CONTEXT, cl::Context) \
+    F(cl_command_queue_info, CL_QUEUE_DEVICE, cl::Device) \
+    F(cl_command_queue_info, CL_QUEUE_REFERENCE_COUNT, cl_uint) \
+    F(cl_command_queue_info, CL_QUEUE_PROPERTIES, cl_command_queue_properties)
+
+#if defined(CL_VERSION_1_1)
+#define __PARAM_NAME_INFO_1_1(F) \
+    F(cl_context_info, CL_CONTEXT_NUM_DEVICES, cl_uint)\
+    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF, cl_uint) \
+    F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR, cl_uint) \
+    F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT, cl_uint) \
+    F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_INT, cl_uint) \
+    F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG, cl_uint) \
+    F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT, cl_uint) \
+    F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE, cl_uint) \
+    F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF, cl_uint) \
+    F(cl_device_info, CL_DEVICE_DOUBLE_FP_CONFIG, cl_device_fp_config) \
+    F(cl_device_info, CL_DEVICE_HALF_FP_CONFIG, cl_device_fp_config) \
+    F(cl_device_info, CL_DEVICE_HOST_UNIFIED_MEMORY, cl_bool) \
+    \
+    F(cl_mem_info, CL_MEM_ASSOCIATED_MEMOBJECT, cl::Memory) \
+    F(cl_mem_info, CL_MEM_OFFSET, ::size_t) \
+    \
+    F(cl_kernel_work_group_info, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, ::size_t) \
+    F(cl_kernel_work_group_info, CL_KERNEL_PRIVATE_MEM_SIZE, cl_ulong) \
+    \
+    F(cl_event_info, CL_EVENT_CONTEXT, cl::Context)
+#endif // CL_VERSION_1_1
+
+#if defined(USE_CL_DEVICE_FISSION)
+#define __PARAM_NAME_DEVICE_FISSION(F) \
+    F(cl_device_info, CL_DEVICE_PARENT_DEVICE_EXT, cl_device_id) \
+	F(cl_device_info, CL_DEVICE_PARTITION_TYPES_EXT, VECTOR_CLASS<cl_device_partition_property_ext>) \
+	F(cl_device_info, CL_DEVICE_AFFINITY_DOMAINS_EXT, VECTOR_CLASS<cl_device_partition_property_ext>) \
+	F(cl_device_info, CL_DEVICE_REFERENCE_COUNT_EXT , cl_uint) \
+	F(cl_device_info, CL_DEVICE_PARTITION_STYLE_EXT, VECTOR_CLASS<cl_device_partition_property_ext>)
+#endif // USE_CL_DEVICE_FISSION
+
+template <typename enum_type, cl_int Name>
+struct param_traits {};
+
+#define __DECLARE_PARAM_TRAITS(token, param_name, T) \
+struct token;                                        \
+template<>                                           \
+struct param_traits<detail:: token,param_name>       \
+{                                                    \
+    enum { value = param_name };                     \
+    typedef T param_type;                            \
+};
+
+__PARAM_NAME_INFO_1_0(__DECLARE_PARAM_TRAITS);
+#if defined(CL_VERSION_1_1)
+__PARAM_NAME_INFO_1_1(__DECLARE_PARAM_TRAITS);
+#endif // CL_VERSION_1_1
+
+#if defined(USE_CL_DEVICE_FISSION)
+__PARAM_NAME_DEVICE_FISSION(__DECLARE_PARAM_TRAITS);
+#endif // USE_CL_DEVICE_FISSION
+
+#undef __DECLARE_PARAM_TRAITS
+
+// Convenience functions
+
+template <typename Func, typename T>
+inline cl_int
+getInfo(Func f, cl_uint name, T* param)
+{
+    return GetInfoHelper<Func, T>::get(f, name, param);
+}
+
+template <typename Func, typename Arg0>
+struct GetInfoFunctor0
+{
+    Func f_; const Arg0& arg0_;
+    cl_int operator ()(
+        cl_uint param, ::size_t size, void* value, ::size_t* size_ret)
+    { return f_(arg0_, param, size, value, size_ret); }
+};
+
+template <typename Func, typename Arg0, typename Arg1>
+struct GetInfoFunctor1
+{
+    Func f_; const Arg0& arg0_; const Arg1& arg1_;
+    cl_int operator ()(
+        cl_uint param, ::size_t size, void* value, ::size_t* size_ret)
+    { return f_(arg0_, arg1_, param, size, value, size_ret); }
+};
+
+template <typename Func, typename Arg0, typename T>
+inline cl_int getInfo(Func f, const Arg0& arg0, cl_uint name, T* param)
+{
+    // Bind arg0 so the helper is handed a (param, size, value, size_ret) callable.
+    typedef GetInfoFunctor0<Func, Arg0> Bound;
+    Bound bound = { f, arg0 };
+    return GetInfoHelper<Bound, T>::get(bound, name, param);
+}
+
+template <typename Func, typename Arg0, typename Arg1, typename T>
+inline cl_int getInfo(Func f, const Arg0& arg0, const Arg1& arg1, cl_uint name, T* param)
+{
+    // Bind both leading arguments so the helper is handed a (param, size, value, size_ret) callable.
+    typedef GetInfoFunctor1<Func, Arg0, Arg1> Bound;
+    Bound bound = { f, arg0, arg1 };
+    return GetInfoHelper<Bound, T>::get(bound, name, param);
+}
+
+// Primary template has no members: retain()/release() are supplied
+// only by the explicit specializations for each handle type.
+template <typename T> struct ReferenceHandler { };
+
+template <>
+struct ReferenceHandler<cl_device_id>
+{
+    // cl_device_id does not have retain(): the request is answered
+    // with CL_INVALID_DEVICE and the handle is not touched.
+    static cl_int retain(cl_device_id) { return CL_INVALID_DEVICE; }
+
+    // cl_device_id does not have release(): same fixed answer.
+    static cl_int release(cl_device_id) { return CL_INVALID_DEVICE; }
+};
+
+template <>
+struct ReferenceHandler<cl_platform_id>
+{
+    // cl_platform_id does not have retain(): the request is answered
+    // with CL_INVALID_PLATFORM and the handle is not touched.
+    static cl_int retain(cl_platform_id) { return CL_INVALID_PLATFORM; }
+
+    // cl_platform_id does not have release(): same fixed answer.
+    static cl_int release(cl_platform_id) { return CL_INVALID_PLATFORM; }
+};
+
+// cl_context: retain/release forward to clRetainContext/clReleaseContext.
+template <>
+struct ReferenceHandler<cl_context>
+{
+    static cl_int retain(cl_context ctx) { return ::clRetainContext(ctx); }
+
+    static cl_int release(cl_context ctx) { return ::clReleaseContext(ctx); }
+};
+
+// cl_command_queue: retain/release forward to clRetainCommandQueue/clReleaseCommandQueue.
+template <>
+struct ReferenceHandler<cl_command_queue>
+{
+    static cl_int retain(cl_command_queue q) { return ::clRetainCommandQueue(q); }
+
+    static cl_int release(cl_command_queue q) { return ::clReleaseCommandQueue(q); }
+};
+
+// cl_mem: retain/release forward to clRetainMemObject/clReleaseMemObject.
+template <>
+struct ReferenceHandler<cl_mem>
+{
+    static cl_int retain(cl_mem mem) { return ::clRetainMemObject(mem); }
+
+    static cl_int release(cl_mem mem) { return ::clReleaseMemObject(mem); }
+};
+
+// cl_sampler: retain/release forward to clRetainSampler/clReleaseSampler.
+template <>
+struct ReferenceHandler<cl_sampler>
+{
+    static cl_int retain(cl_sampler s) { return ::clRetainSampler(s); }
+
+    static cl_int release(cl_sampler s) { return ::clReleaseSampler(s); }
+};
+
+// cl_program: retain/release forward to clRetainProgram/clReleaseProgram.
+template <>
+struct ReferenceHandler<cl_program>
+{
+    static cl_int retain(cl_program prog) { return ::clRetainProgram(prog); }
+
+    static cl_int release(cl_program prog) { return ::clReleaseProgram(prog); }
+};
+
+// cl_kernel: retain/release forward to clRetainKernel/clReleaseKernel.
+template <>
+struct ReferenceHandler<cl_kernel>
+{
+    static cl_int retain(cl_kernel k) { return ::clRetainKernel(k); }
+
+    static cl_int release(cl_kernel k) { return ::clReleaseKernel(k); }
+};
+
+// cl_event: retain/release forward to clRetainEvent/clReleaseEvent.
+template <>
+struct ReferenceHandler<cl_event>
+{
+    static cl_int retain(cl_event ev) { return ::clRetainEvent(ev); }
+
+    static cl_int release(cl_event ev) { return ::clReleaseEvent(ev); }
+};
+
+template <typename T>
+class Wrapper
+{   // Holds one handle of type T; reference counting is delegated to ReferenceHandler<T>.
+public:
+    typedef T cl_type;
+
+protected:
+    cl_type object_; // the wrapped handle; NULL means the wrapper is empty
+
+public:
+    Wrapper() : object_(NULL) { } // starts empty
+
+    ~Wrapper()
+    {
+        if (object_ != NULL) { release(); } // give up this wrapper's reference
+    }
+
+    Wrapper(const Wrapper<cl_type>& rhs)
+    {
+        object_ = rhs.object_;
+        if (object_ != NULL) { retain(); } // the copy takes a reference of its own
+    }
+    // Assignment retains the incoming handle before releasing the current one, so self-assignment never releases the last reference.
+    Wrapper<cl_type>& operator = (const Wrapper<cl_type>& rhs)
+    {
+        if (rhs.object_ != NULL) { rhs.retain(); }
+        if (object_ != NULL) { release(); }
+        object_ = rhs.object_;
+        return *this;
+    }
+
+    cl_type operator ()() const { return object_; } // read the raw handle
+
+    cl_type& operator ()() { return object_; } // reference to the stored handle; writes through it bypass retain/release
+
+protected:
+    // Both helpers return the status reported by ReferenceHandler<cl_type>.
+    cl_int retain() const
+    {
+        return ReferenceHandler<cl_type>::retain(object_);
+    }
+
+    cl_int release() const
+    {
+        return ReferenceHandler<cl_type>::release(object_);
+    }
+};
+
+#if defined(__CL_ENABLE_EXCEPTIONS) // exception build: every status other than CL_SUCCESS is thrown as Error
+static inline cl_int errHandler (
+    cl_int err,
+    const char * errStr = NULL) // no throw(Error): dynamic exception specifications are ill-formed since C++17
+{
+    if (err != CL_SUCCESS) {
+        throw Error(err, errStr);
+    }
+    return err; // only reached when err == CL_SUCCESS
+}
+#else // status-code build: the code is handed back unchanged and the label is ignored
+static inline cl_int errHandler (cl_int err, const char * /*errStr*/ = NULL)
+{
+    return err;
+}
+#endif // __CL_ENABLE_EXCEPTIONS
+
+} // namespace detail
+//! \endcond
+
+/*! \struct ImageFormat
+ * \brief ImageFormat interface for cl_image_format.
+ */
+struct ImageFormat : public cl_image_format
+{
+    ImageFormat() { } // assigns nothing: the inherited fields are left as they are
+
+    ImageFormat(cl_channel_order order, cl_channel_type type)
+    {   // store the channel order and the channel data type
+        this->image_channel_order = order;
+        this->image_channel_data_type = type;
+    }
+
+    // Copies channel data type and channel order from rhs.
+    ImageFormat& operator = (const ImageFormat& rhs)
+    {
+        if (this == &rhs) { return *this; } // nothing to copy from ourselves
+        image_channel_data_type = rhs.image_channel_data_type;
+        image_channel_order = rhs.image_channel_order;
+        return *this;
+    }
+};
+
+/*! \class Device
+ * \brief Device interface for cl_device_id.
+ */
+class Device : public detail::Wrapper<cl_device_id>
+{
+public:
+    Device(cl_device_id device) { object_ = device; } // stores the raw handle as-is; no retain call is made here
+
+    Device() : detail::Wrapper<cl_type>() { } // empty wrapper
+
+    Device(const Device& device) : detail::Wrapper<cl_type>(device) { } // copy goes through the base copy constructor
+
+    Device& operator = (const Device& rhs)
+    {
+        if (this != &rhs) { // self-assignment is skipped entirely
+            detail::Wrapper<cl_type>::operator=(rhs);
+        }
+        return *this;
+    }
+
+    template <typename T>
+    cl_int getInfo(cl_device_info name, T* param) const // queries `name` through clGetDeviceInfo into *param
+    {
+        return detail::errHandler( // the status passes through errHandler before being returned
+            detail::getInfo(&::clGetDeviceInfo, object_, name, param),
+            __GET_DEVICE_INFO_ERR);
+    }
+
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_device_info, name>::param_type // result type comes from param_traits
+    getInfo(cl_int* err = NULL) const // by-value variant; the status is stored in *err when err is non-NULL
+    {
+        typename detail::param_traits<
+            detail::cl_device_info, name>::param_type param;
+        cl_int result = getInfo(name, &param); // delegates to the pointer-taking overload above
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+
+#if defined(USE_CL_DEVICE_FISSION)
+	cl_int createSubDevices( // writes the sub-devices obtained for `properties` into *devices
+		const cl_device_partition_property_ext * properties,
+		VECTOR_CLASS<Device>* devices)
+	{
+		typedef CL_API_ENTRY cl_int 
+			( CL_API_CALL * PFN_clCreateSubDevicesEXT)(
+				cl_device_id /*in_device*/,
+                const cl_device_partition_property_ext * /* properties */,
+                cl_uint /*num_entries*/,
+                cl_device_id * /*out_devices*/,
+                cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+		static PFN_clCreateSubDevicesEXT pfn_clCreateSubDevicesEXT = NULL;
+		__INIT_CL_EXT_FCN_PTR(clCreateSubDevicesEXT); // presumably fills pfn_clCreateSubDevicesEXT; macro is defined outside this part of the file - confirm
+
+		cl_uint n = 0;
+        cl_int err = pfn_clCreateSubDevicesEXT(object_, properties, 0, NULL, &n); // first call: only the count is requested
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __CREATE_SUB_DEVICES);
+        }
+
+        cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id)); // scratch array from alloca; never freed explicitly
+        err = pfn_clCreateSubDevicesEXT(object_, properties, n, ids, NULL); // second call: fetch n handles
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __CREATE_SUB_DEVICES);
+        }
+
+        devices->assign(&ids[0], &ids[n]); // devices is dereferenced unconditionally, so it must not be NULL
+        return CL_SUCCESS;
+ 	}
+#endif
+};
+
+/*! \class Platform
+ *  \brief Platform interface.
+ */
+class Platform : public detail::Wrapper<cl_platform_id>
+{
+public:
+    static const Platform null(); // declaration only; no definition appears in this part of the file
+
+    Platform(cl_platform_id platform) { object_ = platform; } // stores the raw handle as-is
+
+    Platform() : detail::Wrapper<cl_type>()  { } // empty wrapper
+
+    Platform(const Platform& platform) : detail::Wrapper<cl_type>(platform) { } // copy via the base copy constructor
+
+    Platform& operator = (const Platform& rhs)
+    {
+        if (this != &rhs) { // self-assignment is skipped entirely
+            detail::Wrapper<cl_type>::operator=(rhs);
+        }
+        return *this;
+    }
+
+    cl_int getInfo(cl_platform_info name, STRING_CLASS* param) const // clGetPlatformInfo query; only STRING_CLASS results are accepted
+    {
+        return detail::errHandler(
+            detail::getInfo(&::clGetPlatformInfo, object_, name, param),
+            __GET_PLATFORM_INFO_ERR);
+    }
+
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_platform_info, name>::param_type // result type comes from param_traits
+    getInfo(cl_int* err = NULL) const // by-value variant; the status is stored in *err when err is non-NULL
+    {
+        typename detail::param_traits<
+            detail::cl_platform_info, name>::param_type param;
+        cl_int result = getInfo(name, &param); // delegates to the pointer-taking overload above
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+
+    cl_int getDevices( // writes the devices of `type` on this platform into *devices
+        cl_device_type type,
+        VECTOR_CLASS<Device>* devices) const
+    {
+        cl_uint n = 0;
+        cl_int err = ::clGetDeviceIDs(object_, type, 0, NULL, &n); // first call: only the count is requested
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __GET_DEVICE_IDS_ERR);
+        }
+
+        cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id)); // scratch array from alloca; never freed explicitly
+        err = ::clGetDeviceIDs(object_, type, n, ids, NULL); // second call: fetch n handles
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __GET_DEVICE_IDS_ERR);
+        }
+
+        devices->assign(&ids[0], &ids[n]); // devices is dereferenced unconditionally, so it must not be NULL
+        return CL_SUCCESS;
+    }
+
+#if defined(USE_DX_INTEROP)
+   /*! \brief Get the list of available D3D10 devices.
+     *
+     *  \param d3d_device_source.
+     *
+     *  \param d3d_object.
+     *
+     *  \param d3d_device_set.
+     *
+     *  \param devices returns a vector of OpenCL D3D10 devices found. The cl::Device
+     *  values returned in devices can be used to identify a specific OpenCL
+     *  device. \a devices must not be NULL: it is dereferenced unconditionally.
+     *
+     *  \return One of the following values:
+     *    - CL_SUCCESS if the function is executed successfully.
+     *
+     *  The application can query specific capabilities of the OpenCL device(s)
+     *  returned by cl::getDevices. This can be used by the application to
+     *  determine which device(s) to use.
+     *
+     * \note In the case that exceptions are enabled and a return value
+     * other than CL_SUCCESS is generated, then cl::Error exception is
+     * generated.
+     */
+    cl_int getDevices(
+        cl_d3d10_device_source_khr d3d_device_source,
+        void *                     d3d_object,
+        cl_d3d10_device_set_khr    d3d_device_set,
+        VECTOR_CLASS<Device>* devices) const
+    {
+        typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clGetDeviceIDsFromD3D10KHR)(
+            cl_platform_id platform, 
+            cl_d3d10_device_source_khr d3d_device_source, 
+            void * d3d_object,
+            cl_d3d10_device_set_khr d3d_device_set,
+            cl_uint num_entries,
+            cl_device_id * devices,
+            cl_uint* num_devices);
+
+        static PFN_clGetDeviceIDsFromD3D10KHR pfn_clGetDeviceIDsFromD3D10KHR = NULL;
+        __INIT_CL_EXT_FCN_PTR(clGetDeviceIDsFromD3D10KHR); // presumably fills pfn_clGetDeviceIDsFromD3D10KHR; macro is defined outside this part of the file - confirm
+
+        cl_uint n = 0;
+        cl_int err = pfn_clGetDeviceIDsFromD3D10KHR( // first call: only the count is requested
+            object_, 
+            d3d_device_source, 
+            d3d_object,
+            d3d_device_set, 
+            0, 
+            NULL, 
+            &n);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __GET_DEVICE_IDS_ERR);
+        }
+
+        cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id)); // scratch array from alloca; never freed explicitly
+        err = pfn_clGetDeviceIDsFromD3D10KHR( // second call: fetch n handles
+            object_, 
+            d3d_device_source, 
+            d3d_object,
+            d3d_device_set,
+            n, 
+            ids, 
+            NULL);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __GET_DEVICE_IDS_ERR);
+        }
+
+        devices->assign(&ids[0], &ids[n]);
+        return CL_SUCCESS;
+    }
+#endif
+
+    static cl_int get( // writes every platform reported by clGetPlatformIDs into *platforms
+        VECTOR_CLASS<Platform>* platforms)
+    {
+        cl_uint n = 0;
+        cl_int err = ::clGetPlatformIDs(0, NULL, &n); // first call: only the count is requested
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
+        }
+
+        cl_platform_id* ids = (cl_platform_id*) alloca( // scratch array from alloca; never freed explicitly
+            n * sizeof(cl_platform_id));
+        err = ::clGetPlatformIDs(n, ids, NULL); // second call: fetch n handles
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
+        }
+
+        platforms->assign(&ids[0], &ids[n]); // platforms is dereferenced unconditionally, so it must not be NULL
+        return CL_SUCCESS;
+    }
+};
+
+static inline cl_int UnloadCompiler()
+{   // Plain forwarder: hands back whatever status clUnloadCompiler() reports.
+    const cl_int status = ::clUnloadCompiler();
+    return status;
+}
+
+class Context : public detail::Wrapper<cl_context>
+{   // Wrapper around a cl_context handle; reference counting is inherited from detail::Wrapper.
+public:
+    Context( // builds a context for an explicit device list via clCreateContext
+        const VECTOR_CLASS<Device>& devices,
+        cl_context_properties* properties = NULL,
+        void (CL_CALLBACK * notifyFptr)(
+            const char *,
+            const void *,
+            ::size_t,
+            void *) = NULL,
+        void* data = NULL,
+        cl_int* err = NULL) // receives the creation status when non-NULL
+    {
+        cl_int error;
+        object_ = ::clCreateContext(
+            properties, (cl_uint) devices.size(),
+            devices.size() != 0 ? (cl_device_id*) &devices.front() : (cl_device_id*) NULL, // front() on an empty std::vector is undefined; pass NULL instead
+            notifyFptr, data, &error);
+
+        detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR); // NOTE(review): label macro is named for clCreateContextFromType although clCreateContext was called - confirm
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    Context( // builds a context from a device type via clCreateContextFromType
+        cl_device_type type,
+        cl_context_properties* properties = NULL,
+        void (CL_CALLBACK * notifyFptr)(
+            const char *,
+            const void *,
+            ::size_t,
+            void *) = NULL,
+        void* data = NULL,
+        cl_int* err = NULL) // receives the creation status when non-NULL
+    {
+        cl_int error;
+        object_ = ::clCreateContextFromType(
+            properties, type, notifyFptr, data, &error);
+
+        detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    Context() : detail::Wrapper<cl_type>() { } // empty wrapper
+
+    Context(const Context& context) : detail::Wrapper<cl_type>(context) { } // copy via the base copy constructor
+
+    Context& operator = (const Context& rhs)
+    {
+        if (this != &rhs) { // self-assignment is skipped entirely
+            detail::Wrapper<cl_type>::operator=(rhs);
+        }
+        return *this;
+    }
+
+    template <typename T>
+    cl_int getInfo(cl_context_info name, T* param) const // queries `name` through clGetContextInfo into *param
+    {
+        return detail::errHandler(
+            detail::getInfo(&::clGetContextInfo, object_, name, param),
+            __GET_CONTEXT_INFO_ERR);
+    }
+
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_context_info, name>::param_type // result type comes from param_traits
+    getInfo(cl_int* err = NULL) const // by-value variant; the status is stored in *err when err is non-NULL
+    {
+        typename detail::param_traits<
+            detail::cl_context_info, name>::param_type param;
+        cl_int result = getInfo(name, &param); // delegates to the pointer-taking overload above
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+
+    cl_int getSupportedImageFormats( // writes the formats reported for (flags, type) into *formats
+        cl_mem_flags flags,
+        cl_mem_object_type type,
+        VECTOR_CLASS<ImageFormat>* formats) const
+    {
+        cl_uint numEntries;
+        cl_int err = ::clGetSupportedImageFormats( // first call: only the count is requested
+           object_,
+           flags,
+           type,
+           0,
+           NULL,
+           &numEntries);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __GET_SUPPORTED_IMAGE_FORMATS_ERR);
+        }
+
+        ImageFormat* value = (ImageFormat*) // scratch array from alloca; never freed explicitly
+            alloca(numEntries * sizeof(ImageFormat));
+        err = ::clGetSupportedImageFormats( // second call: fetch numEntries formats
+            object_,
+            flags,
+            type,
+            numEntries,
+            (cl_image_format*) value,
+            NULL);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __GET_SUPPORTED_IMAGE_FORMATS_ERR);
+        }
+
+        formats->assign(&value[0], &value[numEntries]); // formats is dereferenced unconditionally, so it must not be NULL
+        return CL_SUCCESS;
+    }
+};
+
+__GET_INFO_HELPER_WITH_RETAIN(cl::Context)
+
+/*! \class Event
+ * \brief Event interface for cl_event.
+ */
+class Event : public detail::Wrapper<cl_event>
+{
+public:
+    Event() : detail::Wrapper<cl_type>() { } // empty wrapper
+
+    Event(const Event& event) : detail::Wrapper<cl_type>(event) { } // copy via the base copy constructor
+
+    Event& operator = (const Event& rhs)
+    {
+        if (this != &rhs) { // self-assignment is skipped entirely
+            detail::Wrapper<cl_type>::operator=(rhs);
+        }
+        return *this;
+    }
+
+    template <typename T>
+    cl_int getInfo(cl_event_info name, T* param) const // queries `name` through clGetEventInfo into *param
+    {
+        return detail::errHandler(
+            detail::getInfo(&::clGetEventInfo, object_, name, param),
+            __GET_EVENT_INFO_ERR);
+    }
+
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_event_info, name>::param_type // result type comes from param_traits
+    getInfo(cl_int* err = NULL) const // by-value variant; the status is stored in *err when err is non-NULL
+    {
+        typename detail::param_traits<
+            detail::cl_event_info, name>::param_type param;
+        cl_int result = getInfo(name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+
+    template <typename T>
+    cl_int getProfilingInfo(cl_profiling_info name, T* param) const // queries `name` through clGetEventProfilingInfo into *param
+    {
+        return detail::errHandler(detail::getInfo(
+            &::clGetEventProfilingInfo, object_, name, param),
+            __GET_EVENT_PROFILE_INFO_ERR);
+    }
+
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_profiling_info, name>::param_type
+    getProfilingInfo(cl_int* err = NULL) const // by-value variant of the profiling query
+    {
+        typename detail::param_traits<
+            detail::cl_profiling_info, name>::param_type param;
+        cl_int result = getProfilingInfo(name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+
+    cl_int wait() const // passes this single event to clWaitForEvents
+    {
+        return detail::errHandler(
+            ::clWaitForEvents(1, &object_),
+            __WAIT_FOR_EVENTS_ERR);
+    }
+
+#if defined(CL_VERSION_1_1)
+    cl_int setCallback( // forwards to clSetEventCallback for this event
+        cl_int type,
+        void (CL_CALLBACK * pfn_notify)(cl_event, cl_int, void *),
+        void * user_data = NULL)
+    {
+        return detail::errHandler(
+            ::clSetEventCallback(
+                object_,
+                type,
+                pfn_notify,
+                user_data),
+            __SET_EVENT_CALLBACK_ERR);
+    }
+#endif
+
+    static cl_int waitForEvents(const VECTOR_CLASS<Event>& events) // passes the whole list to clWaitForEvents
+    {
+        return detail::errHandler(
+            ::clWaitForEvents(
+                (cl_uint) events.size(), // front() on an empty std::vector is undefined, so an empty list is passed as NULL
+                events.size() != 0 ? (cl_event*) &events.front() : (cl_event*) NULL),
+            __WAIT_FOR_EVENTS_ERR);
+    }
+};
+
+__GET_INFO_HELPER_WITH_RETAIN(cl::Event)
+
+#if defined(CL_VERSION_1_1)
+/*! \class UserEvent
+ * \brief User event interface for cl_event.
+ */
+class UserEvent : public Event
+{
+public:
+    UserEvent( // creates the event in `context` via clCreateUserEvent
+        const Context& context,
+        cl_int * err = NULL) // receives the creation status when non-NULL
+    {
+        cl_int error;
+        object_ = ::clCreateUserEvent(
+            context(),
+            &error);
+
+        detail::errHandler(error, __CREATE_USER_EVENT_ERR); // the status passes through errHandler before *err is written
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    UserEvent() : Event() { } // empty wrapper
+
+    UserEvent(const UserEvent& event) : Event(event) { } // copy via the Event copy constructor
+
+    UserEvent& operator = (const UserEvent& rhs)
+    {
+        if (this != &rhs) { // self-assignment is skipped entirely
+            Event::operator=(rhs);
+        }
+        return *this;
+    }
+
+    cl_int setStatus(cl_int status) // forwards `status` to clSetUserEventStatus for this event
+    {
+        return detail::errHandler(
+            ::clSetUserEventStatus(object_,status), 
+            __SET_USER_EVENT_STATUS_ERR);
+    }
+};
+#endif
+
+inline static cl_int
+WaitForEvents(const VECTOR_CLASS<Event>& events) // passes the whole list to clWaitForEvents; status goes through errHandler
+{
+    // front() on an empty std::vector is undefined, so an empty list is passed as NULL.
+    cl_event* list = events.size() != 0 ? (cl_event*) &events.front() : (cl_event*) NULL;
+    return detail::errHandler(
+        ::clWaitForEvents((cl_uint) events.size(), list), __WAIT_FOR_EVENTS_ERR);
+}
+
+/*! \class Memory
+ * \brief Memory interface for cl_mem.
+ */
+class Memory : public detail::Wrapper<cl_mem>
+{
+public:
+    Memory() : detail::Wrapper<cl_type>() { } // empty wrapper
+
+    Memory(const Memory& memory) : detail::Wrapper<cl_type>(memory) { } // copy via the base copy constructor
+
+    Memory& operator = (const Memory& rhs)
+    {
+        if (this != &rhs) { // self-assignment is skipped entirely
+            detail::Wrapper<cl_type>::operator=(rhs);
+        }
+        return *this;
+    }
+
+    template <typename T>
+    cl_int getInfo(cl_mem_info name, T* param) const // queries `name` through clGetMemObjectInfo into *param
+    {
+        return detail::errHandler(
+            detail::getInfo(&::clGetMemObjectInfo, object_, name, param),
+            __GET_MEM_OBJECT_INFO_ERR);
+    }
+
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_mem_info, name>::param_type // result type comes from param_traits
+    getInfo(cl_int* err = NULL) const // by-value variant; the status is stored in *err when err is non-NULL
+    {
+        typename detail::param_traits<
+            detail::cl_mem_info, name>::param_type param;
+        cl_int result = getInfo(name, &param); // delegates to the pointer-taking overload above
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+
+#if defined(CL_VERSION_1_1)
+    cl_int setDestructorCallback( // forwards to clSetMemObjectDestructorCallback for this object
+        void (CL_CALLBACK * pfn_notify)(cl_mem, void *),		
+        void * user_data = NULL)
+    {
+        return detail::errHandler(
+            ::clSetMemObjectDestructorCallback(
+                object_,
+                pfn_notify,
+                user_data), 
+            __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR);
+    }
+#endif
+
+};
+
+__GET_INFO_HELPER_WITH_RETAIN(cl::Memory)
+
+/*! \class Buffer
+ * \brief Memory buffer interface.
+ */
+class Buffer : public Memory
+{
+public:
+    Buffer( // creates the buffer by forwarding all arguments to clCreateBuffer
+        const Context& context,
+        cl_mem_flags flags,
+        ::size_t size,
+        void* host_ptr = NULL,
+        cl_int* err = NULL) // receives the creation status when non-NULL
+    {
+        cl_int error;
+        object_ = ::clCreateBuffer(context(), flags, size, host_ptr, &error);
+
+        detail::errHandler(error, __CREATE_BUFFER_ERR); // the status passes through errHandler before *err is written
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    Buffer() : Memory() { } // empty wrapper
+
+    Buffer(const Buffer& buffer) : Memory(buffer) { } // copy via the Memory copy constructor
+
+    Buffer& operator = (const Buffer& rhs)
+    {
+        if (this != &rhs) { // self-assignment is skipped entirely
+            Memory::operator=(rhs);
+        }
+        return *this;
+    }
+
+#if defined(CL_VERSION_1_1)
+    Buffer createSubBuffer( // returns a new Buffer holding the handle produced by clCreateSubBuffer on this buffer
+        cl_mem_flags flags,
+        cl_buffer_create_type buffer_create_type,
+        const void * buffer_create_info,
+        cl_int * err = NULL) // receives the creation status when non-NULL
+    {
+        Buffer result;
+        cl_int error;
+        result.object_ = ::clCreateSubBuffer(
+            object_, 
+            flags, 
+            buffer_create_type, 
+            buffer_create_info, 
+            &error);
+
+        detail::errHandler(error, __CREATE_SUBBUFFER_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+
+        return result;
+	}		
+#endif
+};
+
+#if defined (USE_DX_INTEROP)
+class BufferD3D10 : public Buffer // Buffer whose handle comes from an ID3D10Buffer
+{
+public:
+    typedef CL_API_ENTRY cl_mem (CL_API_CALL *PFN_clCreateFromD3D10BufferKHR)( // signature of the function pointer called below
+    cl_context context, cl_mem_flags flags, ID3D10Buffer*  buffer,
+    cl_int* errcode_ret);
+
+    BufferD3D10(
+        const Context& context,
+        cl_mem_flags flags,
+        ID3D10Buffer* bufobj,
+        cl_int * err = NULL) // receives the creation status when non-NULL
+    {
+        static PFN_clCreateFromD3D10BufferKHR pfn_clCreateFromD3D10BufferKHR = NULL;
+        __INIT_CL_EXT_FCN_PTR(clCreateFromD3D10BufferKHR); // presumably fills pfn_clCreateFromD3D10BufferKHR; macro is defined outside this part of the file - confirm
+
+        cl_int error;
+        object_ = pfn_clCreateFromD3D10BufferKHR(
+            context(),
+            flags,
+            bufobj,
+            &error);
+
+        detail::errHandler(error, __CREATE_GL_BUFFER_ERR); // NOTE(review): a GL-named error label is used for a D3D10 call - confirm intended
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    BufferD3D10() : Buffer() { } // empty wrapper
+
+    BufferD3D10(const BufferD3D10& buffer) : Buffer(buffer) { } // copy via the Buffer copy constructor
+
+    BufferD3D10& operator = (const BufferD3D10& rhs)
+    {
+        if (this != &rhs) { // self-assignment is skipped entirely
+            Buffer::operator=(rhs);
+        }
+        return *this;
+    }
+};
+#endif
+
+/*! \class BufferGL
+ * \brief Memory buffer interface for GL interop.
+ */
+class BufferGL : public Buffer
+{
+public:
+    BufferGL( // wraps the handle returned by clCreateFromGLBuffer for `bufobj`
+        const Context& context,
+        cl_mem_flags flags,
+        GLuint bufobj,
+        cl_int * err = NULL) // receives the creation status when non-NULL
+    {
+        cl_int error;
+        object_ = ::clCreateFromGLBuffer(
+            context(),
+            flags,
+            bufobj,
+            &error);
+
+        detail::errHandler(error, __CREATE_GL_BUFFER_ERR); // the status passes through errHandler before *err is written
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    BufferGL() : Buffer() { } // empty wrapper
+
+    BufferGL(const BufferGL& buffer) : Buffer(buffer) { } // copy via the Buffer copy constructor
+
+    BufferGL& operator = (const BufferGL& rhs)
+    {
+        if (this != &rhs) { // self-assignment is skipped entirely
+            Buffer::operator=(rhs);
+        }
+        return *this;
+    }
+
+    cl_int getObjectInfo( // forwards both output pointers to clGetGLObjectInfo for this object
+        cl_gl_object_type *type,
+        GLuint * gl_object_name)
+    {
+        return detail::errHandler(
+            ::clGetGLObjectInfo(object_,type,gl_object_name),
+            __GET_GL_OBJECT_INFO_ERR);
+    }
+};
+
+/*! \class BufferRenderGL
+ * \brief Memory buffer interface for GL interop with renderbuffer.
+ */
+class BufferRenderGL : public Buffer
+{
+public:
+    BufferRenderGL( // wraps the handle returned by clCreateFromGLRenderbuffer for `bufobj`
+        const Context& context,
+        cl_mem_flags flags,
+        GLuint bufobj,
+        cl_int * err = NULL) // receives the creation status when non-NULL
+    {
+        cl_int error;
+        object_ = ::clCreateFromGLRenderbuffer(
+            context(),
+            flags,
+            bufobj,
+            &error);
+
+        detail::errHandler(error, __CREATE_GL_BUFFER_ERR); // the status passes through errHandler before *err is written
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    BufferRenderGL() : Buffer() { } // empty wrapper
+
+    BufferRenderGL(const BufferRenderGL& buffer) : Buffer(buffer) { } // copy constructor; takes its own type, not BufferGL, so a BufferGL no longer converts implicitly
+
+    BufferRenderGL& operator = (const BufferRenderGL& rhs)
+    {
+        if (this != &rhs) { // self-assignment is skipped entirely
+            Buffer::operator=(rhs);
+        }
+        return *this;
+    }
+
+    cl_int getObjectInfo( // forwards both output pointers to clGetGLObjectInfo for this object
+        cl_gl_object_type *type,
+        GLuint * gl_object_name)
+    {
+        return detail::errHandler(
+            ::clGetGLObjectInfo(object_,type,gl_object_name),
+            __GET_GL_OBJECT_INFO_ERR);
+    }
+};
+
+/*! \class Image
+ * \brief Base class interface for all images.
+ */
+class Image : public Memory
+{
+protected: // construction, copy and assignment are reachable only from derived classes
+    Image() : Memory() { }
+
+    Image(const Image& image) : Memory(image) { }
+
+    Image& operator = (const Image& rhs)
+    {
+        if (this != &rhs) { // self-assignment is skipped entirely
+            Memory::operator=(rhs);
+        }
+        return *this;
+    }
+public:
+    template <typename T>
+    cl_int getImageInfo(cl_image_info name, T* param) const // queries `name` through clGetImageInfo into *param
+    {
+        return detail::errHandler(
+            detail::getInfo(&::clGetImageInfo, object_, name, param),
+            __GET_IMAGE_INFO_ERR);
+    }
+
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_image_info, name>::param_type // result type comes from param_traits
+    getImageInfo(cl_int* err = NULL) const // by-value variant; the status is stored in *err when err is non-NULL
+    {
+        typename detail::param_traits<
+            detail::cl_image_info, name>::param_type param;
+        cl_int result = getImageInfo(name, &param); // delegates to the pointer-taking overload above
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+};
+
+/*! \class Image2D
+ * \brief Image interface for 2D images.
+ */
+class Image2D : public Image
+{
+public:
+    Image2D( // creates the image by forwarding all arguments to clCreateImage2D
+        const Context& context,
+        cl_mem_flags flags,
+        ImageFormat format, // taken by value; its address is what the create call receives
+        ::size_t width,
+        ::size_t height,
+        ::size_t row_pitch = 0,
+        void* host_ptr = NULL,
+        cl_int* err = NULL) // receives the creation status when non-NULL
+    {
+        cl_int error;
+        object_ = ::clCreateImage2D(
+            context(), flags,&format, width, height, row_pitch, host_ptr, &error);
+
+        detail::errHandler(error, __CREATE_IMAGE2D_ERR); // the status passes through errHandler before *err is written
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    Image2D() { } // empty wrapper
+
+    Image2D(const Image2D& image2D) : Image(image2D) { } // copy via the Image copy constructor
+
+    Image2D& operator = (const Image2D& rhs)
+    {
+        if (this != &rhs) { // self-assignment is skipped entirely
+            Image::operator=(rhs);
+        }
+        return *this;
+    }
+};
+
+/*! \class Image2DGL
+ * \brief 2D image interface for GL interop.
+ */
+class Image2DGL : public Image2D
+{
+public:
+    Image2DGL( // wraps the handle returned by clCreateFromGLTexture2D
+        const Context& context,
+        cl_mem_flags flags,
+        GLenum target,
+        GLint  miplevel,
+        GLuint texobj,
+        cl_int * err = NULL) // receives the creation status when non-NULL
+    {
+        cl_int error;
+        object_ = ::clCreateFromGLTexture2D(
+            context(),
+            flags,
+            target,
+            miplevel,
+            texobj,
+            &error);
+
+        detail::errHandler(error, __CREATE_GL_BUFFER_ERR); // NOTE(review): the GL-buffer error label is used for a texture call - confirm intended
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    Image2DGL() : Image2D() { } // empty wrapper
+
+    Image2DGL(const Image2DGL& image) : Image2D(image) { } // copy via the Image2D copy constructor
+
+    Image2DGL& operator = (const Image2DGL& rhs)
+    {
+        if (this != &rhs) { // self-assignment is skipped entirely
+            Image2D::operator=(rhs);
+        }
+        return *this;
+    }
+};
+
+/*! \class Image3D
+ * \brief Image interface for 3D images.
+ */
+class Image3D : public Image
+{
+public:
+    Image3D( // creates the image by forwarding all arguments to clCreateImage3D
+        const Context& context,
+        cl_mem_flags flags,
+        ImageFormat format, // taken by value; its address is what the create call receives
+        ::size_t width,
+        ::size_t height,
+        ::size_t depth,
+        ::size_t row_pitch = 0,
+        ::size_t slice_pitch = 0,
+        void* host_ptr = NULL,
+        cl_int* err = NULL) // receives the creation status when non-NULL
+    {
+        cl_int error;
+        object_ = ::clCreateImage3D(
+            context(), flags, &format, width, height, depth, row_pitch,
+            slice_pitch, host_ptr, &error);
+
+        detail::errHandler(error, __CREATE_IMAGE3D_ERR); // the status passes through errHandler before *err is written
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    Image3D() { } // empty wrapper
+
+    Image3D(const Image3D& image3D) : Image(image3D) { } // copy via the Image copy constructor
+
+    Image3D& operator = (const Image3D& rhs)
+    {
+        if (this != &rhs) { // self-assignment is skipped entirely
+            Image::operator=(rhs);
+        }
+        return *this;
+    }
+};
+
+/*! \class Image3DGL
+ * \brief 3D image interface for GL interop.
+ */
+class Image3DGL : public Image3D
+{
+public:
+    Image3DGL( // wraps the handle returned by clCreateFromGLTexture3D
+        const Context& context,
+        cl_mem_flags flags,
+        GLenum target,
+        GLint  miplevel,
+        GLuint texobj,
+        cl_int * err = NULL) // receives the creation status when non-NULL
+    {
+        cl_int error;
+        object_ = ::clCreateFromGLTexture3D(
+            context(),
+            flags,
+            target,
+            miplevel,
+            texobj,
+            &error);
+
+        detail::errHandler(error, __CREATE_GL_BUFFER_ERR); // NOTE(review): the GL-buffer error label is used for a texture call - confirm intended
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    Image3DGL() : Image3D() { } // empty wrapper
+
+    Image3DGL(const Image3DGL& image) : Image3D(image) { } // copy via the Image3D copy constructor
+
+    Image3DGL& operator = (const Image3DGL& rhs)
+    {
+        if (this != &rhs) { // self-assignment is skipped entirely
+            Image3D::operator=(rhs);
+        }
+        return *this;
+    }
+};
+
+/*! \class Sampler
+ * \brief Sampler interface for cl_sampler.
+ */
+class Sampler : public detail::Wrapper<cl_sampler>
+{
+public:
+    Sampler() { } // empty wrapper
+
+    Sampler( // creates the sampler by forwarding all arguments to clCreateSampler
+        const Context& context,
+        cl_bool normalized_coords,
+        cl_addressing_mode addressing_mode,
+        cl_filter_mode filter_mode,
+        cl_int* err = NULL) // receives the creation status when non-NULL
+    {
+        cl_int error;
+        object_ = ::clCreateSampler(
+            context(), 
+            normalized_coords,
+            addressing_mode,
+            filter_mode,
+            &error);
+
+        detail::errHandler(error, __CREATE_SAMPLER_ERR); // the status passes through errHandler before *err is written
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    Sampler(const Sampler& sampler) : detail::Wrapper<cl_type>(sampler) { } // copy via the base copy constructor
+
+    Sampler& operator = (const Sampler& rhs)
+    {
+        if (this != &rhs) { // self-assignment is skipped entirely
+            detail::Wrapper<cl_type>::operator=(rhs);
+        }
+        return *this;
+    }
+
+    template <typename T>
+    cl_int getInfo(cl_sampler_info name, T* param) const // queries `name` through clGetSamplerInfo into *param
+    {
+        return detail::errHandler(
+            detail::getInfo(&::clGetSamplerInfo, object_, name, param),
+            __GET_SAMPLER_INFO_ERR);
+    }
+
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_sampler_info, name>::param_type // result type comes from param_traits
+    getInfo(cl_int* err = NULL) const // by-value variant; the status is stored in *err when err is non-NULL
+    {
+        typename detail::param_traits<
+            detail::cl_sampler_info, name>::param_type param;
+        cl_int result = getInfo(name, &param); // delegates to the pointer-taking overload above
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+};
+
+__GET_INFO_HELPER_WITH_RETAIN(cl::Sampler)
+
+class Program;
+class CommandQueue;
+class Kernel;
+
+/*! \class NDRange
+ * \brief NDRange interface
+ */
+class NDRange
+{
+private:
+    size_t<3> sizes_; // per-dimension sizes, appended in order by the constructors
+    cl_uint dimensions_; // how many sizes the chosen constructor supplied (0 to 3)
+
+public:
+    NDRange() // zero-dimensional range: nothing is appended to sizes_
+        : dimensions_(0)
+    { }
+
+    NDRange(::size_t size0) // one-dimensional range
+        : dimensions_(1)
+    {
+        sizes_.push_back(size0);
+    }
+
+    NDRange(::size_t size0, ::size_t size1) // two-dimensional range
+        : dimensions_(2)
+    {
+        sizes_.push_back(size0);
+        sizes_.push_back(size1);
+    }
+
+    NDRange(::size_t size0, ::size_t size1, ::size_t size2) // three-dimensional range
+        : dimensions_(3)
+    {
+        sizes_.push_back(size0);
+        sizes_.push_back(size1);
+        sizes_.push_back(size2);
+    }
+
+    operator const ::size_t*() const { return (const ::size_t*) sizes_; } // presumably relies on a conversion provided by size_t<3>, which is defined outside this part of the file - confirm
+    ::size_t dimensions() const { return dimensions_; } // the count fixed at construction
+};
+
+static const NDRange NullRange;
+
+/*!
+ * \struct LocalSpaceArg
+ * \brief Local address wrapper for use with Kernel::setArg
+ */
+struct LocalSpaceArg
+{
+    //! Size reported by detail::KernelArgumentHandler<LocalSpaceArg>::size().
+    ::size_t size_;
+};
+
+namespace detail {
+
+/*! \brief Supplies the (size, pointer) pair that Kernel::setArg forwards to
+ *         clSetKernelArg for a value of type T.
+ *
+ * The generic version passes the value itself: sizeof(T) bytes at its address.
+ */
+template <typename T>
+struct KernelArgumentHandler
+{
+    static ::size_t size(const T&) { return sizeof(T); }
+    static T* ptr(T& value) { return &value; }
+};
+
+/*! \brief Specialization for LocalSpaceArg: the size comes from the size_
+ *         member and the argument pointer is always NULL.
+ */
+template <>
+struct KernelArgumentHandler<LocalSpaceArg>
+{
+    static ::size_t size(const LocalSpaceArg& value) { return value.size_; }
+    static void* ptr(LocalSpaceArg&) { return NULL; }
+};
+
+} 
+//! \endcond
+
+/*! \brief Builds a LocalSpaceArg whose size_ member is \a size, for use
+ *         with Kernel::setArg.
+ */
+inline LocalSpaceArg
+__local(::size_t size)
+{
+    LocalSpaceArg arg;
+    arg.size_ = size;
+    return arg;
+}
+
+class KernelFunctor;
+
+/*! \class Kernel
+ * \brief C++ wrapper around an OpenCL kernel handle (cl_kernel).
+ *
+ * Offers property queries (clGetKernelInfo, clGetKernelWorkGroupInfo) and
+ * argument binding (clSetKernelArg); status codes are routed through
+ * detail::errHandler.
+ */
+class Kernel : public detail::Wrapper<cl_kernel>
+{
+public:
+    //! Creates the kernel called \a name from \a program (defined after Program).
+    inline Kernel(const Program& program, const char* name, cl_int* err = NULL);
+
+    //! Default construction; no kernel is created here.
+    Kernel() { }
+
+    //! Copy construction is delegated to the wrapper base class.
+    Kernel(const Kernel& kernel) : detail::Wrapper<cl_type>(kernel) { }
+
+    //! Assignment is delegated to the wrapper base class; self-assignment is a no-op.
+    Kernel& operator = (const Kernel& rhs)
+    {
+        if (this == &rhs) {
+            return *this;
+        }
+        detail::Wrapper<cl_type>::operator=(rhs);
+        return *this;
+    }
+
+    //! Queries \a name through clGetKernelInfo, storing the result in \a param.
+    template <typename T>
+    cl_int getInfo(cl_kernel_info name, T* param) const
+    {
+        const cl_int status =
+            detail::getInfo(&::clGetKernelInfo, object_, name, param);
+        return detail::errHandler(status, __GET_KERNEL_INFO_ERR);
+    }
+
+    /*! \brief Queries the property selected by the template argument and
+     *         returns it by value.
+     *
+     * \param err if non-NULL, receives the status of the query.
+     */
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_kernel_info, name>::param_type
+    getInfo(cl_int* err = NULL) const
+    {
+        typedef typename detail::param_traits<
+            detail::cl_kernel_info, name>::param_type ResultType;
+
+        ResultType value;
+        const cl_int status = getInfo(name, &value);
+        if (err) {
+            *err = status;
+        }
+        return value;
+    }
+
+    //! Queries \a name for \a device through clGetKernelWorkGroupInfo.
+    template <typename T>
+    cl_int getWorkGroupInfo(
+        const Device& device, cl_kernel_work_group_info name, T* param) const
+    {
+        const cl_int status = detail::getInfo(
+            &::clGetKernelWorkGroupInfo, object_, device(), name, param);
+        return detail::errHandler(status, __GET_KERNEL_WORK_GROUP_INFO_ERR);
+    }
+
+    /*! \brief Queries the work-group property selected by the template
+     *         argument for \a device and returns it by value.
+     *
+     * \param err if non-NULL, receives the status of the query.
+     */
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_kernel_work_group_info, name>::param_type
+        getWorkGroupInfo(const Device& device, cl_int* err = NULL) const
+    {
+        typedef typename detail::param_traits<
+            detail::cl_kernel_work_group_info, name>::param_type ResultType;
+
+        ResultType value;
+        const cl_int status = getWorkGroupInfo(device, name, &value);
+        if (err) {
+            *err = status;
+        }
+        return value;
+    }
+
+    /*! \brief Binds \a value to argument \a index with clSetKernelArg.
+     *
+     * The byte count and pointer handed to the runtime come from
+     * detail::KernelArgumentHandler<T>.
+     */
+    template <typename T>
+    cl_int setArg(cl_uint index, T value)
+    {
+        typedef detail::KernelArgumentHandler<T> Handler;
+
+        return detail::errHandler(
+            ::clSetKernelArg(
+                object_, index, Handler::size(value), Handler::ptr(value)),
+            __SET_KERNEL_ARGS_ERR);
+    }
+
+    //! Binds \a size bytes at \a argPtr to argument \a index with clSetKernelArg.
+    cl_int setArg(cl_uint index, ::size_t size, void* argPtr)
+    {
+        const cl_int status = ::clSetKernelArg(object_, index, size, argPtr);
+        return detail::errHandler(status, __SET_KERNEL_ARGS_ERR);
+    }
+
+    //! Declared here, defined elsewhere: binds a queue, offset and ranges.
+    KernelFunctor bind(
+        const CommandQueue& queue,
+        const NDRange& offset,
+        const NDRange& global,
+        const NDRange& local);
+
+    //! Declared here, defined elsewhere: binds a queue and ranges (no offset).
+    KernelFunctor bind(
+        const CommandQueue& queue,
+        const NDRange& global,
+        const NDRange& local);
+};
+
+__GET_INFO_HELPER_WITH_RETAIN(cl::Kernel)
+
+/*! \class Program
+ * \brief Program interface that implements cl_program.
+ */
+class Program : public detail::Wrapper<cl_program>
+{
+public:
+    typedef VECTOR_CLASS<std::pair<const void*, ::size_t> > Binaries;
+    typedef VECTOR_CLASS<std::pair<const char*, ::size_t> > Sources;
+
+    /*! \brief Creates a program from source strings via
+     *         clCreateProgramWithSource.
+     *
+     * \param sources (text pointer, length) pairs; they are unpacked into
+     *                two stack arrays for the runtime call.
+     * \param err     if non-NULL, receives the resulting status code.
+     */
+    Program(
+        const Context& context,
+        const Sources& sources,
+        cl_int* err = NULL)
+    {
+        const ::size_t count = (::size_t)sources.size();
+        ::size_t* sizes = (::size_t*) alloca(count * sizeof(::size_t));
+        const char** texts = (const char**) alloca(count * sizeof(const char*));
+
+        for (::size_t k = 0; k != count; ++k) {
+            texts[k] = sources[(int)k].first;
+            sizes[k] = sources[(int)k].second;
+        }
+
+        cl_int status;
+        object_ = ::clCreateProgramWithSource(
+            context(), (cl_uint)count, texts, sizes, &status);
+
+        detail::errHandler(status, __CREATE_PROGRAM_WITH_SOURCE_ERR);
+        if (err) {
+            *err = status;
+        }
+    }
+
+    /*! \brief Creates a program from pre-built binaries via
+     *         clCreateProgramWithBinary.
+     *
+     * \param context      context handed to the runtime.
+     * \param devices      target devices; the vector's storage is passed as
+     *                     the cl_device_id list (NULL when the vector is empty).
+     * \param binaries     (image pointer, length) pairs, one per device.
+     * \param binaryStatus optional per-device status output; it is padded
+     *                     here to at least devices.size() entries so the
+     *                     runtime has one slot per device to write to.
+     * \param err          if non-NULL, receives the resulting status code.
+     *
+     * If fewer binaries than devices are supplied, no runtime call is made
+     * and the reported status is CL_INVALID_VALUE.
+     */
+    Program(
+        const Context& context,
+        const VECTOR_CLASS<Device>& devices,
+        const Binaries& binaries,
+        VECTOR_CLASS<cl_int>* binaryStatus = NULL,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+        const ::size_t numDevices = devices.size();
+        const ::size_t n = binaries.size();
+
+        if (n < numDevices) {
+            // The runtime reads one (length, image) pair per device; a
+            // shorter list would make it read past the end of both arrays.
+            object_ = NULL;
+            error = CL_INVALID_VALUE;
+            detail::errHandler(error, __CREATE_PROGRAM_WITH_BINARY_ERR);
+            if (err != NULL) {
+                *err = error;
+            }
+            return;
+        }
+
+        ::size_t* lengths = (::size_t*) alloca(n * sizeof(::size_t));
+        const unsigned char** images = (const unsigned char**) alloca(n * sizeof(const void*));
+
+        for (::size_t i = 0; i < n; ++i) {
+            images[i] = (const unsigned char*)binaries[(int)i].first;
+            lengths[i] = binaries[(int)i].second;
+        }
+
+        if (binaryStatus != NULL) {
+            // Bounded by the iteration count so the loop terminates even if
+            // the container cannot grow any further.
+            for (::size_t i = binaryStatus->size(); i < numDevices; ++i) {
+                binaryStatus->push_back(CL_SUCCESS);
+            }
+        }
+
+        // front() must not be called on an empty vector, and the status
+        // array is only handed over when it really has a slot per device.
+        cl_device_id* deviceList = (numDevices > 0)
+            ? (cl_device_id*)&devices.front()
+            : NULL;
+        cl_int* statusList = (binaryStatus != NULL
+                && numDevices > 0
+                && binaryStatus->size() >= numDevices)
+            ? (cl_int*) &binaryStatus->front()
+            : NULL;
+
+        object_ = ::clCreateProgramWithBinary(
+            context(), (cl_uint) numDevices,
+            deviceList,
+            lengths, images, statusList, &error);
+
+        detail::errHandler(error, __CREATE_PROGRAM_WITH_BINARY_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    Program() { }
+
+    Program(const Program& program) : detail::Wrapper<cl_type>(program) { }
+
+    //! Assignment is delegated to the wrapper base class; self-assignment is a no-op.
+    Program& operator = (const Program& rhs)
+    {
+        if (this == &rhs) {
+            return *this;
+        }
+        detail::Wrapper<cl_type>::operator=(rhs);
+        return *this;
+    }
+
+    /*! \brief Builds the program with clBuildProgram.
+     *
+     * \param devices    devices to build for. An empty vector is passed to
+     *                   the runtime as (0, NULL) rather than taking the
+     *                   address of a non-existent first element; per the
+     *                   OpenCL specification a NULL device list requests a
+     *                   build for all devices associated with the program.
+     * \param options    build option string, forwarded unchanged.
+     * \param notifyFptr optional completion callback, forwarded unchanged.
+     * \param data       user pointer forwarded with the callback.
+     * \return the clBuildProgram status passed through detail::errHandler.
+     */
+    cl_int build(
+        const VECTOR_CLASS<Device>& devices,
+        const char* options = NULL,
+        void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
+        void* data = NULL) const
+    {
+        const ::size_t numDevices = devices.size();
+        cl_device_id* deviceList = (numDevices > 0)
+            ? (cl_device_id*)&devices.front()
+            : NULL;
+
+        return detail::errHandler(
+            ::clBuildProgram(
+                object_,
+                (cl_uint) numDevices,
+                deviceList,
+                options,
+                notifyFptr,
+                data),
+                __BUILD_PROGRAM_ERR);
+    }
+
+    //! Queries \a name through clGetProgramInfo, storing the result in \a param.
+    template <typename T>
+    cl_int getInfo(cl_program_info name, T* param) const
+    {
+        const cl_int status =
+            detail::getInfo(&::clGetProgramInfo, object_, name, param);
+        return detail::errHandler(status, __GET_PROGRAM_INFO_ERR);
+    }
+
+    /*! \brief Queries the property selected by the template argument and
+     *         returns it by value.
+     *
+     * \param err if non-NULL, receives the status of the query.
+     */
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_program_info, name>::param_type
+    getInfo(cl_int* err = NULL) const
+    {
+        typedef typename detail::param_traits<
+            detail::cl_program_info, name>::param_type ResultType;
+
+        ResultType value;
+        const cl_int status = getInfo(name, &value);
+        if (err) {
+            *err = status;
+        }
+        return value;
+    }
+
+    //! Queries \a name for \a device through clGetProgramBuildInfo.
+    template <typename T>
+    cl_int getBuildInfo(
+        const Device& device, cl_program_build_info name, T* param) const
+    {
+        const cl_int status = detail::getInfo(
+            &::clGetProgramBuildInfo, object_, device(), name, param);
+        return detail::errHandler(status, __GET_PROGRAM_BUILD_INFO_ERR);
+    }
+
+    /*! \brief Queries the build property selected by the template argument
+     *         for \a device and returns it by value.
+     *
+     * \param err if non-NULL, receives the status of the query.
+     */
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_program_build_info, name>::param_type
+    getBuildInfo(const Device& device, cl_int* err = NULL) const
+    {
+        typedef typename detail::param_traits<
+            detail::cl_program_build_info, name>::param_type ResultType;
+
+        ResultType value;
+        const cl_int status = getBuildInfo(device, name, &value);
+        if (err) {
+            *err = status;
+        }
+        return value;
+    }
+
+    /*! \brief Creates every kernel in the program with
+     *         clCreateKernelsInProgram and copies them into \a kernels.
+     *
+     * The runtime is called twice: once to obtain the kernel count, then
+     * again to fill a stack buffer with the handles.
+     *
+     * \param kernels destination vector; dereferenced unconditionally, so it
+     *                must not be NULL. Its contents are replaced via assign().
+     * \return CL_SUCCESS, or the failing status passed through
+     *         detail::errHandler.
+     */
+    cl_int createKernels(VECTOR_CLASS<Kernel>* kernels)
+    {
+        cl_uint numKernels;
+        // First call: only ask how many kernels the program contains.
+        cl_int err = ::clCreateKernelsInProgram(object_, 0, NULL, &numKernels);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR);
+        }
+
+        // Raw stack memory is treated as an array of Kernel and handed to the
+        // runtime as an array of cl_kernel; no Kernel constructor or
+        // destructor runs on it.
+        // NOTE(review): assumes sizeof(Kernel) == sizeof(cl_kernel) - confirm
+        // against detail::Wrapper.
+        Kernel* value = (Kernel*) alloca(numKernels * sizeof(Kernel));
+        err = ::clCreateKernelsInProgram(
+            object_, numKernels, (cl_kernel*) value, NULL);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR);
+        }
+
+        // NOTE(review): assign() copies from the raw buffer. If copying a
+        // Kernel retains the handle, the references created by the runtime
+        // are never released because the buffer elements are never
+        // destroyed - verify against detail::Wrapper's copy semantics.
+        kernels->assign(&value[0], &value[numKernels]);
+        return CL_SUCCESS;
+    }
+};
+
+__GET_INFO_HELPER_WITH_RETAIN(cl::Program)
+
+/*! \brief Creates the kernel called \a name from \a program via
+ *         clCreateKernel.
+ *
+ * \param err if non-NULL, receives the status code written by clCreateKernel.
+ */
+inline Kernel::Kernel(const Program& program, const char* name, cl_int* err)
+{
+    cl_int status;
+    object_ = ::clCreateKernel(program(), name, &status);
+
+    detail::errHandler(status, __CREATE_KERNEL_ERR);
+    if (err) {
+        *err = status;
+    }
+}
+
+/*! \class CommandQueue
+ * \brief CommandQueue interface for cl_command_queue.
+ */
+class CommandQueue : public detail::Wrapper<cl_command_queue>
+{
+public:
+    /*! \brief Creates a command queue for \a device on \a context via
+     *         clCreateCommandQueue.
+     *
+     * \param properties forwarded unchanged to the runtime.
+     * \param err        if non-NULL, receives the resulting status code.
+     */
+    CommandQueue(
+        const Context& context,
+        const Device& device,
+        cl_command_queue_properties properties = 0,
+        cl_int* err = NULL)
+    {
+        cl_int status;
+        object_ = ::clCreateCommandQueue(
+            context(), device(), properties, &status);
+
+        detail::errHandler(status, __CREATE_COMMAND_QUEUE_ERR);
+        if (err) {
+            *err = status;
+        }
+    }
+
+    CommandQueue() { }
+
+    CommandQueue(const CommandQueue& commandQueue) : detail::Wrapper<cl_type>(commandQueue) { }
+
+    //! Assignment is delegated to the wrapper base class; self-assignment is a no-op.
+    CommandQueue& operator = (const CommandQueue& rhs)
+    {
+        if (this == &rhs) {
+            return *this;
+        }
+        detail::Wrapper<cl_type>::operator=(rhs);
+        return *this;
+    }
+
+    //! Queries \a name through clGetCommandQueueInfo, storing the result in \a param.
+    template <typename T>
+    cl_int getInfo(cl_command_queue_info name, T* param) const
+    {
+        const cl_int status =
+            detail::getInfo(&::clGetCommandQueueInfo, object_, name, param);
+        return detail::errHandler(status, __GET_COMMAND_QUEUE_INFO_ERR);
+    }
+
+    /*! \brief Queries the property selected by the template argument and
+     *         returns it by value.
+     *
+     * \param err if non-NULL, receives the status of the query.
+     */
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_command_queue_info, name>::param_type
+    getInfo(cl_int* err = NULL) const
+    {
+        typedef typename detail::param_traits<
+            detail::cl_command_queue_info, name>::param_type ResultType;
+
+        ResultType value;
+        const cl_int status = getInfo(name, &value);
+        if (err) {
+            *err = status;
+        }
+        return value;
+    }
+
+    /*! \brief Enqueues a buffer read into \a ptr via clEnqueueReadBuffer.
+     *
+     * \param events optional wait list; NULL or empty is passed as (0, NULL).
+     * \param event  optional; its address is handed to the runtime as cl_event*.
+     */
+    cl_int enqueueReadBuffer(
+        const Buffer& buffer,
+        cl_bool blocking,
+        ::size_t offset,
+        ::size_t size,
+        void* ptr,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        const cl_uint numWait = (events != NULL) ? (cl_uint) events->size() : 0;
+        cl_event* const waitList = (events != NULL && events->size() > 0)
+            ? (cl_event*) &events->front() : NULL;
+
+        const cl_int status = ::clEnqueueReadBuffer(
+            object_, buffer(), blocking, offset, size, ptr,
+            numWait, waitList, (cl_event*) event);
+        return detail::errHandler(status, __ENQUEUE_READ_BUFFER_ERR);
+    }
+
+    /*! \brief Enqueues a buffer write from \a ptr via clEnqueueWriteBuffer.
+     *
+     * \param events optional wait list; NULL or empty is passed as (0, NULL).
+     * \param event  optional; its address is handed to the runtime as cl_event*.
+     */
+    cl_int enqueueWriteBuffer(
+        const Buffer& buffer,
+        cl_bool blocking,
+        ::size_t offset,
+        ::size_t size,
+        const void* ptr,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        const cl_uint numWait = (events != NULL) ? (cl_uint) events->size() : 0;
+        cl_event* const waitList = (events != NULL && events->size() > 0)
+            ? (cl_event*) &events->front() : NULL;
+
+        const cl_int status = ::clEnqueueWriteBuffer(
+            object_, buffer(), blocking, offset, size, ptr,
+            numWait, waitList, (cl_event*) event);
+        return detail::errHandler(status, __ENQUEUE_WRITE_BUFFER_ERR);
+    }
+
+    /*! \brief Enqueues a buffer-to-buffer copy via clEnqueueCopyBuffer.
+     *
+     * \param events optional wait list; NULL or empty is passed as (0, NULL).
+     * \param event  optional; its address is handed to the runtime as cl_event*.
+     */
+    cl_int enqueueCopyBuffer(
+        const Buffer& src,
+        const Buffer& dst,
+        ::size_t src_offset,
+        ::size_t dst_offset,
+        ::size_t size,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        const cl_uint numWait = (events != NULL) ? (cl_uint) events->size() : 0;
+        cl_event* const waitList = (events != NULL && events->size() > 0)
+            ? (cl_event*) &events->front() : NULL;
+
+        const cl_int status = ::clEnqueueCopyBuffer(
+            object_, src(), dst(), src_offset, dst_offset, size,
+            numWait, waitList, (cl_event*) event);
+        return detail::errHandler(status, __ENQEUE_COPY_BUFFER_ERR);
+    }
+
+#if defined(CL_VERSION_1_1)
+    /*! \brief Enqueues a rectangular buffer read into \a ptr via
+     *         clEnqueueReadBufferRect.
+     *
+     * \param events optional wait list; NULL or empty is passed as (0, NULL).
+     * \param event  optional; its address is handed to the runtime as cl_event*.
+     */
+    cl_int enqueueReadBufferRect(
+        const Buffer& buffer,
+        cl_bool blocking,
+        const size_t<3>& buffer_offset,
+        const size_t<3>& host_offset,
+        const size_t<3>& region,
+        ::size_t buffer_row_pitch,
+        ::size_t buffer_slice_pitch,
+        ::size_t host_row_pitch,
+        ::size_t host_slice_pitch,
+        void *ptr,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        const cl_uint numWait = (events != NULL) ? (cl_uint) events->size() : 0;
+        cl_event* const waitList = (events != NULL && events->size() > 0)
+            ? (cl_event*) &events->front() : NULL;
+
+        const cl_int status = ::clEnqueueReadBufferRect(
+            object_, buffer(), blocking,
+            (const ::size_t *)buffer_offset,
+            (const ::size_t *)host_offset,
+            (const ::size_t *)region,
+            buffer_row_pitch, buffer_slice_pitch,
+            host_row_pitch, host_slice_pitch,
+            ptr,
+            numWait, waitList, (cl_event*) event);
+        return detail::errHandler(status, __ENQUEUE_READ_BUFFER_RECT_ERR);
+    }
+
+
+    /*! \brief Enqueues a rectangular buffer write from \a ptr via
+     *         clEnqueueWriteBufferRect.
+     *
+     * \param ptr    host data to write. It is taken as const void*, matching
+     *               enqueueWriteBuffer, so read-only host data can be passed
+     *               without a cast; callers passing void* are unaffected.
+     * \param events optional wait list; NULL or empty is passed as (0, NULL).
+     * \param event  optional; its address is handed to the runtime as cl_event*.
+     * \return the runtime status passed through detail::errHandler.
+     */
+    cl_int enqueueWriteBufferRect(
+        const Buffer& buffer,
+        cl_bool blocking,
+        const size_t<3>& buffer_offset,
+        const size_t<3>& host_offset,
+        const size_t<3>& region,
+        ::size_t buffer_row_pitch,
+        ::size_t buffer_slice_pitch,
+        ::size_t host_row_pitch,
+        ::size_t host_slice_pitch,
+        const void *ptr,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        return detail::errHandler(
+            ::clEnqueueWriteBufferRect(
+                object_, 
+                buffer(), 
+                blocking, 
+                (const ::size_t *)buffer_offset,
+                (const ::size_t *)host_offset,
+                (const ::size_t *)region,
+                buffer_row_pitch,
+                buffer_slice_pitch,
+                host_row_pitch,
+                host_slice_pitch,
+                ptr,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (cl_event*) event),
+                __ENQUEUE_WRITE_BUFFER_RECT_ERR);
+    }
+
+    /*! \brief Enqueues a rectangular buffer-to-buffer copy via
+     *         clEnqueueCopyBufferRect.
+     *
+     * \param events optional wait list; NULL or empty is passed as (0, NULL).
+     * \param event  optional; its address is handed to the runtime as cl_event*.
+     */
+    cl_int enqueueCopyBufferRect(
+        const Buffer& src,
+        const Buffer& dst,
+        const size_t<3>& src_origin,
+        const size_t<3>& dst_origin,
+        const size_t<3>& region,
+        ::size_t src_row_pitch,
+        ::size_t src_slice_pitch,
+        ::size_t dst_row_pitch,
+        ::size_t dst_slice_pitch,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        const cl_uint numWait = (events != NULL) ? (cl_uint) events->size() : 0;
+        cl_event* const waitList = (events != NULL && events->size() > 0)
+            ? (cl_event*) &events->front() : NULL;
+
+        const cl_int status = ::clEnqueueCopyBufferRect(
+            object_, src(), dst(),
+            (const ::size_t *)src_origin,
+            (const ::size_t *)dst_origin,
+            (const ::size_t *)region,
+            src_row_pitch, src_slice_pitch,
+            dst_row_pitch, dst_slice_pitch,
+            numWait, waitList, (cl_event*) event);
+        return detail::errHandler(status, __ENQEUE_COPY_BUFFER_RECT_ERR);
+    }
+#endif
+
+    /*! \brief Enqueues an image read into \a ptr via clEnqueueReadImage.
+     *
+     * \param events optional wait list; NULL or empty is passed as (0, NULL).
+     * \param event  optional; its address is handed to the runtime as cl_event*.
+     */
+    cl_int enqueueReadImage(
+        const Image& image,
+        cl_bool blocking,
+        const size_t<3>& origin,
+        const size_t<3>& region,
+        ::size_t row_pitch,
+        ::size_t slice_pitch,
+        void* ptr,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        const cl_uint numWait = (events != NULL) ? (cl_uint) events->size() : 0;
+        cl_event* const waitList = (events != NULL && events->size() > 0)
+            ? (cl_event*) &events->front() : NULL;
+
+        const cl_int status = ::clEnqueueReadImage(
+            object_, image(), blocking,
+            (const ::size_t *) origin, (const ::size_t *) region,
+            row_pitch, slice_pitch, ptr,
+            numWait, waitList, (cl_event*) event);
+        return detail::errHandler(status, __ENQUEUE_READ_IMAGE_ERR);
+    }
+
+    /*! \brief Enqueues an image write from \a ptr via clEnqueueWriteImage.
+     *
+     * \param ptr    host data to write. It is taken as const void*, matching
+     *               enqueueWriteBuffer, so read-only host data can be passed
+     *               without a cast; callers passing void* are unaffected.
+     * \param events optional wait list; NULL or empty is passed as (0, NULL).
+     * \param event  optional; its address is handed to the runtime as cl_event*.
+     * \return the runtime status passed through detail::errHandler.
+     */
+    cl_int enqueueWriteImage(
+        const Image& image,
+        cl_bool blocking,
+        const size_t<3>& origin,
+        const size_t<3>& region,
+        ::size_t row_pitch,
+        ::size_t slice_pitch,
+        const void* ptr,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        return detail::errHandler(
+            ::clEnqueueWriteImage(
+                object_, image(), blocking, (const ::size_t *) origin,
+                (const ::size_t *) region, row_pitch, slice_pitch, ptr,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (cl_event*) event),
+            __ENQUEUE_WRITE_IMAGE_ERR);
+    }
+
+    /*! \brief Enqueues an image-to-image copy via clEnqueueCopyImage.
+     *
+     * \param events optional wait list; NULL or empty is passed as (0, NULL).
+     * \param event  optional; its address is handed to the runtime as cl_event*.
+     */
+    cl_int enqueueCopyImage(
+        const Image& src,
+        const Image& dst,
+        const size_t<3>& src_origin,
+        const size_t<3>& dst_origin,
+        const size_t<3>& region,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        const cl_uint numWait = (events != NULL) ? (cl_uint) events->size() : 0;
+        cl_event* const waitList = (events != NULL && events->size() > 0)
+            ? (cl_event*) &events->front() : NULL;
+
+        const cl_int status = ::clEnqueueCopyImage(
+            object_, src(), dst(),
+            (const ::size_t *) src_origin,
+            (const ::size_t *)dst_origin,
+            (const ::size_t *) region,
+            numWait, waitList, (cl_event*) event);
+        return detail::errHandler(status, __ENQUEUE_COPY_IMAGE_ERR);
+    }
+
+    /*! \brief Enqueues an image-to-buffer copy via clEnqueueCopyImageToBuffer.
+     *
+     * \param events optional wait list; NULL or empty is passed as (0, NULL).
+     * \param event  optional; its address is handed to the runtime as cl_event*.
+     */
+    cl_int enqueueCopyImageToBuffer(
+        const Image& src,
+        const Buffer& dst,
+        const size_t<3>& src_origin,
+        const size_t<3>& region,
+        ::size_t dst_offset,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        const cl_uint numWait = (events != NULL) ? (cl_uint) events->size() : 0;
+        cl_event* const waitList = (events != NULL && events->size() > 0)
+            ? (cl_event*) &events->front() : NULL;
+
+        const cl_int status = ::clEnqueueCopyImageToBuffer(
+            object_, src(), dst(),
+            (const ::size_t *) src_origin, (const ::size_t *) region,
+            dst_offset,
+            numWait, waitList, (cl_event*) event);
+        return detail::errHandler(status, __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR);
+    }
+
+    /*! \brief Enqueues a buffer-to-image copy via clEnqueueCopyBufferToImage.
+     *
+     * \param events optional wait list; NULL or empty is passed as (0, NULL).
+     * \param event  optional; its address is handed to the runtime as cl_event*.
+     */
+    cl_int enqueueCopyBufferToImage(
+        const Buffer& src,
+        const Image& dst,
+        ::size_t src_offset,
+        const size_t<3>& dst_origin,
+        const size_t<3>& region,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        const cl_uint numWait = (events != NULL) ? (cl_uint) events->size() : 0;
+        cl_event* const waitList = (events != NULL && events->size() > 0)
+            ? (cl_event*) &events->front() : NULL;
+
+        const cl_int status = ::clEnqueueCopyBufferToImage(
+            object_, src(), dst(), src_offset,
+            (const ::size_t *) dst_origin, (const ::size_t *) region,
+            numWait, waitList, (cl_event*) event);
+        return detail::errHandler(status, __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR);
+    }
+
+    /*! \brief Enqueues a buffer mapping via clEnqueueMapBuffer and returns
+     *         the pointer the runtime hands back.
+     *
+     * \param events optional wait list; NULL or empty is passed as (0, NULL).
+     * \param event  optional; its address is handed to the runtime as cl_event*.
+     * \param err    if non-NULL, receives the resulting status code.
+     */
+    void* enqueueMapBuffer(
+        const Buffer& buffer,
+        cl_bool blocking,
+        cl_map_flags flags,
+        ::size_t offset,
+        ::size_t size,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL,
+        cl_int* err = NULL) const
+    {
+        const cl_uint numWait = (events != NULL) ? (cl_uint) events->size() : 0;
+        cl_event* const waitList = (events != NULL && events->size() > 0)
+            ? (cl_event*) &events->front() : NULL;
+
+        cl_int status;
+        void* const mapped = ::clEnqueueMapBuffer(
+            object_, buffer(), blocking, flags, offset, size,
+            numWait, waitList, (cl_event*) event,
+            &status);
+
+        detail::errHandler(status, __ENQUEUE_MAP_BUFFER_ERR);
+        if (err) {
+            *err = status;
+        }
+        return mapped;
+    }
+
+    /*! \brief Enqueues an image mapping via clEnqueueMapImage and returns
+     *         the pointer the runtime hands back.
+     *
+     * \param row_pitch   forwarded to the runtime as an output location.
+     * \param slice_pitch forwarded to the runtime as an output location.
+     * \param events      optional wait list; NULL or empty is passed as (0, NULL).
+     * \param event       optional; its address is handed to the runtime as cl_event*.
+     * \param err         if non-NULL, receives the resulting status code.
+     */
+    void* enqueueMapImage(
+        const Image& buffer,
+        cl_bool blocking,
+        cl_map_flags flags,
+        const size_t<3>& origin,
+        const size_t<3>& region,
+        ::size_t * row_pitch,
+        ::size_t * slice_pitch,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL,
+        cl_int* err = NULL) const
+    {
+        const cl_uint numWait = (events != NULL) ? (cl_uint) events->size() : 0;
+        cl_event* const waitList = (events != NULL && events->size() > 0)
+            ? (cl_event*) &events->front() : NULL;
+
+        cl_int status;
+        void* const mapped = ::clEnqueueMapImage(
+            object_, buffer(), blocking, flags,
+            (const ::size_t *) origin, (const ::size_t *) region,
+            row_pitch, slice_pitch,
+            numWait, waitList, (cl_event*) event,
+            &status);
+
+        detail::errHandler(status, __ENQUEUE_MAP_IMAGE_ERR);
+        if (err) {
+            *err = status;
+        }
+        return mapped;
+    }
+
+    /*! \brief Enqueues the unmapping of \a mapped_ptr via
+     *         clEnqueueUnmapMemObject.
+     *
+     * \param events optional wait list; NULL or empty is passed as (0, NULL).
+     * \param event  optional; its address is handed to the runtime as cl_event*.
+     */
+    cl_int enqueueUnmapMemObject(
+        const Memory& memory,
+        void* mapped_ptr,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        const cl_uint numWait = (events != NULL) ? (cl_uint) events->size() : 0;
+        cl_event* const waitList = (events != NULL && events->size() > 0)
+            ? (cl_event*) &events->front() : NULL;
+
+        const cl_int status = ::clEnqueueUnmapMemObject(
+            object_, memory(), mapped_ptr,
+            numWait, waitList, (cl_event*) event);
+        return detail::errHandler(status, __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
+    }
+
+    /*! \brief Enqueues \a kernel over an index space via
+     *         clEnqueueNDRangeKernel.
+     *
+     * The work dimension is taken from \a global. \a offset and \a local are
+     * passed as NULL when they report zero dimensions.
+     *
+     * \param events optional wait list; NULL or empty is passed as (0, NULL).
+     * \param event  optional; its address is handed to the runtime as cl_event*.
+     */
+    cl_int enqueueNDRangeKernel(
+        const Kernel& kernel,
+        const NDRange& offset,
+        const NDRange& global,
+        const NDRange& local,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        const ::size_t* const offsetSizes =
+            offset.dimensions() != 0 ? (const ::size_t*) offset : NULL;
+        const ::size_t* const localSizes =
+            local.dimensions() != 0 ? (const ::size_t*) local : NULL;
+        const cl_uint numWait = (events != NULL) ? (cl_uint) events->size() : 0;
+        cl_event* const waitList = (events != NULL && events->size() > 0)
+            ? (cl_event*) &events->front() : NULL;
+
+        const cl_int status = ::clEnqueueNDRangeKernel(
+            object_, kernel(), (cl_uint) global.dimensions(),
+            offsetSizes,
+            (const ::size_t*) global,
+            localSizes,
+            numWait, waitList, (cl_event*) event);
+        return detail::errHandler(status, __ENQUEUE_NDRANGE_KERNEL_ERR);
+    }
+
+    /*! \brief Enqueues \a kernel via clEnqueueTask.
+     *
+     * \param events optional wait list; NULL or empty is passed as (0, NULL).
+     * \param event  optional; its address is handed to the runtime as cl_event*.
+     */
+    cl_int enqueueTask(
+        const Kernel& kernel,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        const cl_uint numWait = (events != NULL) ? (cl_uint) events->size() : 0;
+        cl_event* const waitList = (events != NULL && events->size() > 0)
+            ? (cl_event*) &events->front() : NULL;
+
+        const cl_int status = ::clEnqueueTask(
+            object_, kernel(), numWait, waitList, (cl_event*) event);
+        return detail::errHandler(status, __ENQUEUE_TASK_ERR);
+    }
+
+    /*! \brief Enqueues a host function via clEnqueueNativeKernel.
+     *
+     * \param userFptr    host function to run.
+     * \param args        (pointer, size) of the argument block.
+     * \param mem_objects optional memory objects; their handles are copied
+     *                    into a stack array for the runtime call.
+     * \param mem_locs    optional locations matching \a mem_objects; a NULL
+     *                    or empty vector is passed as NULL instead of taking
+     *                    the address of a non-existent first element.
+     * \param events      optional wait list; NULL or empty is passed as (0, NULL).
+     * \param event       optional; its address is handed to the runtime as cl_event*.
+     * \return the runtime status passed through detail::errHandler.
+     */
+    cl_int enqueueNativeKernel(
+        void (*userFptr)(void *),
+        std::pair<void*, ::size_t> args,
+        const VECTOR_CLASS<Memory>* mem_objects = NULL,
+        const VECTOR_CLASS<const void*>* mem_locs = NULL,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_mem * mems = (mem_objects != NULL && mem_objects->size() > 0) 
+            ? (cl_mem*) alloca(mem_objects->size() * sizeof(cl_mem))
+            : NULL;
+
+        if (mems != NULL) {
+            for (unsigned int i = 0; i < mem_objects->size(); i++) {
+                mems[i] = ((*mem_objects)[i])();
+            }
+        }
+
+        // front() is only valid on a non-empty vector.
+        const void** locs = (mem_locs != NULL && mem_locs->size() > 0)
+            ? (const void **) &mem_locs->front()
+            : NULL;
+
+        return detail::errHandler(
+            ::clEnqueueNativeKernel(
+                object_, userFptr, args.first, args.second,
+                (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
+                mems,
+                locs,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (cl_event*) event),
+            __ENQUEUE_NATIVE_KERNEL);
+    }
+
+    //! Enqueues a marker via clEnqueueMarker; the address of \a event (may be
+    //! NULL) is handed to the runtime as cl_event*.
+    cl_int enqueueMarker(Event* event = NULL) const
+    {
+        const cl_int status = ::clEnqueueMarker(object_, (cl_event*) event);
+        return detail::errHandler(status, __ENQUEUE_MARKER_ERR);
+    }
+
+    /*! \brief Enqueues a wait on \a events via clEnqueueWaitForEvents.
+     *
+     * An empty vector is passed to the runtime as (0, NULL) rather than
+     * taking the address of a non-existent first element; the runtime then
+     * decides which status to report.
+     *
+     * \return the runtime status passed through detail::errHandler.
+     */
+    cl_int enqueueWaitForEvents(const VECTOR_CLASS<Event>& events) const
+    {
+        return detail::errHandler(
+            ::clEnqueueWaitForEvents(
+                object_,
+                (cl_uint) events.size(),
+                (events.size() > 0) ? (const cl_event*) &events.front() : NULL),
+            __ENQUEUE_WAIT_FOR_EVENTS_ERR);
+    }
+
+    /*! \brief Enqueues the acquisition of GL-shared memory objects via
+     *         clEnqueueAcquireGLObjects.
+     *
+     * \param mem_objects optional objects to acquire; NULL or empty is passed
+     *                    as (0, NULL) instead of taking the address of a
+     *                    non-existent first element.
+     * \param events      optional wait list; NULL or empty is passed as (0, NULL).
+     * \param event       optional; its address is handed to the runtime as cl_event*.
+     * \return the runtime status passed through detail::errHandler.
+     */
+    cl_int enqueueAcquireGLObjects(
+         const VECTOR_CLASS<Memory>* mem_objects = NULL,
+         const VECTOR_CLASS<Event>* events = NULL,
+         Event* event = NULL) const
+     {
+         return detail::errHandler(
+             ::clEnqueueAcquireGLObjects(
+                 object_,
+                 (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
+                 (mem_objects != NULL && mem_objects->size() > 0) ? (const cl_mem *) &mem_objects->front(): NULL,
+                 (events != NULL) ? (cl_uint) events->size() : 0,
+                 (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                 (cl_event*) event),
+             __ENQUEUE_ACQUIRE_GL_ERR);
+     }
+
+    /*! \brief Enqueues the release of GL-shared memory objects via
+     *         clEnqueueReleaseGLObjects.
+     *
+     * \param mem_objects optional objects to release; NULL or empty is passed
+     *                    as (0, NULL) instead of taking the address of a
+     *                    non-existent first element.
+     * \param events      optional wait list; NULL or empty is passed as (0, NULL).
+     * \param event       optional; its address is handed to the runtime as cl_event*.
+     * \return the runtime status passed through detail::errHandler.
+     */
+    cl_int enqueueReleaseGLObjects(
+         const VECTOR_CLASS<Memory>* mem_objects = NULL,
+         const VECTOR_CLASS<Event>* events = NULL,
+         Event* event = NULL) const
+     {
+         return detail::errHandler(
+             ::clEnqueueReleaseGLObjects(
+                 object_,
+                 (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
+                 (mem_objects != NULL && mem_objects->size() > 0) ? (const cl_mem *) &mem_objects->front(): NULL,
+                 (events != NULL) ? (cl_uint) events->size() : 0,
+                 (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                 (cl_event*) event),
+             __ENQUEUE_RELEASE_GL_ERR);
+     }
+
+#if defined (USE_DX_INTEROP)
+typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueAcquireD3D10ObjectsKHR)(
+    cl_command_queue command_queue, cl_uint num_objects,
+    const cl_mem* mem_objects, cl_uint num_events_in_wait_list,
+    const cl_event* event_wait_list, cl_event* event);
+typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueReleaseD3D10ObjectsKHR)(
+    cl_command_queue command_queue, cl_uint num_objects,
+    const cl_mem* mem_objects,  cl_uint num_events_in_wait_list,
+    const cl_event* event_wait_list, cl_event* event);
+
+    /*! \brief Enqueues the acquisition of D3D10-shared memory objects through
+     *         the clEnqueueAcquireD3D10ObjectsKHR extension entry point.
+     *
+     * The function pointer is kept in a function-local static and set up by
+     * __INIT_CL_EXT_FCN_PTR.
+     *
+     * \param mem_objects optional objects to acquire; NULL or empty is passed
+     *                    as (0, NULL).
+     * \param events      optional wait list; NULL or empty is passed as
+     *                    (0, NULL), as in the other enqueue methods.
+     * \param event       optional; its address is handed to the runtime as cl_event*.
+     * \return the runtime status passed through detail::errHandler.
+     */
+    cl_int enqueueAcquireD3D10Objects(
+         const VECTOR_CLASS<Memory>* mem_objects = NULL,
+         const VECTOR_CLASS<Event>* events = NULL,
+         Event* event = NULL) const
+     {
+         static PFN_clEnqueueAcquireD3D10ObjectsKHR pfn_clEnqueueAcquireD3D10ObjectsKHR = NULL;
+         __INIT_CL_EXT_FCN_PTR(clEnqueueAcquireD3D10ObjectsKHR);
+
+         // NOTE(review): the GL error macro is reused for the D3D10 path;
+         // no D3D10-specific macro is visible in this section.
+         return detail::errHandler(
+             pfn_clEnqueueAcquireD3D10ObjectsKHR(
+                 object_,
+                 (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
+                 (mem_objects != NULL && mem_objects->size() > 0) ? (const cl_mem *) &mem_objects->front(): NULL,
+                 (events != NULL) ? (cl_uint) events->size() : 0,
+                 (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                 (cl_event*) event),
+             __ENQUEUE_ACQUIRE_GL_ERR);
+     }
+
+    /*! \brief Enqueues the release of D3D10-shared memory objects through
+     *         the clEnqueueReleaseD3D10ObjectsKHR extension entry point.
+     *
+     * The function pointer is kept in a function-local static and set up by
+     * __INIT_CL_EXT_FCN_PTR.
+     *
+     * \param mem_objects optional objects to release; NULL or empty is passed
+     *                    as (0, NULL).
+     * \param events      optional wait list; NULL or empty is passed as
+     *                    (0, NULL), as in the other enqueue methods.
+     * \param event       optional; its address is handed to the runtime as cl_event*.
+     * \return the runtime status passed through detail::errHandler.
+     */
+    cl_int enqueueReleaseD3D10Objects(
+         const VECTOR_CLASS<Memory>* mem_objects = NULL,
+         const VECTOR_CLASS<Event>* events = NULL,
+         Event* event = NULL) const
+    {
+        static PFN_clEnqueueReleaseD3D10ObjectsKHR pfn_clEnqueueReleaseD3D10ObjectsKHR = NULL;
+        __INIT_CL_EXT_FCN_PTR(clEnqueueReleaseD3D10ObjectsKHR);
+
+        // NOTE(review): the GL error macro is reused for the D3D10 path;
+        // no D3D10-specific macro is visible in this section.
+        return detail::errHandler(
+            pfn_clEnqueueReleaseD3D10ObjectsKHR(
+                object_,
+                (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
+                (mem_objects != NULL && mem_objects->size() > 0) ? (const cl_mem *) &mem_objects->front(): NULL,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (cl_event*) event),
+            __ENQUEUE_RELEASE_GL_ERR);
+    }
+#endif
+
+    //! Enqueues a barrier via clEnqueueBarrier.
+    cl_int enqueueBarrier() const
+    {
+        const cl_int status = ::clEnqueueBarrier(object_);
+        return detail::errHandler(status, __ENQUEUE_BARRIER_ERR);
+    }
+
+    //! Calls clFlush on the wrapped queue.
+    cl_int flush() const
+    {
+        const cl_int status = ::clFlush(object_);
+        return detail::errHandler(status, __FLUSH_ERR);
+    }
+
+    //! Calls clFinish on the wrapped queue.
+    cl_int finish() const
+    {
+        const cl_int status = ::clFinish(object_);
+        return detail::errHandler(status, __FINISH_ERR);
+    }
+};
+
+__GET_INFO_HELPER_WITH_RETAIN(cl::CommandQueue)
+
+/*! \class KernelFunctor
+ * \brief Kernel functor interface
+ *
+ * \note Currently only functors of zero to ten arguments are supported. It
+ * is straightforward to add more and a more general solution, similar to
+ * Boost.Lambda could be followed if required in the future.
+ */
+class KernelFunctor
+{
+private:
+    Kernel kernel_;       // kernel launched by operator()
+    CommandQueue queue_;  // queue the launch is enqueued on
+    NDRange offset_;      // offset range handed to enqueueNDRangeKernel
+    NDRange global_;      // global range handed to enqueueNDRangeKernel
+    NDRange local_;       // local range handed to enqueueNDRangeKernel
+
+    cl_int err_;          // status returned by the most recent enqueue
+public:
+    KernelFunctor() : err_(CL_SUCCESS) { }  // keeps getError() well defined before any launch
+    // Binds a kernel to a queue and its launch ranges; nothing is enqueued yet, so err_ is CL_SUCCESS.
+    KernelFunctor(
+        const Kernel& kernel,
+        const CommandQueue& queue,
+        const NDRange& offset,
+        const NDRange& global,
+        const NDRange& local) :
+            kernel_(kernel),
+            queue_(queue),
+            offset_(offset),
+            global_(global),
+            local_(local),
+            err_(CL_SUCCESS)
+    {}
+
+    KernelFunctor& operator=(const KernelFunctor& rhs);
+
+    KernelFunctor(const KernelFunctor& rhs);
+    // Status stored by the last operator() call (CL_SUCCESS if none has run yet).
+    cl_int getError() { return err_; }
+    // Call operators: set a1..aN as kernel arguments 0..N-1, enqueue the kernel, return its Event.
+    inline Event operator()(const VECTOR_CLASS<Event>* events = NULL);
+
+    template<typename A1>
+    inline Event operator()(
+        const A1& a1,
+        const VECTOR_CLASS<Event>* events = NULL);
+
+    template<class A1, class A2>
+    inline Event operator()(
+        const A1& a1,
+        const A2& a2,
+        const VECTOR_CLASS<Event>* events = NULL);
+
+    template<class A1, class A2, class A3>
+    inline Event operator()(
+        const A1& a1,
+        const A2& a2,
+        const A3& a3,
+        const VECTOR_CLASS<Event>* events = NULL);
+
+    template<class A1, class A2, class A3, class A4>
+    inline Event operator()(
+        const A1& a1,
+        const A2& a2,
+        const A3& a3,
+        const A4& a4,
+        const VECTOR_CLASS<Event>* events = NULL);
+
+    template<class A1, class A2, class A3, class A4, class A5>
+    inline Event operator()(
+        const A1& a1,
+        const A2& a2,
+        const A3& a3,
+        const A4& a4,
+        const A5& a5,
+        const VECTOR_CLASS<Event>* events = NULL);
+
+    template<class A1, class A2, class A3, class A4, class A5, class A6>
+    inline Event operator()(
+        const A1& a1,
+        const A2& a2,
+        const A3& a3,
+        const A4& a4,
+        const A5& a5,
+        const A6& a6,
+        const VECTOR_CLASS<Event>* events = NULL);
+
+    template<class A1, class A2, class A3, class A4,
+             class A5, class A6, class A7>
+    inline Event operator()(
+        const A1& a1,
+        const A2& a2,
+        const A3& a3,
+        const A4& a4,
+        const A5& a5,
+        const A6& a6,
+        const A7& a7,
+        const VECTOR_CLASS<Event>* events = NULL);
+
+    template<class A1, class A2, class A3, class A4, class A5,
+             class A6, class A7, class A8>
+    inline Event operator()(
+        const A1& a1,
+        const A2& a2,
+        const A3& a3,
+        const A4& a4,
+        const A5& a5,
+        const A6& a6,
+        const A7& a7,
+        const A8& a8,
+        const VECTOR_CLASS<Event>* events = NULL);
+
+    template<class A1, class A2, class A3, class A4, class A5,
+             class A6, class A7, class A8, class A9>
+    inline Event operator()(
+        const A1& a1,
+        const A2& a2,
+        const A3& a3,
+        const A4& a4,
+        const A5& a5,
+        const A6& a6,
+        const A7& a7,
+        const A8& a8,
+        const A9& a9,
+        const VECTOR_CLASS<Event>* events = NULL);
+
+    template<class A1, class A2, class A3, class A4, class A5,
+             class A6, class A7, class A8, class A9, class A10>
+    inline Event operator()(
+        const A1& a1,
+        const A2& a2,
+        const A3& a3,
+        const A4& a4,
+        const A5& a5,
+        const A6& a6,
+        const A7& a7,
+        const A8& a8,
+        const A9& a9,
+        const A10& a10,
+        const VECTOR_CLASS<Event>* events = NULL);
+
+    template<class A1, class A2, class A3, class A4, class A5,
+             class A6, class A7, class A8, class A9, class A10,
+             class A11>
+    inline Event operator()(
+        const A1& a1,
+        const A2& a2,
+        const A3& a3,
+        const A4& a4,
+        const A5& a5,
+        const A6& a6,
+        const A7& a7,
+        const A8& a8,
+        const A9& a9,
+        const A10& a10,
+        const A11& a11,
+        const VECTOR_CLASS<Event>* events = NULL);
+
+    template<class A1, class A2, class A3, class A4, class A5,
+             class A6, class A7, class A8, class A9, class A10,
+             class A11, class A12>
+    inline Event operator()(
+        const A1& a1,
+        const A2& a2,
+        const A3& a3,
+        const A4& a4,
+        const A5& a5,
+        const A6& a6,
+        const A7& a7,
+        const A8& a8,
+        const A9& a9,
+        const A10& a10,
+        const A11& a11,
+        const A12& a12,
+        const VECTOR_CLASS<Event>* events = NULL);
+
+    template<class A1, class A2, class A3, class A4, class A5,
+             class A6, class A7, class A8, class A9, class A10,
+             class A11, class A12, class A13>
+    inline Event operator()(
+        const A1& a1,
+        const A2& a2,
+        const A3& a3,
+        const A4& a4,
+        const A5& a5,
+        const A6& a6,
+        const A7& a7,
+        const A8& a8,
+        const A9& a9,
+        const A10& a10,
+        const A11& a11,
+        const A12& a12,
+        const A13& a13,
+        const VECTOR_CLASS<Event>* events = NULL);
+
+    template<class A1, class A2, class A3, class A4, class A5,
+             class A6, class A7, class A8, class A9, class A10,
+             class A11, class A12, class A13, class A14>
+    inline Event operator()(
+        const A1& a1,
+        const A2& a2,
+        const A3& a3,
+        const A4& a4,
+        const A5& a5,
+        const A6& a6,
+        const A7& a7,
+        const A8& a8,
+        const A9& a9,
+        const A10& a10,
+        const A11& a11,
+        const A12& a12,
+        const A13& a13,
+        const A14& a14,
+        const VECTOR_CLASS<Event>* events = NULL);
+
+    template<class A1, class A2, class A3, class A4, class A5,
+             class A6, class A7, class A8, class A9, class A10,
+             class A11, class A12, class A13, class A14, class A15>
+    inline Event operator()(
+        const A1& a1,
+        const A2& a2,
+        const A3& a3,
+        const A4& a4,
+        const A5& a5,
+        const A6& a6,
+        const A7& a7,
+        const A8& a8,
+        const A9& a9,
+        const A10& a10,
+        const A11& a11,
+        const A12& a12,
+        const A13& a13,
+        const A14& a14,
+        const A15& a15,
+        const VECTOR_CLASS<Event>* events = NULL);
+};
+
+inline KernelFunctor Kernel::bind(
+    const CommandQueue& queue,
+    const NDRange& offset,
+    const NDRange& global,
+    const NDRange& local)
+{
+    return KernelFunctor(*this,queue,offset,global,local);  // functor wrapping this kernel
+}
+
+inline KernelFunctor Kernel::bind(
+    const CommandQueue& queue,
+    const NDRange& global,
+    const NDRange& local)
+{
+    return KernelFunctor(*this,queue,NullRange,global,local);  // NullRange is passed as the offset
+}
+
+inline KernelFunctor& KernelFunctor::operator=(const KernelFunctor& rhs)
+{
+    if (this == &rhs) {
+        return *this;
+    }
+    // Copy every member, stored status included, so both functors report the same getError().
+    kernel_ = rhs.kernel_;
+    queue_  = rhs.queue_;
+    offset_ = rhs.offset_;
+    global_ = rhs.global_;
+    local_  = rhs.local_;
+    err_    = rhs.err_;
+    return *this;
+}
+
+inline KernelFunctor::KernelFunctor(const KernelFunctor& rhs) :
+    kernel_(rhs.kernel_),
+    queue_(rhs.queue_),
+    offset_(rhs.offset_),
+    global_(rhs.global_),
+    local_(rhs.local_),
+    err_(rhs.err_)  // carry the stored status over so the copy's getError() is defined
+{}
+
+Event KernelFunctor::operator()(const VECTOR_CLASS<Event>* events)
+{
+    Event event;
+    // Launch the bound kernel; the enqueue status is kept in err_ for getError().
+    err_ = queue_.enqueueNDRangeKernel(
+        kernel_,
+        offset_,
+        global_,
+        local_,
+        events,  // forward the caller's wait list instead of discarding it
+        &event);
+
+    return event;
+}
+
+template<typename A1>
+Event KernelFunctor::operator()(
+    const A1& a1,
+    const VECTOR_CLASS<Event>* events)
+{
+    Event event;
+    // Set a1 as kernel argument 0; the setArg result is not recorded.
+    kernel_.setArg(0,a1);
+    // Launch; only the enqueue status is kept in err_ for getError().
+    err_ = queue_.enqueueNDRangeKernel(
+        kernel_,
+        offset_,
+        global_,
+        local_,
+        events,  // forward the caller's wait list instead of discarding it
+        &event);
+
+    return event;
+}
+
+template<typename A1, typename A2>
+Event KernelFunctor::operator()(
+    const A1& a1,
+    const A2& a2,
+    const VECTOR_CLASS<Event>* events)
+{
+    Event event;
+    // Set a1..a2 as kernel arguments 0..1; setArg results are not recorded.
+    kernel_.setArg(0,a1);
+    kernel_.setArg(1,a2);
+    // Launch; only the enqueue status is kept in err_ for getError().
+    err_ = queue_.enqueueNDRangeKernel(
+        kernel_,
+        offset_,
+        global_,
+        local_,
+        events,  // forward the caller's wait list instead of discarding it
+        &event);
+
+    return event;
+}
+
+template<typename A1, typename A2, typename A3>
+Event KernelFunctor::operator()(
+    const A1& a1,
+    const A2& a2,
+    const A3& a3,
+    const VECTOR_CLASS<Event>* events)
+{
+    Event event;
+    // Set a1..a3 as kernel arguments 0..2; setArg results are not recorded.
+    kernel_.setArg(0,a1);
+    kernel_.setArg(1,a2);
+    kernel_.setArg(2,a3);
+    // Launch; only the enqueue status is kept in err_ for getError().
+    err_ = queue_.enqueueNDRangeKernel(
+        kernel_,
+        offset_,
+        global_,
+        local_,
+        events,  // forward the caller's wait list instead of discarding it
+        &event);
+
+    return event;
+}
+
+template<typename A1, typename A2, typename A3, typename A4>
+Event KernelFunctor::operator()(
+    const A1& a1,
+    const A2& a2,
+    const A3& a3,
+    const A4& a4,
+    const VECTOR_CLASS<Event>* events)
+{
+    Event event;
+    // Set a1..a4 as kernel arguments 0..3; setArg results are not recorded.
+    kernel_.setArg(0,a1);
+    kernel_.setArg(1,a2);
+    kernel_.setArg(2,a3);
+    kernel_.setArg(3,a4);
+    // Launch; only the enqueue status is kept in err_ for getError().
+    err_ = queue_.enqueueNDRangeKernel(
+        kernel_,
+        offset_,
+        global_,
+        local_,
+        events,  // forward the caller's wait list instead of discarding it
+        &event);
+
+    return event;
+}
+
+template<typename A1, typename A2, typename A3, typename A4, typename A5>
+Event KernelFunctor::operator()(
+    const A1& a1,
+    const A2& a2,
+    const A3& a3,
+    const A4& a4,
+    const A5& a5,
+    const VECTOR_CLASS<Event>* events)
+{
+    Event event;
+    // Set a1..a5 as kernel arguments 0..4; setArg results are not recorded.
+    kernel_.setArg(0,a1);
+    kernel_.setArg(1,a2);
+    kernel_.setArg(2,a3);
+    kernel_.setArg(3,a4);
+    kernel_.setArg(4,a5);
+    // Launch; only the enqueue status is kept in err_ for getError().
+    err_ = queue_.enqueueNDRangeKernel(
+        kernel_,
+        offset_,
+        global_,
+        local_,
+        events,  // forward the caller's wait list instead of discarding it
+        &event);
+
+    return event;
+}
+
+template<typename A1, typename A2, typename A3, typename A4, typename A5,
+         typename A6>
+Event KernelFunctor::operator()(
+    const A1& a1,
+    const A2& a2,
+    const A3& a3,
+    const A4& a4,
+    const A5& a5,
+    const A6& a6,
+    const VECTOR_CLASS<Event>* events)
+{
+    Event event;
+    // Set a1..a6 as kernel arguments 0..5; setArg results are not recorded.
+    kernel_.setArg(0,a1);
+    kernel_.setArg(1,a2);
+    kernel_.setArg(2,a3);
+    kernel_.setArg(3,a4);
+    kernel_.setArg(4,a5);
+    kernel_.setArg(5,a6);
+    // Launch; only the enqueue status is kept in err_ for getError().
+    err_ = queue_.enqueueNDRangeKernel(
+        kernel_,
+        offset_,
+        global_,
+        local_,
+        events,  // forward the caller's wait list instead of discarding it
+        &event);
+
+    return event;
+}
+
+template<typename A1, typename A2, typename A3, typename A4,
+         typename A5, typename A6, typename A7>
+Event KernelFunctor::operator()(
+    const A1& a1,
+    const A2& a2,
+    const A3& a3,
+    const A4& a4,
+    const A5& a5,
+    const A6& a6,
+    const A7& a7,
+    const VECTOR_CLASS<Event>* events)
+{
+    Event event;
+    // Set a1..a7 as kernel arguments 0..6; setArg results are not recorded.
+    kernel_.setArg(0,a1);
+    kernel_.setArg(1,a2);
+    kernel_.setArg(2,a3);
+    kernel_.setArg(3,a4);
+    kernel_.setArg(4,a5);
+    kernel_.setArg(5,a6);
+    kernel_.setArg(6,a7);
+    // Launch; only the enqueue status is kept in err_ for getError().
+    err_ = queue_.enqueueNDRangeKernel(
+        kernel_,
+        offset_,
+        global_,
+        local_,
+        events,  // forward the caller's wait list instead of discarding it
+        &event);
+
+    return event;
+}
+
+template<typename A1, typename A2, typename A3, typename A4, typename A5,
+         typename A6, typename A7, typename A8>
+Event KernelFunctor::operator()(
+    const A1& a1,
+    const A2& a2,
+    const A3& a3,
+    const A4& a4,
+    const A5& a5,
+    const A6& a6,
+    const A7& a7,
+    const A8& a8,
+    const VECTOR_CLASS<Event>* events)
+{
+    Event event;
+    // Set a1..a8 as kernel arguments 0..7; setArg results are not recorded.
+    kernel_.setArg(0,a1);
+    kernel_.setArg(1,a2);
+    kernel_.setArg(2,a3);
+    kernel_.setArg(3,a4);
+    kernel_.setArg(4,a5);
+    kernel_.setArg(5,a6);
+    kernel_.setArg(6,a7);
+    kernel_.setArg(7,a8);
+    // Launch; only the enqueue status is kept in err_ for getError().
+    err_ = queue_.enqueueNDRangeKernel(
+        kernel_,
+        offset_,
+        global_,
+        local_,
+        events,  // forward the caller's wait list instead of discarding it
+        &event);
+
+    return event;
+}
+
+template<typename A1, typename A2, typename A3, typename A4, typename A5,
+         typename A6, typename A7, typename A8, typename A9>
+Event KernelFunctor::operator()(
+    const A1& a1,
+    const A2& a2,
+    const A3& a3,
+    const A4& a4,
+    const A5& a5,
+    const A6& a6,
+    const A7& a7,
+    const A8& a8,
+    const A9& a9,
+    const VECTOR_CLASS<Event>* events)
+{
+    Event event;
+    // Set a1..a9 as kernel arguments 0..8; setArg results are not recorded.
+    kernel_.setArg(0,a1);
+    kernel_.setArg(1,a2);
+    kernel_.setArg(2,a3);
+    kernel_.setArg(3,a4);
+    kernel_.setArg(4,a5);
+    kernel_.setArg(5,a6);
+    kernel_.setArg(6,a7);
+    kernel_.setArg(7,a8);
+    kernel_.setArg(8,a9);
+    // Launch; only the enqueue status is kept in err_ for getError().
+    err_ = queue_.enqueueNDRangeKernel(
+        kernel_,
+        offset_,
+        global_,
+        local_,
+        events,  // forward the caller's wait list instead of discarding it
+        &event);
+
+    return event;
+}
+
+template<typename A1, typename A2, typename A3, typename A4, typename A5,
+         typename A6, typename A7, typename A8, typename A9, typename A10>
+Event KernelFunctor::operator()(
+    const A1& a1,
+    const A2& a2,
+    const A3& a3,
+    const A4& a4,
+    const A5& a5,
+    const A6& a6,
+    const A7& a7,
+    const A8& a8,
+    const A9& a9,
+    const A10& a10,
+    const VECTOR_CLASS<Event>* events)
+{
+    Event event;
+    // Set a1..a10 as kernel arguments 0..9; setArg results are not recorded.
+    kernel_.setArg(0,a1);
+    kernel_.setArg(1,a2);
+    kernel_.setArg(2,a3);
+    kernel_.setArg(3,a4);
+    kernel_.setArg(4,a5);
+    kernel_.setArg(5,a6);
+    kernel_.setArg(6,a7);
+    kernel_.setArg(7,a8);
+    kernel_.setArg(8,a9);
+    kernel_.setArg(9,a10);
+    // Launch; only the enqueue status is kept in err_ for getError().
+    err_ = queue_.enqueueNDRangeKernel(
+        kernel_,
+        offset_,
+        global_,
+        local_,
+        events,  // forward the caller's wait list instead of discarding it
+        &event);
+
+    return event;
+}
+
+template<class A1, class A2, class A3, class A4, class A5,
+         class A6, class A7, class A8, class A9, class A10,
+         class A11>
+Event KernelFunctor::operator()(
+    const A1& a1,
+    const A2& a2,
+    const A3& a3,
+    const A4& a4,
+    const A5& a5,
+    const A6& a6,
+    const A7& a7,
+    const A8& a8,
+    const A9& a9,
+    const A10& a10,
+    const A11& a11,
+    const VECTOR_CLASS<Event>* events)
+{
+    Event event;
+    // Set a1..a11 as kernel arguments 0..10; setArg results are not recorded.
+    kernel_.setArg(0,a1);
+    kernel_.setArg(1,a2);
+    kernel_.setArg(2,a3);
+    kernel_.setArg(3,a4);
+    kernel_.setArg(4,a5);
+    kernel_.setArg(5,a6);
+    kernel_.setArg(6,a7);
+    kernel_.setArg(7,a8);
+    kernel_.setArg(8,a9);
+    kernel_.setArg(9,a10);
+    kernel_.setArg(10,a11);
+    // Launch; only the enqueue status is kept in err_ for getError().
+    err_ = queue_.enqueueNDRangeKernel(
+        kernel_,
+        offset_,
+        global_,
+        local_,
+        events,  // forward the caller's wait list instead of discarding it
+        &event);
+
+    return event;
+}
+
+template<class A1, class A2, class A3, class A4, class A5,
+         class A6, class A7, class A8, class A9, class A10,
+         class A11, class A12>
+Event KernelFunctor::operator()(
+    const A1& a1,
+    const A2& a2,
+    const A3& a3,
+    const A4& a4,
+    const A5& a5,
+    const A6& a6,
+    const A7& a7,
+    const A8& a8,
+    const A9& a9,
+    const A10& a10,
+    const A11& a11,
+    const A12& a12,
+    const VECTOR_CLASS<Event>* events)
+{
+    Event event;
+    // Set a1..a12 as kernel arguments 0..11; setArg results are not recorded.
+    kernel_.setArg(0,a1);
+    kernel_.setArg(1,a2);
+    kernel_.setArg(2,a3);
+    kernel_.setArg(3,a4);
+    kernel_.setArg(4,a5);
+    kernel_.setArg(5,a6);
+    kernel_.setArg(6,a7);
+    kernel_.setArg(7,a8);
+    kernel_.setArg(8,a9);
+    kernel_.setArg(9,a10);
+    kernel_.setArg(10,a11);
+    kernel_.setArg(11,a12);
+    // Launch; only the enqueue status is kept in err_ for getError().
+    err_ = queue_.enqueueNDRangeKernel(
+        kernel_,
+        offset_,
+        global_,
+        local_,
+        events,  // forward the caller's wait list instead of discarding it
+        &event);
+
+    return event;
+}
+
+template<class A1, class A2, class A3, class A4, class A5,
+         class A6, class A7, class A8, class A9, class A10,
+         class A11, class A12, class A13>
+Event KernelFunctor::operator()(
+    const A1& a1,
+    const A2& a2,
+    const A3& a3,
+    const A4& a4,
+    const A5& a5,
+    const A6& a6,
+    const A7& a7,
+    const A8& a8,
+    const A9& a9,
+    const A10& a10,
+    const A11& a11,
+    const A12& a12,
+    const A13& a13,
+    const VECTOR_CLASS<Event>* events)
+{
+    Event event;
+    // Set a1..a13 as kernel arguments 0..12; setArg results are not recorded.
+    kernel_.setArg(0,a1);
+    kernel_.setArg(1,a2);
+    kernel_.setArg(2,a3);
+    kernel_.setArg(3,a4);
+    kernel_.setArg(4,a5);
+    kernel_.setArg(5,a6);
+    kernel_.setArg(6,a7);
+    kernel_.setArg(7,a8);
+    kernel_.setArg(8,a9);
+    kernel_.setArg(9,a10);
+    kernel_.setArg(10,a11);
+    kernel_.setArg(11,a12);
+    kernel_.setArg(12,a13);
+    // Launch; only the enqueue status is kept in err_ for getError().
+    err_ = queue_.enqueueNDRangeKernel(
+        kernel_,
+        offset_,
+        global_,
+        local_,
+        events,  // forward the caller's wait list instead of discarding it
+        &event);
+
+    return event;
+}
+
+template<class A1, class A2, class A3, class A4, class A5,
+         class A6, class A7, class A8, class A9, class A10,
+         class A11, class A12, class A13, class A14>
+Event KernelFunctor::operator()(
+    const A1& a1,
+    const A2& a2,
+    const A3& a3,
+    const A4& a4,
+    const A5& a5,
+    const A6& a6,
+    const A7& a7,
+    const A8& a8,
+    const A9& a9,
+    const A10& a10,
+    const A11& a11,
+    const A12& a12,
+    const A13& a13,
+    const A14& a14,
+    const VECTOR_CLASS<Event>* events)
+{
+    Event event;
+    // Set a1..a14 as kernel arguments 0..13; setArg results are not recorded.
+    kernel_.setArg(0,a1);
+    kernel_.setArg(1,a2);
+    kernel_.setArg(2,a3);
+    kernel_.setArg(3,a4);
+    kernel_.setArg(4,a5);
+    kernel_.setArg(5,a6);
+    kernel_.setArg(6,a7);
+    kernel_.setArg(7,a8);
+    kernel_.setArg(8,a9);
+    kernel_.setArg(9,a10);
+    kernel_.setArg(10,a11);
+    kernel_.setArg(11,a12);
+    kernel_.setArg(12,a13);
+    kernel_.setArg(13,a14);
+    // Launch; only the enqueue status is kept in err_ for getError().
+    err_ = queue_.enqueueNDRangeKernel(
+        kernel_,
+        offset_,
+        global_,
+        local_,
+        events,  // forward the caller's wait list instead of discarding it
+        &event);
+
+    return event;
+}
+
+template<class A1, class A2, class A3, class A4, class A5,
+         class A6, class A7, class A8, class A9, class A10,
+         class A11, class A12, class A13, class A14, class A15>
+Event KernelFunctor::operator()(
+    const A1& a1,
+    const A2& a2,
+    const A3& a3,
+    const A4& a4,
+    const A5& a5,
+    const A6& a6,
+    const A7& a7,
+    const A8& a8,
+    const A9& a9,
+    const A10& a10,
+    const A11& a11,
+    const A12& a12,
+    const A13& a13,
+    const A14& a14,
+    const A15& a15,
+    const VECTOR_CLASS<Event>* events)
+{
+    Event event;
+    // Set a1..a15 as kernel arguments 0..14; setArg results are not recorded.
+    kernel_.setArg(0,a1);
+    kernel_.setArg(1,a2);
+    kernel_.setArg(2,a3);
+    kernel_.setArg(3,a4);
+    kernel_.setArg(4,a5);
+    kernel_.setArg(5,a6);
+    kernel_.setArg(6,a7);
+    kernel_.setArg(7,a8);
+    kernel_.setArg(8,a9);
+    kernel_.setArg(9,a10);
+    kernel_.setArg(10,a11);
+    kernel_.setArg(11,a12);
+    kernel_.setArg(12,a13);
+    kernel_.setArg(13,a14);
+    kernel_.setArg(14,a15);
+    // Launch; only the enqueue status is kept in err_ for getError().
+    err_ = queue_.enqueueNDRangeKernel(
+        kernel_,
+        offset_,
+        global_,
+        local_,
+        events,  // forward the caller's wait list instead of discarding it
+        &event);
+
+    return event;
+}
+
+#undef __ERR_STR
+#if !defined(__CL_USER_OVERRIDE_ERROR_STRINGS)
+#undef __GET_DEVICE_INFO_ERR
+#undef __GET_PLATFORM_INFO_ERR
+#undef __GET_DEVICE_IDS_ERR
+#undef __GET_CONTEXT_INFO_ERR
+#undef __GET_EVENT_INFO_ERR
+#undef __GET_EVENT_PROFILE_INFO_ERR
+#undef __GET_MEM_OBJECT_INFO_ERR
+#undef __GET_IMAGE_INFO_ERR
+#undef __GET_SAMPLER_INFO_ERR
+#undef __GET_KERNEL_INFO_ERR
+#undef __GET_KERNEL_WORK_GROUP_INFO_ERR
+#undef __GET_PROGRAM_INFO_ERR
+#undef __GET_PROGRAM_BUILD_INFO_ERR
+#undef __GET_COMMAND_QUEUE_INFO_ERR
+
+#undef __CREATE_CONTEXT_FROM_TYPE_ERR
+#undef __GET_SUPPORTED_IMAGE_FORMATS_ERR
+
+#undef __CREATE_BUFFER_ERR
+#undef __CREATE_SUBBUFFER_ERR
+#undef __CREATE_IMAGE2D_ERR
+#undef __CREATE_IMAGE3D_ERR
+#undef __CREATE_SAMPLER_ERR
+#undef __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR
+
+#undef __CREATE_USER_EVENT_ERR
+#undef __SET_USER_EVENT_STATUS_ERR
+#undef __SET_EVENT_CALLBACK_ERR
+
+#undef __WAIT_FOR_EVENTS_ERR
+
+#undef __CREATE_KERNEL_ERR
+#undef __SET_KERNEL_ARGS_ERR
+#undef __CREATE_PROGRAM_WITH_SOURCE_ERR
+#undef __CREATE_PROGRAM_WITH_BINARY_ERR
+#undef __BUILD_PROGRAM_ERR
+#undef __CREATE_KERNELS_IN_PROGRAM_ERR
+
+#undef __CREATE_COMMAND_QUEUE_ERR
+#undef __SET_COMMAND_QUEUE_PROPERTY_ERR
+#undef __ENQUEUE_READ_BUFFER_ERR
+#undef __ENQUEUE_WRITE_BUFFER_ERR
+#undef __ENQUEUE_READ_BUFFER_RECT_ERR
+#undef __ENQUEUE_WRITE_BUFFER_RECT_ERR
+#undef __ENQEUE_COPY_BUFFER_ERR
+#undef __ENQEUE_COPY_BUFFER_RECT_ERR
+#undef __ENQUEUE_READ_IMAGE_ERR
+#undef __ENQUEUE_WRITE_IMAGE_ERR
+#undef __ENQUEUE_COPY_IMAGE_ERR
+#undef __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR
+#undef __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR
+#undef __ENQUEUE_MAP_BUFFER_ERR
+#undef __ENQUEUE_MAP_IMAGE_ERR
+#undef __ENQUEUE_UNMAP_MEM_OBJECT_ERR
+#undef __ENQUEUE_NDRANGE_KERNEL_ERR
+#undef __ENQUEUE_TASK_ERR
+#undef __ENQUEUE_NATIVE_KERNEL
+
+#undef __UNLOAD_COMPILER_ERR
+#endif //__CL_USER_OVERRIDE_ERROR_STRINGS
+
+#undef __GET_INFO_HELPER_WITH_RETAIN
+
+// Extensions
+#undef __INIT_CL_EXT_FCN_PTR
+#undef __CREATE_SUB_DEVICES
+
+#if defined(USE_CL_DEVICE_FISSION)
+#undef __PARAM_NAME_DEVICE_FISSION
+#endif // USE_CL_DEVICE_FISSION
+
+} // namespace cl
+
+#endif // CL_HPP_
diff --git a/include/CL/cl_ext.h b/include/CL/cl_ext.h
new file mode 100644
index 00000000000..4e92c7e634b
--- /dev/null
+++ b/include/CL/cl_ext.h
@@ -0,0 +1,213 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2010 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+/* $Revision: 11928 $ on $Date: 2010-07-13 09:04:56 -0700 (Tue, 13 Jul 2010) $ */
+
+/* cl_ext.h contains OpenCL extensions which don't have external */
+/* (OpenGL, D3D) dependencies.                                   */
+
+#ifndef __CL_EXT_H
+#define __CL_EXT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __APPLE__
+	#include <OpenCL/cl.h>
+    #include <AvailabilityMacros.h>
+#else
+	#include <CL/cl.h>
+#endif
+
+/* cl_khr_fp64 extension - no extension #define since it has no functions  */
+#define CL_DEVICE_DOUBLE_FP_CONFIG                  0x1032
+
+/* cl_khr_fp16 extension - no extension #define since it has no functions  */
+#define CL_DEVICE_HALF_FP_CONFIG                    0x1033
+
+/* Memory object destruction
+ *
+ * Apple extension for use to manage externally allocated buffers used with cl_mem objects with CL_MEM_USE_HOST_PTR
+ *
+ * Registers a user callback function that will be called when the memory object is deleted and its resources 
+ * freed. Each call to clSetMemObjectDestructorAPPLE registers the specified user callback function on a callback
+ * stack associated with memobj. The registered user callback functions are called in the reverse order in 
+ * which they were registered. The user callback functions are called and then the memory object is deleted 
+ * and its resources freed. This provides a mechanism for the application (and libraries) using memobj to be 
+ * notified when the memory referenced by host_ptr, specified when the memory object is created and used as 
+ * the storage bits for the memory object, can be reused or freed.
+ *
+ * The application may not call CL APIs with the cl_mem object passed to pfn_notify.
+ *
+ * Please check for the "cl_APPLE_SetMemObjectDestructor" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
+ * before using.
+ */
+#define cl_APPLE_SetMemObjectDestructor 1
+cl_int	CL_API_ENTRY clSetMemObjectDestructorAPPLE(  cl_mem /* memobj */, 
+                                        void (* /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/), 
+                                        void * /*user_data */ )             CL_EXT_SUFFIX__VERSION_1_0;  
+
+
+/* Context Logging Functions
+ *
+ * The next three convenience functions are intended to be used as the pfn_notify parameter to clCreateContext().
+ * Please check for the "cl_APPLE_ContextLoggingFunctions" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
+ * before using.
+ *
+ * clLogMessagesToSystemLog forwards all log messages to the Apple System Logger
+ */
+#define cl_APPLE_ContextLoggingFunctions 1
+extern void CL_API_ENTRY clLogMessagesToSystemLogAPPLE(  const char * /* errstr */, 
+                                            const void * /* private_info */, 
+                                            size_t       /* cb */, 
+                                            void *       /* user_data */ )  CL_EXT_SUFFIX__VERSION_1_0;
+
+/* clLogMessagesToStdout sends all log messages to the file descriptor stdout */
+extern void CL_API_ENTRY clLogMessagesToStdoutAPPLE(   const char * /* errstr */, 
+                                          const void * /* private_info */, 
+                                          size_t       /* cb */, 
+                                          void *       /* user_data */ )    CL_EXT_SUFFIX__VERSION_1_0;
+
+/* clLogMessagesToStderr sends all log messages to the file descriptor stderr */
+extern void CL_API_ENTRY clLogMessagesToStderrAPPLE(   const char * /* errstr */, 
+                                          const void * /* private_info */, 
+                                          size_t       /* cb */, 
+                                          void *       /* user_data */ )    CL_EXT_SUFFIX__VERSION_1_0;
+
+
+/************************ 
+* cl_khr_icd extension *                                                  
+************************/
+#define cl_khr_icd 1
+
+/* cl_platform_info                                                        */
+#define CL_PLATFORM_ICD_SUFFIX_KHR                  0x0920
+
+/* Additional Error Codes                                                  */
+#define CL_PLATFORM_NOT_FOUND_KHR                   -1001
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clIcdGetPlatformIDsKHR(cl_uint          /* num_entries */,
+                       cl_platform_id * /* platforms */,
+                       cl_uint *        /* num_platforms */);
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clIcdGetPlatformIDsKHR_fn)(
+    cl_uint          /* num_entries */,
+    cl_platform_id * /* platforms */,
+    cl_uint *        /* num_platforms */);
+
+
+/******************************************
+* cl_nv_device_attribute_query extension *
+******************************************/
+/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
+#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV       0x4000
+#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV       0x4001
+#define CL_DEVICE_REGISTERS_PER_BLOCK_NV            0x4002
+#define CL_DEVICE_WARP_SIZE_NV                      0x4003
+#define CL_DEVICE_GPU_OVERLAP_NV                    0x4004
+#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV            0x4005
+#define CL_DEVICE_INTEGRATED_MEMORY_NV              0x4006
+
+
+/*********************************
+* cl_amd_device_attribute_query *
+*********************************/
+#define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD        0x4036
+
+
+#ifdef CL_VERSION_1_1
+   /***********************************
+    * cl_ext_device_fission extension *
+    ***********************************/
+    #define cl_ext_device_fission   1
+    
+    extern CL_API_ENTRY cl_int CL_API_CALL
+    clReleaseDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1; 
+    
+    typedef CL_API_ENTRY cl_int 
+    (CL_API_CALL *clReleaseDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+    extern CL_API_ENTRY cl_int CL_API_CALL
+    clRetainDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1; 
+    
+    typedef CL_API_ENTRY cl_int 
+    (CL_API_CALL *clRetainDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+    typedef cl_ulong  cl_device_partition_property_ext;
+    extern CL_API_ENTRY cl_int CL_API_CALL
+    clCreateSubDevicesEXT(  cl_device_id /*in_device*/,
+                            const cl_device_partition_property_ext * /* properties */,
+                            cl_uint /*num_entries*/,
+                            cl_device_id * /*out_devices*/,
+                            cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+    typedef CL_API_ENTRY cl_int 
+    ( CL_API_CALL * clCreateSubDevicesEXT_fn)(  cl_device_id /*in_device*/,
+                                                const cl_device_partition_property_ext * /* properties */,
+                                                cl_uint /*num_entries*/,
+                                                cl_device_id * /*out_devices*/,
+                                                cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+    /* cl_device_partition_property_ext */
+    #define CL_DEVICE_PARTITION_EQUALLY_EXT             0x4050
+    #define CL_DEVICE_PARTITION_BY_COUNTS_EXT           0x4051
+    #define CL_DEVICE_PARTITION_BY_NAMES_EXT            0x4052
+    #define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT  0x4053
+    
+    /* clGetDeviceInfo selectors */
+    #define CL_DEVICE_PARENT_DEVICE_EXT                 0x4054
+    #define CL_DEVICE_PARTITION_TYPES_EXT               0x4055
+    #define CL_DEVICE_AFFINITY_DOMAINS_EXT              0x4056
+    #define CL_DEVICE_REFERENCE_COUNT_EXT               0x4057
+    #define CL_DEVICE_PARTITION_STYLE_EXT               0x4058
+    
+    /* error codes */
+    #define CL_DEVICE_PARTITION_FAILED_EXT              -1057
+    #define CL_INVALID_PARTITION_COUNT_EXT              -1058
+    #define CL_INVALID_PARTITION_NAME_EXT               -1059
+    
+    /* CL_AFFINITY_DOMAINs */
+    #define CL_AFFINITY_DOMAIN_L1_CACHE_EXT             0x1
+    #define CL_AFFINITY_DOMAIN_L2_CACHE_EXT             0x2
+    #define CL_AFFINITY_DOMAIN_L3_CACHE_EXT             0x3
+    #define CL_AFFINITY_DOMAIN_L4_CACHE_EXT             0x4
+    #define CL_AFFINITY_DOMAIN_NUMA_EXT                 0x10
+    #define CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT     0x100
+    
+    /* cl_device_partition_property_ext list terminators */
+    #define CL_PROPERTIES_LIST_END_EXT                  ((cl_device_partition_property_ext) 0)
+    #define CL_PARTITION_BY_COUNTS_LIST_END_EXT         ((cl_device_partition_property_ext) 0)
+    #define CL_PARTITION_BY_NAMES_LIST_END_EXT          ((cl_device_partition_property_ext) 0 - 1)
+
+
+
+#endif /* CL_VERSION_1_1 */
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif /* __CL_EXT_H */
diff --git a/include/CL/cl_gl.h b/include/CL/cl_gl.h
new file mode 100644
index 00000000000..3b4fe0690e5
--- /dev/null
+++ b/include/CL/cl_gl.h
@@ -0,0 +1,155 @@
+/**********************************************************************************
+ * Copyright (c) 2008-2010 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+
+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
+
+/*
+ * cl_gl.h contains Khronos-approved (KHR) OpenCL extensions which have
+ * OpenGL dependencies. The application is responsible for #including
+ * OpenGL or OpenGL ES headers before #including cl_gl.h.
+ */
+
+#ifndef __OPENCL_CL_GL_H
+#define __OPENCL_CL_GL_H
+
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#include <OpenGL/CGLDevice.h>
+#else
+#include <CL/cl.h>
+#endif	
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef cl_uint     cl_gl_object_type;
+typedef cl_uint     cl_gl_texture_info;
+typedef cl_uint     cl_gl_platform_info;
+typedef struct __GLsync *cl_GLsync;
+
+/* cl_gl_object_type */
+#define CL_GL_OBJECT_BUFFER             0x2000
+#define CL_GL_OBJECT_TEXTURE2D          0x2001
+#define CL_GL_OBJECT_TEXTURE3D          0x2002
+#define CL_GL_OBJECT_RENDERBUFFER       0x2003
+
+/* cl_gl_texture_info */
+#define CL_GL_TEXTURE_TARGET            0x2004
+#define CL_GL_MIPMAP_LEVEL              0x2005
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromGLBuffer(cl_context     /* context */,
+                     cl_mem_flags   /* flags */,
+                     cl_GLuint      /* bufobj */,
+                     cl_int *       /* errcode_ret; cl_int like the other clCreateFromGL* calls */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromGLTexture2D(cl_context      /* context */,
+                        cl_mem_flags    /* flags */,
+                        cl_GLenum       /* target */,
+                        cl_GLint        /* miplevel */,
+                        cl_GLuint       /* texture */,
+                        cl_int *        /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromGLTexture3D(cl_context      /* context */,
+                        cl_mem_flags    /* flags */,
+                        cl_GLenum       /* target */,
+                        cl_GLint        /* miplevel */,
+                        cl_GLuint       /* texture */,
+                        cl_int *        /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromGLRenderbuffer(cl_context   /* context */,
+                           cl_mem_flags /* flags */,
+                           cl_GLuint    /* renderbuffer */,
+                           cl_int *     /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetGLObjectInfo(cl_mem                /* memobj */,
+                  cl_gl_object_type *   /* gl_object_type */,
+                  cl_GLuint *              /* gl_object_name */) CL_API_SUFFIX__VERSION_1_0;
+                  
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetGLTextureInfo(cl_mem               /* memobj */,
+                   cl_gl_texture_info   /* param_name */,
+                   size_t               /* param_value_size */,
+                   void *               /* param_value */,
+                   size_t *             /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueAcquireGLObjects(cl_command_queue      /* command_queue */,
+                          cl_uint               /* num_objects */,
+                          const cl_mem *        /* mem_objects */,
+                          cl_uint               /* num_events_in_wait_list */,
+                          const cl_event *      /* event_wait_list */,
+                          cl_event *            /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReleaseGLObjects(cl_command_queue      /* command_queue */,
+                          cl_uint               /* num_objects */,
+                          const cl_mem *        /* mem_objects */,
+                          cl_uint               /* num_events_in_wait_list */,
+                          const cl_event *      /* event_wait_list */,
+                          cl_event *            /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+/* cl_khr_gl_sharing extension  */
+
+#define cl_khr_gl_sharing 1
+
+typedef cl_uint     cl_gl_context_info;
+
+/* Additional Error Codes  */
+#define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR  -1000
+
+/* cl_gl_context_info  */
+#define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR    0x2006
+#define CL_DEVICES_FOR_GL_CONTEXT_KHR           0x2007
+
+/* Additional cl_context_properties  */
+#define CL_GL_CONTEXT_KHR                       0x2008
+#define CL_EGL_DISPLAY_KHR                      0x2009
+#define CL_GLX_DISPLAY_KHR                      0x200A
+#define CL_WGL_HDC_KHR                          0x200B
+#define CL_CGL_SHAREGROUP_KHR                   0x200C
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetGLContextInfoKHR(const cl_context_properties * /* properties */,
+                      cl_gl_context_info            /* param_name */,
+                      size_t                        /* param_value_size */,
+                      void *                        /* param_value */,
+                      size_t *                      /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)(
+    const cl_context_properties * properties,
+    cl_gl_context_info            param_name,
+    size_t                        param_value_size,
+    void *                        param_value,
+    size_t *                      param_value_size_ret);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  /* __OPENCL_CL_GL_H  */
diff --git a/include/CL/cl_gl_ext.h b/include/CL/cl_gl_ext.h
new file mode 100644
index 00000000000..26e47821f9e
--- /dev/null
+++ b/include/CL/cl_gl_ext.h
@@ -0,0 +1,69 @@
+/**********************************************************************************
+ * Copyright (c) 2008-2010 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+
+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
+
+/* cl_gl_ext.h contains vendor (non-KHR) OpenCL extensions which have           */
+/* OpenGL dependencies.                                                         */
+
+#ifndef __OPENCL_CL_GL_EXT_H
+#define __OPENCL_CL_GL_EXT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __APPLE__
+    #include <OpenCL/cl_gl.h>
+#else
+    #include <CL/cl_gl.h>
+#endif
+
+/*
+ * For each extension, follow this template
+ *   cl_VEN_extname extension
+ *   #define cl_VEN_extname 1
+ * ... define new types, if any
+ * ... define new tokens, if any
+ * ... define new APIs, if any
+ *
+ *  If you need GLtypes here, mirror them with a cl_GLtype, rather than including a GL header
+ *  This allows us to avoid having to decide whether to include GL headers or GLES here.
+ */
+
+/* 
+ *  cl_khr_gl_event  extension
+ *  See section 9.9 in the OpenCL 1.1 spec for more information
+ */
+#define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR     0x200D
+
+extern CL_API_ENTRY cl_event CL_API_CALL
+clCreateEventFromGLsyncKHR(cl_context           /* context */,
+                           cl_GLsync            /* cl_GLsync */,
+                           cl_int *             /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif	/* __OPENCL_CL_GL_EXT_H  */
diff --git a/include/CL/cl_platform.h b/include/CL/cl_platform.h
new file mode 100644
index 00000000000..043b0489df7
--- /dev/null
+++ b/include/CL/cl_platform.h
@@ -0,0 +1,1198 @@
+/**********************************************************************************
+ * Copyright (c) 2008-2010 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+
+/* $Revision: 11803 $ on $Date: 2010-06-25 10:02:12 -0700 (Fri, 25 Jun 2010) $ */
+
+#ifndef __CL_PLATFORM_H
+#define __CL_PLATFORM_H
+
+#ifdef __APPLE__
+    /* Contains #defines for AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER below */
+    #include <AvailabilityMacros.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(_WIN32)
+    #define CL_API_ENTRY
+    #define CL_API_CALL     __stdcall
+    #define CL_CALLBACK     __stdcall
+#else
+    #define CL_API_ENTRY
+    #define CL_API_CALL
+    #define CL_CALLBACK
+#endif
+
+#ifdef __APPLE__
+    #define CL_EXTENSION_WEAK_LINK                  __attribute__((weak_import))       
+    #define CL_API_SUFFIX__VERSION_1_0              AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER
+    #define CL_EXT_SUFFIX__VERSION_1_0              CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER
+    #define CL_API_SUFFIX__VERSION_1_1              CL_EXTENSION_WEAK_LINK
+    #define CL_EXT_SUFFIX__VERSION_1_1              CL_EXTENSION_WEAK_LINK
+    #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED   CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER
+#else
+    #define CL_EXTENSION_WEAK_LINK                         
+    #define CL_API_SUFFIX__VERSION_1_0
+    #define CL_EXT_SUFFIX__VERSION_1_0
+    #define CL_API_SUFFIX__VERSION_1_1
+    #define CL_EXT_SUFFIX__VERSION_1_1
+    #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
+#endif
+
+#if (defined (_WIN32) && defined(_MSC_VER))
+
+/* scalar types  */
+typedef signed   __int8         cl_char;
+typedef unsigned __int8         cl_uchar;
+typedef signed   __int16        cl_short;
+typedef unsigned __int16        cl_ushort;
+typedef signed   __int32        cl_int;
+typedef unsigned __int32        cl_uint;
+typedef signed   __int64        cl_long;
+typedef unsigned __int64        cl_ulong;
+
+typedef unsigned __int16        cl_half;
+typedef float                   cl_float;
+typedef double                  cl_double;
+
+/* Macro names and corresponding values defined by OpenCL */
+#define CL_CHAR_BIT         8
+#define CL_SCHAR_MAX        127
+#define CL_SCHAR_MIN        (-127-1)
+#define CL_CHAR_MAX         CL_SCHAR_MAX
+#define CL_CHAR_MIN         CL_SCHAR_MIN
+#define CL_UCHAR_MAX        255
+#define CL_SHRT_MAX         32767
+#define CL_SHRT_MIN         (-32767-1)
+#define CL_USHRT_MAX        65535
+#define CL_INT_MAX          2147483647
+#define CL_INT_MIN          (-2147483647-1)
+#define CL_UINT_MAX         0xffffffffU
+#define CL_LONG_MAX         ((cl_long) 0x7FFFFFFFFFFFFFFFLL)
+#define CL_LONG_MIN         ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL)
+#define CL_ULONG_MAX        ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL)
+
+#define CL_FLT_DIG          6
+#define CL_FLT_MANT_DIG     24
+#define CL_FLT_MAX_10_EXP   +38
+#define CL_FLT_MAX_EXP      +128
+#define CL_FLT_MIN_10_EXP   -37
+#define CL_FLT_MIN_EXP      -125
+#define CL_FLT_RADIX        2
+#define CL_FLT_MAX          340282346638528859811704183484516925440.0f
+#define CL_FLT_MIN          1.175494350822287507969e-38f
+#define CL_FLT_EPSILON      1.1920928955078125e-7f  /* exactly 2^-23; decimal like CL_FLT_MAX/CL_FLT_MIN in this MSVC branch */
+
+#define CL_DBL_DIG          15
+#define CL_DBL_MANT_DIG     53
+#define CL_DBL_MAX_10_EXP   +308
+#define CL_DBL_MAX_EXP      +1024
+#define CL_DBL_MIN_10_EXP   -307
+#define CL_DBL_MIN_EXP      -1021
+#define CL_DBL_RADIX        2
+#define CL_DBL_MAX          179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.0
+#define CL_DBL_MIN          2.225073858507201383090e-308
+#define CL_DBL_EPSILON      2.220446049250313080847e-16
+
+#define  CL_M_E             2.718281828459045090796
+#define  CL_M_LOG2E         1.442695040888963387005
+#define  CL_M_LOG10E        0.434294481903251816668
+#define  CL_M_LN2           0.693147180559945286227
+#define  CL_M_LN10          2.302585092994045901094
+#define  CL_M_PI            3.141592653589793115998
+#define  CL_M_PI_2          1.570796326794896557999
+#define  CL_M_PI_4          0.785398163397448278999
+#define  CL_M_1_PI          0.318309886183790691216
+#define  CL_M_2_PI          0.636619772367581382433
+#define  CL_M_2_SQRTPI      1.128379167095512558561
+#define  CL_M_SQRT2         1.414213562373095145475
+#define  CL_M_SQRT1_2       0.707106781186547572737
+
+#define  CL_M_E_F           2.71828174591064f
+#define  CL_M_LOG2E_F       1.44269502162933f
+#define  CL_M_LOG10E_F      0.43429449200630f
+#define  CL_M_LN2_F         0.69314718246460f
+#define  CL_M_LN10_F        2.30258512496948f
+#define  CL_M_PI_F          3.14159274101257f
+#define  CL_M_PI_2_F        1.57079637050629f
+#define  CL_M_PI_4_F        0.78539818525314f
+#define  CL_M_1_PI_F        0.31830987334251f
+#define  CL_M_2_PI_F        0.63661974668503f
+#define  CL_M_2_SQRTPI_F    1.12837922573090f
+#define  CL_M_SQRT2_F       1.41421353816986f
+#define  CL_M_SQRT1_2_F     0.70710676908493f
+
+#define CL_NAN              (CL_INFINITY - CL_INFINITY)
+#define CL_HUGE_VALF        ((cl_float) 1e50)
+#define CL_HUGE_VAL         ((cl_double) 1e500)
+#define CL_MAXFLOAT         CL_FLT_MAX
+#define CL_INFINITY         CL_HUGE_VALF
+
+#else
+
+#include <stdint.h>
+
+/* scalar types  */
+typedef int8_t          cl_char;
+typedef uint8_t         cl_uchar;
+typedef int16_t         cl_short    __attribute__((aligned(2)));
+typedef uint16_t        cl_ushort   __attribute__((aligned(2)));
+typedef int32_t         cl_int      __attribute__((aligned(4)));
+typedef uint32_t        cl_uint     __attribute__((aligned(4)));
+typedef int64_t         cl_long     __attribute__((aligned(8)));
+typedef uint64_t        cl_ulong    __attribute__((aligned(8)));
+
+typedef uint16_t        cl_half     __attribute__((aligned(2)));
+typedef float           cl_float    __attribute__((aligned(4)));
+typedef double          cl_double   __attribute__((aligned(8)));
+
+/* Macro names and corresponding values defined by OpenCL */
+#define CL_CHAR_BIT         8
+#define CL_SCHAR_MAX        127
+#define CL_SCHAR_MIN        (-127-1)
+#define CL_CHAR_MAX         CL_SCHAR_MAX
+#define CL_CHAR_MIN         CL_SCHAR_MIN
+#define CL_UCHAR_MAX        255
+#define CL_SHRT_MAX         32767
+#define CL_SHRT_MIN         (-32767-1)
+#define CL_USHRT_MAX        65535
+#define CL_INT_MAX          2147483647
+#define CL_INT_MIN          (-2147483647-1)
+#define CL_UINT_MAX         0xffffffffU
+#define CL_LONG_MAX         ((cl_long) 0x7FFFFFFFFFFFFFFFLL)
+#define CL_LONG_MIN         ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL)
+#define CL_ULONG_MAX        ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL)
+
+#define CL_FLT_DIG          6
+#define CL_FLT_MANT_DIG     24
+#define CL_FLT_MAX_10_EXP   +38
+#define CL_FLT_MAX_EXP      +128
+#define CL_FLT_MIN_10_EXP   -37
+#define CL_FLT_MIN_EXP      -125
+#define CL_FLT_RADIX        2
+#define CL_FLT_MAX          0x1.fffffep127f
+#define CL_FLT_MIN          0x1.0p-126f
+#define CL_FLT_EPSILON      0x1.0p-23f
+
+#define CL_DBL_DIG          15
+#define CL_DBL_MANT_DIG     53
+#define CL_DBL_MAX_10_EXP   +308
+#define CL_DBL_MAX_EXP      +1024
+#define CL_DBL_MIN_10_EXP   -307
+#define CL_DBL_MIN_EXP      -1021
+#define CL_DBL_RADIX        2
+#define CL_DBL_MAX          0x1.fffffffffffffp1023
+#define CL_DBL_MIN          0x1.0p-1022
+#define CL_DBL_EPSILON      0x1.0p-52
+
+#define  CL_M_E             2.718281828459045090796
+#define  CL_M_LOG2E         1.442695040888963387005
+#define  CL_M_LOG10E        0.434294481903251816668
+#define  CL_M_LN2           0.693147180559945286227
+#define  CL_M_LN10          2.302585092994045901094
+#define  CL_M_PI            3.141592653589793115998
+#define  CL_M_PI_2          1.570796326794896557999
+#define  CL_M_PI_4          0.785398163397448278999
+#define  CL_M_1_PI          0.318309886183790691216
+#define  CL_M_2_PI          0.636619772367581382433
+#define  CL_M_2_SQRTPI      1.128379167095512558561
+#define  CL_M_SQRT2         1.414213562373095145475
+#define  CL_M_SQRT1_2       0.707106781186547572737
+
+#define  CL_M_E_F           2.71828174591064f
+#define  CL_M_LOG2E_F       1.44269502162933f
+#define  CL_M_LOG10E_F      0.43429449200630f
+#define  CL_M_LN2_F         0.69314718246460f
+#define  CL_M_LN10_F        2.30258512496948f
+#define  CL_M_PI_F          3.14159274101257f
+#define  CL_M_PI_2_F        1.57079637050629f
+#define  CL_M_PI_4_F        0.78539818525314f
+#define  CL_M_1_PI_F        0.31830987334251f
+#define  CL_M_2_PI_F        0.63661974668503f
+#define  CL_M_2_SQRTPI_F    1.12837922573090f
+#define  CL_M_SQRT2_F       1.41421353816986f
+#define  CL_M_SQRT1_2_F     0.70710676908493f
+
+#if defined( __GNUC__ )
+   #define CL_HUGE_VALF     __builtin_huge_valf()
+   #define CL_HUGE_VAL      __builtin_huge_val()
+   #define CL_NAN           __builtin_nanf( "" )
+#else
+   #define CL_HUGE_VALF     ((cl_float) 1e50)
+   #define CL_HUGE_VAL      ((cl_double) 1e500)
+   float nanf( const char * );
+   #define CL_NAN           nanf( "" )  
+#endif
+#define CL_MAXFLOAT         CL_FLT_MAX
+#define CL_INFINITY         CL_HUGE_VALF
+
+#endif
+
+#include <stddef.h>
+
+/* Mirror types to GL types. Mirror types allow us to avoid deciding which headers to load based on whether we are using GL or GLES here. */
+typedef unsigned int cl_GLuint;
+typedef int          cl_GLint;
+typedef unsigned int cl_GLenum;
+
+/*
+ * Vector types 
+ *
+ *  Note:   OpenCL requires that all types be naturally aligned. 
+ *          This means that vector types must be naturally aligned.
+ *          For example, a vector of four floats must be aligned to
+ *          a 16 byte boundary (calculated as 4 * the natural 4-byte 
+ *          alignment of the float).  The alignment qualifiers here
+ *          will only function properly if your compiler supports them
+ *          and if you don't actively work to defeat them.  For example,
+ *          in order for a cl_float4 to be 16 byte aligned in a struct,
+ *          the start of the struct must itself be 16-byte aligned. 
+ *
+ *          Maintaining proper alignment is the user's responsibility.
+ */
+
+/* Define basic vector types */
+#if defined( __VEC__ )
+   #include <altivec.h>   /* may be omitted depending on compiler. AltiVec spec provides no way to detect whether the header is required. */
+   typedef vector unsigned char     __cl_uchar16;
+   typedef vector signed char       __cl_char16;
+   typedef vector unsigned short    __cl_ushort8;
+   typedef vector signed short      __cl_short8;
+   typedef vector unsigned int      __cl_uint4;
+   typedef vector signed int        __cl_int4;
+   typedef vector float             __cl_float4;
+   #define  __CL_UCHAR16__  1
+   #define  __CL_CHAR16__   1
+   #define  __CL_USHORT8__  1
+   #define  __CL_SHORT8__   1
+   #define  __CL_UINT4__    1
+   #define  __CL_INT4__     1
+   #define  __CL_FLOAT4__   1
+#endif
+
+#if defined( __SSE__ )
+    #if defined( __MINGW64__ )
+        #include <intrin.h>
+    #else
+        #include <xmmintrin.h>
+    #endif
+    #if defined( __GNUC__ )
+        typedef float __cl_float4   __attribute__((vector_size(16)));
+    #else
+        typedef __m128 __cl_float4;
+    #endif
+    #define __CL_FLOAT4__   1
+#endif
+
+#if defined( __SSE2__ )
+    #if defined( __MINGW64__ )
+        #include <intrin.h>
+    #else
+        #include <emmintrin.h>
+    #endif
+    #if defined( __GNUC__ )
+        typedef cl_uchar    __cl_uchar16    __attribute__((vector_size(16)));
+        typedef cl_char     __cl_char16     __attribute__((vector_size(16)));
+        typedef cl_ushort   __cl_ushort8    __attribute__((vector_size(16)));
+        typedef cl_short    __cl_short8     __attribute__((vector_size(16)));
+        typedef cl_uint     __cl_uint4      __attribute__((vector_size(16)));
+        typedef cl_int      __cl_int4       __attribute__((vector_size(16)));
+        typedef cl_ulong    __cl_ulong2     __attribute__((vector_size(16)));
+        typedef cl_long     __cl_long2      __attribute__((vector_size(16)));
+        typedef cl_double   __cl_double2    __attribute__((vector_size(16)));
+    #else
+        typedef __m128i __cl_uchar16;
+        typedef __m128i __cl_char16;
+        typedef __m128i __cl_ushort8;
+        typedef __m128i __cl_short8;
+        typedef __m128i __cl_uint4;
+        typedef __m128i __cl_int4;
+        typedef __m128i __cl_ulong2;
+        typedef __m128i __cl_long2;
+        typedef __m128d __cl_double2;
+    #endif
+    #define __CL_UCHAR16__  1
+    #define __CL_CHAR16__   1
+    #define __CL_USHORT8__  1
+    #define __CL_SHORT8__   1
+    #define __CL_INT4__     1
+    #define __CL_UINT4__    1
+    #define __CL_ULONG2__   1
+    #define __CL_LONG2__    1
+    #define __CL_DOUBLE2__  1
+#endif
+
+#if defined( __MMX__ )
+    #include <mmintrin.h>
+    #if defined( __GNUC__ )
+        typedef cl_uchar    __cl_uchar8     __attribute__((vector_size(8)));
+        typedef cl_char     __cl_char8      __attribute__((vector_size(8)));
+        typedef cl_ushort   __cl_ushort4    __attribute__((vector_size(8)));
+        typedef cl_short    __cl_short4     __attribute__((vector_size(8)));
+        typedef cl_uint     __cl_uint2      __attribute__((vector_size(8)));
+        typedef cl_int      __cl_int2       __attribute__((vector_size(8)));
+        typedef cl_ulong    __cl_ulong1     __attribute__((vector_size(8)));
+        typedef cl_long     __cl_long1      __attribute__((vector_size(8)));
+        typedef cl_float    __cl_float2     __attribute__((vector_size(8)));
+    #else
+        typedef __m64       __cl_uchar8;
+        typedef __m64       __cl_char8;
+        typedef __m64       __cl_ushort4;
+        typedef __m64       __cl_short4;
+        typedef __m64       __cl_uint2;
+        typedef __m64       __cl_int2;
+        typedef __m64       __cl_ulong1;
+        typedef __m64       __cl_long1;
+        typedef __m64       __cl_float2;
+    #endif
+    #define __CL_UCHAR8__   1
+    #define __CL_CHAR8__    1
+    #define __CL_USHORT4__  1
+    #define __CL_SHORT4__   1
+    #define __CL_INT2__     1
+    #define __CL_UINT2__    1
+    #define __CL_ULONG1__   1
+    #define __CL_LONG1__    1
+    #define __CL_FLOAT2__   1
+#endif
+
+#if defined( __AVX__ )
+    #if defined( __MINGW64__ )
+        #include <intrin.h>
+    #else
+        #include <immintrin.h> 
+    #endif
+    #if defined( __GNUC__ )
+        typedef cl_float    __cl_float8     __attribute__((vector_size(32)));
+        typedef cl_double   __cl_double4    __attribute__((vector_size(32)));
+    #else
+        typedef __m256      __cl_float8;
+        typedef __m256d     __cl_double4;
+    #endif
+    #define __CL_FLOAT8__   1
+    #define __CL_DOUBLE4__  1
+#endif
+
+/* Define alignment keys */
+#if defined( __GNUC__ )
+    #define CL_ALIGNED(_x)          __attribute__ ((aligned(_x)))
+#elif defined( _WIN32) && defined( _MSC_VER)
+    /* Alignment keys neutered on windows because MSVC can't swallow function arguments with alignment requirements     */
+    /* http://msdn.microsoft.com/en-us/library/373ak2y1%28VS.71%29.aspx                                                 */
+    /* #include <crtdefs.h>                                                                                             */
+    /* #define CL_ALIGNED(_x)          _CRT_ALIGN(_x)                                                                   */
+    #define CL_ALIGNED(_x)
+#else
+   #warning  Need to implement some method to align data here
+   #define  CL_ALIGNED(_x)
+#endif
+
+/* Indicate whether .xyzw, .s0123 and .hi.lo are supported */
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+    /* .xyzw and .s0123...{f|F} are supported */
+    #define CL_HAS_NAMED_VECTOR_FIELDS 1
+    /* .hi and .lo are supported */
+    #define CL_HAS_HI_LO_VECTOR_FIELDS 1
+#endif
+
+/* Define cl_vector types */
+
+/* ---- cl_charn ---- */
+typedef union
+{
+    cl_char  CL_ALIGNED(2) s[2];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+   __extension__ struct{ cl_char  x, y; };
+   __extension__ struct{ cl_char  s0, s1; };
+   __extension__ struct{ cl_char  lo, hi; };
+#endif
+#if defined( __CL_CHAR2__) 
+    __cl_char2     v2;
+#endif
+}cl_char2;
+
+typedef union
+{
+    cl_char  CL_ALIGNED(4) s[4];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+   __extension__ struct{ cl_char  x, y, z, w; };
+   __extension__ struct{ cl_char  s0, s1, s2, s3; };
+   __extension__ struct{ cl_char2 lo, hi; };
+#endif
+#if defined( __CL_CHAR2__) 
+    __cl_char2     v2[2];
+#endif
+#if defined( __CL_CHAR4__) 
+    __cl_char4     v4;
+#endif
+}cl_char4;
+
+/* cl_char3 is identical in size, alignment and behavior to cl_char4. See section 6.1.5. */
+typedef  cl_char4  cl_char3;
+
+typedef union
+{
+    cl_char   CL_ALIGNED(8) s[8];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+   __extension__ struct{ cl_char  x, y, z, w; };
+   __extension__ struct{ cl_char  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __extension__ struct{ cl_char4 lo, hi; };
+#endif
+#if defined( __CL_CHAR2__) 
+    __cl_char2     v2[4];
+#endif
+#if defined( __CL_CHAR4__) 
+    __cl_char4     v4[2];
+#endif
+#if defined( __CL_CHAR8__ )
+    __cl_char8     v8;
+#endif
+}cl_char8;
+
+typedef union
+{
+    cl_char  CL_ALIGNED(16) s[16];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+   __extension__ struct{ cl_char  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __extension__ struct{ cl_char  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __extension__ struct{ cl_char8 lo, hi; };
+#endif
+#if defined( __CL_CHAR2__) 
+    __cl_char2     v2[8];
+#endif
+#if defined( __CL_CHAR4__) 
+    __cl_char4     v4[4];
+#endif
+#if defined( __CL_CHAR8__ )
+    __cl_char8     v8[2];
+#endif
+#if defined( __CL_CHAR16__ )
+    __cl_char16    v16;
+#endif
+}cl_char16;
+
+
+/* ---- cl_ucharn ---- */
+typedef union
+{
+    cl_uchar  CL_ALIGNED(2) s[2];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+   __extension__ struct{ cl_uchar  x, y; };
+   __extension__ struct{ cl_uchar  s0, s1; };
+   __extension__ struct{ cl_uchar  lo, hi; };
+#endif
+#if defined( __CL_UCHAR2__)   /* upper-case key, matching the other cl_ucharN unions */
+    __cl_uchar2     v2;
+#endif
+}cl_uchar2;
+
+typedef union
+{
+    cl_uchar  CL_ALIGNED(4) s[4];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+   __extension__ struct{ cl_uchar  x, y, z, w; };
+   __extension__ struct{ cl_uchar  s0, s1, s2, s3; };
+   __extension__ struct{ cl_uchar2 lo, hi; };
+#endif
+#if defined( __CL_UCHAR2__) 
+    __cl_uchar2     v2[2];
+#endif
+#if defined( __CL_UCHAR4__) 
+    __cl_uchar4     v4;
+#endif
+}cl_uchar4;
+
+/* cl_uchar3 is identical in size, alignment and behavior to cl_uchar4. See section 6.1.5. */
+typedef  cl_uchar4  cl_uchar3;
+
+typedef union
+{
+    cl_uchar   CL_ALIGNED(8) s[8];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+   __extension__ struct{ cl_uchar  x, y, z, w; };
+   __extension__ struct{ cl_uchar  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __extension__ struct{ cl_uchar4 lo, hi; };
+#endif
+#if defined( __CL_UCHAR2__) 
+    __cl_uchar2     v2[4];
+#endif
+#if defined( __CL_UCHAR4__) 
+    __cl_uchar4     v4[2];
+#endif
+#if defined( __CL_UCHAR8__ )
+    __cl_uchar8     v8;
+#endif
+}cl_uchar8;
+
+typedef union
+{
+    cl_uchar  CL_ALIGNED(16) s[16];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+   __extension__ struct{ cl_uchar  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __extension__ struct{ cl_uchar  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __extension__ struct{ cl_uchar8 lo, hi; };
+#endif
+#if defined( __CL_UCHAR2__) 
+    __cl_uchar2     v2[8];
+#endif
+#if defined( __CL_UCHAR4__) 
+    __cl_uchar4     v4[4];
+#endif
+#if defined( __CL_UCHAR8__ )
+    __cl_uchar8     v8[2];
+#endif
+#if defined( __CL_UCHAR16__ )
+    __cl_uchar16    v16;
+#endif
+}cl_uchar16;
+
+
+/* ---- cl_shortn ---- */
+typedef union
+{
+    cl_short  CL_ALIGNED(4) s[2];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+   __extension__ struct{ cl_short  x, y; };
+   __extension__ struct{ cl_short  s0, s1; };
+   __extension__ struct{ cl_short  lo, hi; };
+#endif
+#if defined( __CL_SHORT2__) 
+    __cl_short2     v2;
+#endif
+}cl_short2;
+
+typedef union
+{
+    cl_short  CL_ALIGNED(8) s[4];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+   __extension__ struct{ cl_short  x, y, z, w; };
+   __extension__ struct{ cl_short  s0, s1, s2, s3; };
+   __extension__ struct{ cl_short2 lo, hi; };
+#endif
+#if defined( __CL_SHORT2__) 
+    __cl_short2     v2[2];
+#endif
+#if defined( __CL_SHORT4__) 
+    __cl_short4     v4;
+#endif
+}cl_short4;
+
+/* cl_short3 is identical in size, alignment and behavior to cl_short4. See section 6.1.5. */
+typedef  cl_short4  cl_short3;
+
+typedef union
+{
+    cl_short   CL_ALIGNED(16) s[8];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+   __extension__ struct{ cl_short  x, y, z, w; };
+   __extension__ struct{ cl_short  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __extension__ struct{ cl_short4 lo, hi; };
+#endif
+#if defined( __CL_SHORT2__) 
+    __cl_short2     v2[4];
+#endif
+#if defined( __CL_SHORT4__) 
+    __cl_short4     v4[2];
+#endif
+#if defined( __CL_SHORT8__ )
+    __cl_short8     v8;
+#endif
+}cl_short8;
+
+typedef union
+{
+    cl_short  CL_ALIGNED(32) s[16];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+   __extension__ struct{ cl_short  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __extension__ struct{ cl_short  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __extension__ struct{ cl_short8 lo, hi; };
+#endif
+#if defined( __CL_SHORT2__) 
+    __cl_short2     v2[8];
+#endif
+#if defined( __CL_SHORT4__) 
+    __cl_short4     v4[4];
+#endif
+#if defined( __CL_SHORT8__ )
+    __cl_short8     v8[2];
+#endif
+#if defined( __CL_SHORT16__ )
+    __cl_short16    v16;
+#endif
+}cl_short16;
+
+
+/* ---- cl_ushortn ---- */
+typedef union
+{
+    cl_ushort  CL_ALIGNED(4) s[2];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+   __extension__ struct{ cl_ushort  x, y; };
+   __extension__ struct{ cl_ushort  s0, s1; };
+   __extension__ struct{ cl_ushort  lo, hi; };
+#endif
+#if defined( __CL_USHORT2__) 
+    __cl_ushort2     v2;
+#endif
+}cl_ushort2;
+
+typedef union
+{
+    cl_ushort  CL_ALIGNED(8) s[4];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+   __extension__ struct{ cl_ushort  x, y, z, w; };
+   __extension__ struct{ cl_ushort  s0, s1, s2, s3; };
+   __extension__ struct{ cl_ushort2 lo, hi; };
+#endif
+#if defined( __CL_USHORT2__) 
+    __cl_ushort2     v2[2];
+#endif
+#if defined( __CL_USHORT4__) 
+    __cl_ushort4     v4;
+#endif
+}cl_ushort4;
+
+/* cl_ushort3 is identical in size, alignment and behavior to cl_ushort4. See section 6.1.5. */
+typedef  cl_ushort4  cl_ushort3;
+
+typedef union
+{
+    cl_ushort   CL_ALIGNED(16) s[8];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+   __extension__ struct{ cl_ushort  x, y, z, w; };
+   __extension__ struct{ cl_ushort  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __extension__ struct{ cl_ushort4 lo, hi; };
+#endif
+#if defined( __CL_USHORT2__) 
+    __cl_ushort2     v2[4];
+#endif
+#if defined( __CL_USHORT4__) 
+    __cl_ushort4     v4[2];
+#endif
+#if defined( __CL_USHORT8__ )
+    __cl_ushort8     v8;
+#endif
+}cl_ushort8;
+
+typedef union
+{
+    cl_ushort  CL_ALIGNED(32) s[16];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+   __extension__ struct{ cl_ushort  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __extension__ struct{ cl_ushort  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __extension__ struct{ cl_ushort8 lo, hi; };
+#endif
+#if defined( __CL_USHORT2__) 
+    __cl_ushort2     v2[8];
+#endif
+#if defined( __CL_USHORT4__) 
+    __cl_ushort4     v4[4];
+#endif
+#if defined( __CL_USHORT8__ )
+    __cl_ushort8     v8[2];
+#endif
+#if defined( __CL_USHORT16__ )
+    __cl_ushort16    v16;
+#endif
+}cl_ushort16;
+
+/* ---- cl_intn ---- */
+typedef union
+{
+    cl_int  CL_ALIGNED(8) s[2];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+   __extension__ struct{ cl_int  x, y; };
+   __extension__ struct{ cl_int  s0, s1; };
+   __extension__ struct{ cl_int  lo, hi; };
+#endif
+#if defined( __CL_INT2__) 
+    __cl_int2     v2;
+#endif
+}cl_int2;
+
+typedef union
+{
+    cl_int  CL_ALIGNED(16) s[4];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+   __extension__ struct{ cl_int  x, y, z, w; };
+   __extension__ struct{ cl_int  s0, s1, s2, s3; };
+   __extension__ struct{ cl_int2 lo, hi; };
+#endif
+#if defined( __CL_INT2__) 
+    __cl_int2     v2[2];
+#endif
+#if defined( __CL_INT4__) 
+    __cl_int4     v4;
+#endif
+}cl_int4;
+
+/* cl_int3 is identical in size, alignment and behavior to cl_int4. See section 6.1.5. */
+typedef  cl_int4  cl_int3;
+
+typedef union
+{
+    cl_int   CL_ALIGNED(32) s[8];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+   __extension__ struct{ cl_int  x, y, z, w; };
+   __extension__ struct{ cl_int  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __extension__ struct{ cl_int4 lo, hi; };
+#endif
+#if defined( __CL_INT2__) 
+    __cl_int2     v2[4];
+#endif
+#if defined( __CL_INT4__) 
+    __cl_int4     v4[2];
+#endif
+#if defined( __CL_INT8__ )
+    __cl_int8     v8;
+#endif
+}cl_int8;
+
+typedef union
+{
+    cl_int  CL_ALIGNED(64) s[16];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+   __extension__ struct{ cl_int  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __extension__ struct{ cl_int  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __extension__ struct{ cl_int8 lo, hi; };
+#endif
+#if defined( __CL_INT2__) 
+    __cl_int2     v2[8];
+#endif
+#if defined( __CL_INT4__) 
+    __cl_int4     v4[4];
+#endif
+#if defined( __CL_INT8__ )
+    __cl_int8     v8[2];
+#endif
+#if defined( __CL_INT16__ )
+    __cl_int16    v16;
+#endif
+}cl_int16;
+
+
+/* ---- cl_uintn ---- */
+typedef union
+{
+    cl_uint  CL_ALIGNED(8) s[2];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+   __extension__ struct{ cl_uint  x, y; };
+   __extension__ struct{ cl_uint  s0, s1; };
+   __extension__ struct{ cl_uint  lo, hi; };
+#endif
+#if defined( __CL_UINT2__) 
+    __cl_uint2     v2;
+#endif
+}cl_uint2;
+
+typedef union
+{
+    cl_uint  CL_ALIGNED(16) s[4];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+   __extension__ struct{ cl_uint  x, y, z, w; };
+   __extension__ struct{ cl_uint  s0, s1, s2, s3; };
+   __extension__ struct{ cl_uint2 lo, hi; };
+#endif
+#if defined( __CL_UINT2__) 
+    __cl_uint2     v2[2];
+#endif
+#if defined( __CL_UINT4__) 
+    __cl_uint4     v4;
+#endif
+}cl_uint4;
+
+/* cl_uint3 is identical in size, alignment and behavior to cl_uint4. See section 6.1.5. */
+typedef  cl_uint4  cl_uint3;
+
+typedef union
+{
+    cl_uint   CL_ALIGNED(32) s[8];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+   __extension__ struct{ cl_uint  x, y, z, w; };
+   __extension__ struct{ cl_uint  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __extension__ struct{ cl_uint4 lo, hi; };
+#endif
+#if defined( __CL_UINT2__) 
+    __cl_uint2     v2[4];
+#endif
+#if defined( __CL_UINT4__) 
+    __cl_uint4     v4[2];
+#endif
+#if defined( __CL_UINT8__ )
+    __cl_uint8     v8;
+#endif
+}cl_uint8;
+
+typedef union
+{
+    cl_uint  CL_ALIGNED(64) s[16];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+   __extension__ struct{ cl_uint  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __extension__ struct{ cl_uint  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __extension__ struct{ cl_uint8 lo, hi; };
+#endif
+#if defined( __CL_UINT2__) 
+    __cl_uint2     v2[8];
+#endif
+#if defined( __CL_UINT4__) 
+    __cl_uint4     v4[4];
+#endif
+#if defined( __CL_UINT8__ )
+    __cl_uint8     v8[2];
+#endif
+#if defined( __CL_UINT16__ )
+    __cl_uint16    v16;
+#endif
+}cl_uint16;
+
+/* ---- cl_longn ---- */
+typedef union
+{
+    cl_long  CL_ALIGNED(16) s[2];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+   __extension__ struct{ cl_long  x, y; };
+   __extension__ struct{ cl_long  s0, s1; };
+   __extension__ struct{ cl_long  lo, hi; };
+#endif
+#if defined( __CL_LONG2__) 
+    __cl_long2     v2;
+#endif
+}cl_long2;
+
+typedef union
+{
+    cl_long  CL_ALIGNED(32) s[4];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+   __extension__ struct{ cl_long  x, y, z, w; };
+   __extension__ struct{ cl_long  s0, s1, s2, s3; };
+   __extension__ struct{ cl_long2 lo, hi; };
+#endif
+#if defined( __CL_LONG2__) 
+    __cl_long2     v2[2];
+#endif
+#if defined( __CL_LONG4__) 
+    __cl_long4     v4;
+#endif
+}cl_long4;
+
+/* cl_long3 is identical in size, alignment and behavior to cl_long4. See section 6.1.5. */
+typedef  cl_long4  cl_long3;
+
+typedef union
+{
+    cl_long   CL_ALIGNED(64) s[8];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+   __extension__ struct{ cl_long  x, y, z, w; };
+   __extension__ struct{ cl_long  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __extension__ struct{ cl_long4 lo, hi; };
+#endif
+#if defined( __CL_LONG2__) 
+    __cl_long2     v2[4];
+#endif
+#if defined( __CL_LONG4__) 
+    __cl_long4     v4[2];
+#endif
+#if defined( __CL_LONG8__ )
+    __cl_long8     v8;
+#endif
+}cl_long8;
+
+typedef union
+{
+    cl_long  CL_ALIGNED(128) s[16];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+   __extension__ struct{ cl_long  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __extension__ struct{ cl_long  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __extension__ struct{ cl_long8 lo, hi; };
+#endif
+#if defined( __CL_LONG2__) 
+    __cl_long2     v2[8];
+#endif
+#if defined( __CL_LONG4__) 
+    __cl_long4     v4[4];
+#endif
+#if defined( __CL_LONG8__ )
+    __cl_long8     v8[2];
+#endif
+#if defined( __CL_LONG16__ )
+    __cl_long16    v16;
+#endif
+}cl_long16;
+
+
+/* ---- cl_ulongn ---- */
+typedef union
+{
+    cl_ulong  CL_ALIGNED(16) s[2];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+   __extension__ struct{ cl_ulong  x, y; };
+   __extension__ struct{ cl_ulong  s0, s1; };
+   __extension__ struct{ cl_ulong  lo, hi; };
+#endif
+#if defined( __CL_ULONG2__) 
+    __cl_ulong2     v2;
+#endif
+}cl_ulong2;
+
+typedef union
+{
+    cl_ulong  CL_ALIGNED(32) s[4];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+   __extension__ struct{ cl_ulong  x, y, z, w; };
+   __extension__ struct{ cl_ulong  s0, s1, s2, s3; };
+   __extension__ struct{ cl_ulong2 lo, hi; };
+#endif
+#if defined( __CL_ULONG2__) 
+    __cl_ulong2     v2[2];
+#endif
+#if defined( __CL_ULONG4__) 
+    __cl_ulong4     v4;
+#endif
+}cl_ulong4;
+
+/* cl_ulong3 is identical in size, alignment and behavior to cl_ulong4. See section 6.1.5. */
+typedef  cl_ulong4  cl_ulong3;
+
+typedef union
+{
+    cl_ulong   CL_ALIGNED(64) s[8];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+   __extension__ struct{ cl_ulong  x, y, z, w; };
+   __extension__ struct{ cl_ulong  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __extension__ struct{ cl_ulong4 lo, hi; };
+#endif
+#if defined( __CL_ULONG2__) 
+    __cl_ulong2     v2[4];
+#endif
+#if defined( __CL_ULONG4__) 
+    __cl_ulong4     v4[2];
+#endif
+#if defined( __CL_ULONG8__ )
+    __cl_ulong8     v8;
+#endif
+}cl_ulong8;
+
+typedef union
+{
+    cl_ulong  CL_ALIGNED(128) s[16];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+   __extension__ struct{ cl_ulong  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __extension__ struct{ cl_ulong  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __extension__ struct{ cl_ulong8 lo, hi; };
+#endif
+#if defined( __CL_ULONG2__) 
+    __cl_ulong2     v2[8];
+#endif
+#if defined( __CL_ULONG4__) 
+    __cl_ulong4     v4[4];
+#endif
+#if defined( __CL_ULONG8__ )
+    __cl_ulong8     v8[2];
+#endif
+#if defined( __CL_ULONG16__ )
+    __cl_ulong16    v16;
+#endif
+}cl_ulong16;
+
+
+/* --- cl_floatn ---- */
+
+typedef union
+{
+    cl_float  CL_ALIGNED(8) s[2];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+   __extension__ struct{ cl_float  x, y; };
+   __extension__ struct{ cl_float  s0, s1; };
+   __extension__ struct{ cl_float  lo, hi; };
+#endif
+#if defined( __CL_FLOAT2__) 
+    __cl_float2     v2;
+#endif
+}cl_float2;
+
+typedef union
+{
+    cl_float  CL_ALIGNED(16) s[4];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+   __extension__ struct{ cl_float   x, y, z, w; };
+   __extension__ struct{ cl_float   s0, s1, s2, s3; };
+   __extension__ struct{ cl_float2  lo, hi; };
+#endif
+#if defined( __CL_FLOAT2__) 
+    __cl_float2     v2[2];
+#endif
+#if defined( __CL_FLOAT4__) 
+    __cl_float4     v4;
+#endif
+}cl_float4;
+
+/* cl_float3 is identical in size, alignment and behavior to cl_float4. See section 6.1.5. */
+typedef  cl_float4  cl_float3;
+
+typedef union
+{
+    cl_float   CL_ALIGNED(32) s[8];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+   __extension__ struct{ cl_float   x, y, z, w; };
+   __extension__ struct{ cl_float   s0, s1, s2, s3, s4, s5, s6, s7; };
+   __extension__ struct{ cl_float4  lo, hi; };
+#endif
+#if defined( __CL_FLOAT2__) 
+    __cl_float2     v2[4];
+#endif
+#if defined( __CL_FLOAT4__) 
+    __cl_float4     v4[2];
+#endif
+#if defined( __CL_FLOAT8__ )
+    __cl_float8     v8;
+#endif
+}cl_float8;
+
+typedef union
+{
+    cl_float  CL_ALIGNED(64) s[16];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+   __extension__ struct{ cl_float  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __extension__ struct{ cl_float  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __extension__ struct{ cl_float8 lo, hi; };
+#endif
+#if defined( __CL_FLOAT2__) 
+    __cl_float2     v2[8];
+#endif
+#if defined( __CL_FLOAT4__) 
+    __cl_float4     v4[4];
+#endif
+#if defined( __CL_FLOAT8__ )
+    __cl_float8     v8[2];
+#endif
+#if defined( __CL_FLOAT16__ )
+    __cl_float16    v16;
+#endif
+}cl_float16;
+
+/* --- cl_doublen ---- */
+
+typedef union
+{
+    cl_double  CL_ALIGNED(16) s[2];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+   __extension__ struct{ cl_double  x, y; };
+   __extension__ struct{ cl_double s0, s1; };
+   __extension__ struct{ cl_double lo, hi; };
+#endif
+#if defined( __CL_DOUBLE2__) 
+    __cl_double2     v2;
+#endif
+}cl_double2;
+
+typedef union
+{
+    cl_double  CL_ALIGNED(32) s[4];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+   __extension__ struct{ cl_double  x, y, z, w; };
+   __extension__ struct{ cl_double  s0, s1, s2, s3; };
+   __extension__ struct{ cl_double2 lo, hi; };
+#endif
+#if defined( __CL_DOUBLE2__) 
+    __cl_double2     v2[2];
+#endif
+#if defined( __CL_DOUBLE4__) 
+    __cl_double4     v4;
+#endif
+}cl_double4;
+
+/* cl_double3 is identical in size, alignment and behavior to cl_double4. See section 6.1.5. */
+typedef  cl_double4  cl_double3;
+
+typedef union
+{
+    cl_double   CL_ALIGNED(64) s[8];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+   __extension__ struct{ cl_double  x, y, z, w; };
+   __extension__ struct{ cl_double  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __extension__ struct{ cl_double4 lo, hi; };
+#endif
+#if defined( __CL_DOUBLE2__) 
+    __cl_double2     v2[4];
+#endif
+#if defined( __CL_DOUBLE4__) 
+    __cl_double4     v4[2];
+#endif
+#if defined( __CL_DOUBLE8__ )
+    __cl_double8     v8;
+#endif
+}cl_double8;
+
+typedef union
+{
+    cl_double  CL_ALIGNED(128) s[16];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+   __extension__ struct{ cl_double  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __extension__ struct{ cl_double  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __extension__ struct{ cl_double8 lo, hi; };
+#endif
+#if defined( __CL_DOUBLE2__) 
+    __cl_double2     v2[8];
+#endif
+#if defined( __CL_DOUBLE4__) 
+    __cl_double4     v4[4];
+#endif
+#if defined( __CL_DOUBLE8__ )
+    __cl_double8     v8[2];
+#endif
+#if defined( __CL_DOUBLE16__ )
+    __cl_double16    v16;
+#endif
+}cl_double16;
+
+/* Macro to facilitate debugging 
+ * Usage:
+ *   Place CL_PROGRAM_STRING_DEBUG_INFO on the line before the first line of your source. 
+ *   The first line ends with:   CL_PROGRAM_STRING_DEBUG_INFO \"
+ *   Each line thereafter of OpenCL C source must end with: \n\
+ *   The last line ends in ";
+ *
+ *   Example:
+ *
+ *   const char *my_program = CL_PROGRAM_STRING_DEBUG_INFO "\
+ *   kernel void foo( int a, float * b )             \n\
+ *   {                                               \n\
+ *      // my comment                                \n\
+ *      *b[ get_global_id(0)] = a;                   \n\
+ *   }                                               \n\
+ *   ";
+ *
+ * This should correctly set up the line, (column) and file information for your source 
+ * string so you can do source level debugging.
+ */
+#define  __CL_STRINGIFY( _x )               # _x
+#define  _CL_STRINGIFY( _x )                __CL_STRINGIFY( _x )
+#define  CL_PROGRAM_STRING_DEBUG_INFO       "#line "  _CL_STRINGIFY(__LINE__) " \"" __FILE__ "\" \n\n" 
+  
+#ifdef __cplusplus
+}
+#endif
+
+#endif  /* __CL_PLATFORM_H  */
diff --git a/include/CL/opencl.h b/include/CL/opencl.h
new file mode 100644
index 00000000000..26a63899758
--- /dev/null
+++ b/include/CL/opencl.h
@@ -0,0 +1,54 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2010 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
+
+#ifndef __OPENCL_H
+#define __OPENCL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __APPLE__
+
+#include <OpenCL/cl.h>
+#include <OpenCL/cl_gl.h>
+#include <OpenCL/cl_gl_ext.h>
+#include <OpenCL/cl_ext.h>
+
+#else
+
+#include <CL/cl.h>
+#include <CL/cl_gl.h>
+#include <CL/cl_gl_ext.h>
+#include <CL/cl_ext.h>
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  /* __OPENCL_H   */
+
diff --git a/include/GL/internal/dri_interface.h b/include/GL/internal/dri_interface.h
index da8366652fe..e37917eda99 100644
--- a/include/GL/internal/dri_interface.h
+++ b/include/GL/internal/dri_interface.h
@@ -894,7 +894,7 @@ struct __DRIdri2ExtensionRec {
  * extensions.
  */
 #define __DRI_IMAGE "DRI_IMAGE"
-#define __DRI_IMAGE_VERSION 3
+#define __DRI_IMAGE_VERSION 4
 
 /**
  * These formats correspond to the similarly named MESA_FORMAT_*
@@ -906,10 +906,12 @@ struct __DRIdri2ExtensionRec {
 #define __DRI_IMAGE_FORMAT_XRGB8888     0x1002
 #define __DRI_IMAGE_FORMAT_ARGB8888     0x1003
 #define __DRI_IMAGE_FORMAT_ABGR8888     0x1004
+#define __DRI_IMAGE_FORMAT_XBGR8888     0x1005
 
 #define __DRI_IMAGE_USE_SHARE		0x0001
 #define __DRI_IMAGE_USE_SCANOUT		0x0002
 #define __DRI_IMAGE_USE_CURSOR		0x0004
+#define __DRI_IMAGE_USE_WRITE		0x0008
 
 /**
  * queryImage attributes
@@ -954,6 +956,13 @@ struct __DRIimageExtensionRec {
     * \since 2
     */
    GLboolean (*validateUsage)(__DRIimage *image, unsigned int use);
+
+   /**
+    * Write data into image.
+    *
+    * \since 4
+    */
+   int (*write)(__DRIimage *image, const void *buf, size_t count);
 };
 
 
diff --git a/src/SConscript b/src/SConscript
index ba6be0be1a0..777ad23f2c9 100644
--- a/src/SConscript
+++ b/src/SConscript
@@ -24,7 +24,7 @@ SConscript('mapi/vgapi/SConscript')
 if not env['embedded']:
     if env['platform'] not in ['windows', 'darwin', 'haiku']:
         SConscript('glx/SConscript')
-    if env['platform'] not in ['darwin', 'haiku']:
+    if env['platform'] not in ['darwin', 'haiku', 'sunos']:
         SConscript('egl/main/SConscript')
     if env['platform'] not in ['darwin']:
         SConscript('glu/sgi/SConscript')
diff --git a/src/egl/drivers/dri2/Makefile.am b/src/egl/drivers/dri2/Makefile.am
index e4d4abb966d..49ec06bbec7 100644
--- a/src/egl/drivers/dri2/Makefile.am
+++ b/src/egl/drivers/dri2/Makefile.am
@@ -26,6 +26,7 @@ AM_CFLAGS = \
 	-I$(top_srcdir)/src/gbm/backends/dri \
 	-I$(top_srcdir)/src/egl/wayland/wayland-egl \
 	-I$(top_srcdir)/src/egl/wayland/wayland-drm \
+	-I$(top_builddir)/src/egl/wayland/wayland-drm \
 	$(DEFINES) \
 	$(LIBDRM_CFLAGS) \
 	$(LIBUDEV_CFLAGS) \
diff --git a/src/egl/drivers/dri2/platform_android.c b/src/egl/drivers/dri2/platform_android.c
index 1998941f829..66142cdee5f 100644
--- a/src/egl/drivers/dri2/platform_android.c
+++ b/src/egl/drivers/dri2/platform_android.c
@@ -311,6 +311,8 @@ dri2_create_image_android_native_buffer(_EGLDisplay *disp,
       format = __DRI_IMAGE_FORMAT_ABGR8888;
       break;
    case HAL_PIXEL_FORMAT_RGBX_8888:
+      format = __DRI_IMAGE_FORMAT_XBGR8888;
+      break;
    case HAL_PIXEL_FORMAT_RGB_888:
    case HAL_PIXEL_FORMAT_RGBA_5551:
    case HAL_PIXEL_FORMAT_RGBA_4444:
diff --git a/src/gallium/SConscript b/src/gallium/SConscript
index da2e4dd5ded..b61eba0390e 100644
--- a/src/gallium/SConscript
+++ b/src/gallium/SConscript
@@ -33,8 +33,11 @@ if env['drm']:
         SConscript([
             'drivers/r300/SConscript',
             'drivers/r600/SConscript',
-            'drivers/radeonsi/SConscript',
         ])
+        if env['llvm']:
+            SConscript([
+                'drivers/radeonsi/SConscript',
+            ])
     # XXX: nouveau drivers have a tight dependency on libdrm, so to enable
     # we need some version logic before we enable them. Also, ATM there is
     # no nouveau target in scons
@@ -55,7 +58,7 @@ SConscript('winsys/sw/null/SConscript')
 
 if not env['embedded']:
     SConscript('state_trackers/vega/SConscript')
-    if env['platform'] not in ['darwin', 'haiku']:
+    if env['platform'] not in ['darwin', 'haiku', 'sunos']:
         SConscript('state_trackers/egl/SConscript')
 
     if env['x11']:
@@ -121,7 +124,7 @@ SConscript([
 ])
 
 if not env['embedded']:
-    if env['platform'] not in ['darwin', 'haiku']:
+    if env['platform'] not in ['darwin', 'haiku', 'sunos']:
         SConscript([
             'targets/egl-static/SConscript'
         ])
@@ -153,8 +156,11 @@ if not env['embedded']:
             SConscript([
                 'targets/dri-r300/SConscript',
                 'targets/dri-r600/SConscript',
-                'targets/dri-radeonsi/SConscript',
             ])
+            if env['llvm']:
+                SConscript([
+                    'targets/dri-radeonsi/SConscript',
+                ])
 
     if env['xorg'] and env['drm']:
         SConscript([
diff --git a/src/gallium/auxiliary/Makefile b/src/gallium/auxiliary/Makefile
index 896c058fde9..a70ae7384fb 100644
--- a/src/gallium/auxiliary/Makefile
+++ b/src/gallium/auxiliary/Makefile
@@ -18,6 +18,14 @@ endif
 
 include ../Makefile.template
 
+default install clean: %: subdirs-%
+
+subdirs-%:
+	@for dir in $(SUBDIRS) ; do \
+		if [ -d $$dir ] ; then \
+			(cd $$dir && $(MAKE) $*) || exit 1; \
+		fi \
+	done
 
 indices/u_indices_gen.c: indices/u_indices_gen.py
 	$(PYTHON2) $< > $@
diff --git a/src/gallium/auxiliary/Makefile.sources b/src/gallium/auxiliary/Makefile.sources
index 44cf2d326de..277428b38be 100644
--- a/src/gallium/auxiliary/Makefile.sources
+++ b/src/gallium/auxiliary/Makefile.sources
@@ -1,3 +1,5 @@
+SUBDIRS := pipe-loader
+
 C_SOURCES := \
 	cso_cache/cso_cache.c \
 	cso_cache/cso_context.c \
diff --git a/src/gallium/auxiliary/draw/draw_pipe_aaline.c b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
index ea0a4fb3f25..d6b981195b2 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_aaline.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
@@ -237,12 +237,13 @@ aa_transform_inst(struct tgsi_transform_context *ctx,
       decl = tgsi_default_full_declaration();
       decl.Declaration.File = TGSI_FILE_INPUT;
       /* XXX this could be linear... */
-      decl.Declaration.Interpolate = TGSI_INTERPOLATE_PERSPECTIVE;
+      decl.Declaration.Interpolate = 1;
       decl.Declaration.Semantic = 1;
       decl.Semantic.Name = TGSI_SEMANTIC_GENERIC;
       decl.Semantic.Index = aactx->maxGeneric + 1;
       decl.Range.First = 
       decl.Range.Last = aactx->maxInput + 1;
+      decl.Interp.Interpolate = TGSI_INTERPOLATE_PERSPECTIVE;
       ctx->emit_declaration(ctx, &decl);
 
       /* declare new sampler */
diff --git a/src/gallium/auxiliary/draw/draw_pipe_aapoint.c b/src/gallium/auxiliary/draw/draw_pipe_aapoint.c
index a900dd3ab54..ec703d0b394 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_aapoint.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_aapoint.c
@@ -201,12 +201,13 @@ aa_transform_inst(struct tgsi_transform_context *ctx,
       decl = tgsi_default_full_declaration();
       decl.Declaration.File = TGSI_FILE_INPUT;
       /* XXX this could be linear... */
-      decl.Declaration.Interpolate = TGSI_INTERPOLATE_PERSPECTIVE;
+      decl.Declaration.Interpolate = 1;
       decl.Declaration.Semantic = 1;
       decl.Semantic.Name = TGSI_SEMANTIC_GENERIC;
       decl.Semantic.Index = aactx->maxGeneric + 1;
       decl.Range.First = 
       decl.Range.Last = texInput;
+      decl.Interp.Interpolate = TGSI_INTERPOLATE_PERSPECTIVE;
       ctx->emit_declaration(ctx, &decl);
 
       /* declare new temp regs */
diff --git a/src/gallium/auxiliary/draw/draw_pipe_pstipple.c b/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
index cfb6ef4453a..842f6eeba22 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
@@ -234,12 +234,13 @@ pstip_transform_inst(struct tgsi_transform_context *ctx,
          /* declare new position input reg */
          decl = tgsi_default_full_declaration();
          decl.Declaration.File = TGSI_FILE_INPUT;
-         decl.Declaration.Interpolate = TGSI_INTERPOLATE_LINEAR; /* XXX? */
+         decl.Declaration.Interpolate = 1;
          decl.Declaration.Semantic = 1;
          decl.Semantic.Name = TGSI_SEMANTIC_POSITION;
          decl.Semantic.Index = 0;
          decl.Range.First = 
             decl.Range.Last = wincoordInput;
+         decl.Interp.Interpolate = TGSI_INTERPOLATE_LINEAR; /* XXX? */
          ctx->emit_declaration(ctx, &decl);
       }
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index 4ba4aa54596..3a74790e89f 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -926,7 +926,7 @@ lp_build_sgn(struct lp_build_context *bld,
    }
    else
    {
-      LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
+      LLVMValueRef minus_one = lp_build_const_int_vec(bld->gallivm, type, -1.0);
       cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
       res = lp_build_select(bld, cond, bld->one, minus_one);
    }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_assert.c b/src/gallium/auxiliary/gallivm/lp_bld_assert.c
index 9de5e8e7b51..37c142bd2ae 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_assert.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_assert.c
@@ -29,6 +29,7 @@
 #include "util/u_memory.h"
 #include "lp_bld_assert.h"
 #include "lp_bld_init.h"
+#include "lp_bld_const.h"
 #include "lp_bld_printf.h"
 
 
@@ -55,48 +56,37 @@ lp_assert(int condition, const char *msg)
  * \param condition should be an 'i1' or 'i32' value
  * \param msg  a string to print if the assertion fails.
  */
-LLVMValueRef
+void
 lp_build_assert(struct gallivm_state *gallivm,
                 LLVMValueRef condition,
                 const char *msg)
 {
    LLVMBuilderRef builder = gallivm->builder;
    LLVMContextRef context = gallivm->context;
-   LLVMModuleRef module = gallivm->module;
    LLVMTypeRef arg_types[2];
-   LLVMValueRef msg_string, assert_func, params[2], r;
+   LLVMTypeRef ret_type;
+   LLVMValueRef function;
+   LLVMValueRef args[2];
+   LLVMValueRef msg_string;
 
-   msg_string = lp_build_const_string_variable(module, context,
-                                               msg, strlen(msg) + 1);
+   msg_string = lp_build_const_string(gallivm, msg);
 
+   ret_type = LLVMVoidTypeInContext(context);
    arg_types[0] = LLVMInt32TypeInContext(context);
    arg_types[1] = LLVMPointerType(LLVMInt8TypeInContext(context), 0);
 
-   /* lookup the lp_assert function */
-   assert_func = LLVMGetNamedFunction(module, "lp_assert");
-
-   /* Create the assertion function if not found */
-   if (!assert_func) {
-      LLVMTypeRef func_type =
-         LLVMFunctionType(LLVMVoidTypeInContext(context), arg_types, 2, 0);
-
-      assert_func = LLVMAddFunction(module, "lp_assert", func_type);
-      LLVMSetFunctionCallConv(assert_func, LLVMCCallConv);
-      LLVMSetLinkage(assert_func, LLVMExternalLinkage);
-      LLVMAddGlobalMapping(gallivm->engine, assert_func,
-                           func_to_pointer((func_pointer)lp_assert));
-   }
-   assert(assert_func);
+   function = lp_build_const_func_pointer(gallivm,
+                                          func_to_pointer((func_pointer)lp_assert),
+                                          ret_type, arg_types, Elements(arg_types),
+                                          "assert");
 
    /* build function call param list */
-   params[0] = LLVMBuildZExt(builder, condition, arg_types[0], "");
-   params[1] = LLVMBuildBitCast(builder, msg_string, arg_types[1], "");
+   args[0] = LLVMBuildZExt(builder, condition, arg_types[0], "");
+   args[1] = msg_string;
 
    /* check arg types */
-   assert(LLVMTypeOf(params[0]) == arg_types[0]);
-   assert(LLVMTypeOf(params[1]) == arg_types[1]);
-
-   r = LLVMBuildCall(builder, assert_func, params, 2, "");
+   assert(LLVMTypeOf(args[0]) == arg_types[0]);
+   assert(LLVMTypeOf(args[1]) == arg_types[1]);
 
-   return r;
+   LLVMBuildCall(builder, function, args, Elements(args), "");
 }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_assert.h b/src/gallium/auxiliary/gallivm/lp_bld_assert.h
index 1d2baab30a2..e377b59bbed 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_assert.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_assert.h
@@ -33,7 +33,7 @@
 #include "lp_bld_init.h"
 
 
-LLVMValueRef
+void
 lp_build_assert(struct gallivm_state *gallivm,
                 LLVMValueRef condition,
                 const char *msg);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_const.c b/src/gallium/auxiliary/gallivm/lp_bld_const.c
index 6d8b7c26fc8..f0611b158d9 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_const.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_const.c
@@ -409,3 +409,69 @@ lp_build_const_mask_aos(struct gallivm_state *gallivm,
 
    return LLVMConstVector(masks, type.length);
 }
+
+
+/**
+ * Performs lp_build_const_mask_aos, but first swizzles the mask
+ */
+LLVMValueRef
+lp_build_const_mask_aos_swizzled(struct gallivm_state *gallivm,
+                        struct lp_type type,
+                        unsigned mask,
+                        const unsigned char *swizzle)
+{
+   mask =
+           ((mask & (1 << swizzle[0])) >> swizzle[0])
+        | (((mask & (1 << swizzle[1])) >> swizzle[1]) << 1)
+        | (((mask & (1 << swizzle[2])) >> swizzle[2]) << 2)
+        | (((mask & (1 << swizzle[3])) >> swizzle[3]) << 3);
+
+   return lp_build_const_mask_aos(gallivm, type, mask);
+}
+
+
+/**
+ * Build a zero-terminated constant string.
+ */
+LLVMValueRef
+lp_build_const_string(struct gallivm_state *gallivm,
+                      const char *str)
+{
+   unsigned len = strlen(str) + 1;
+   LLVMTypeRef i8 = LLVMInt8TypeInContext(gallivm->context);
+   LLVMValueRef string = LLVMAddGlobal(gallivm->module, LLVMArrayType(i8, len), "");
+   LLVMSetGlobalConstant(string, TRUE);
+   LLVMSetLinkage(string, LLVMInternalLinkage);
+   LLVMSetInitializer(string, LLVMConstStringInContext(gallivm->context, str, len, TRUE));
+   string = LLVMConstBitCast(string, LLVMPointerType(i8, 0));
+   return string;
+}
+
+
+/**
+ * Build a callable function pointer.
+ *
+ * We use this cast instead of LLVMAddGlobalMapping()
+ * to work around a bug in LLVM 2.6, and for efficiency/simplicity.
+ */
+LLVMValueRef
+lp_build_const_func_pointer(struct gallivm_state *gallivm,
+                            const void *ptr,
+                            LLVMTypeRef ret_type,
+                            LLVMTypeRef *arg_types,
+                            unsigned num_args,
+                            const char *name)
+{
+   LLVMTypeRef function_type;
+   LLVMValueRef function;
+
+   function_type = LLVMFunctionType(ret_type, arg_types, num_args, 0);
+
+   function = lp_build_const_int_pointer(gallivm, ptr);
+
+   function = LLVMBuildBitCast(gallivm->builder, function,
+                               LLVMPointerType(function_type, 0),
+                               name);
+
+   return function;
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_const.h b/src/gallium/auxiliary/gallivm/lp_bld_const.h
index 69718eb4b3d..2205616274f 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_const.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_const.h
@@ -111,6 +111,13 @@ lp_build_const_mask_aos(struct gallivm_state *gallivm,
                         unsigned mask);
 
 
+LLVMValueRef
+lp_build_const_mask_aos_swizzled(struct gallivm_state *gallivm,
+                        struct lp_type type,
+                        unsigned mask,
+                        const unsigned char *swizzle);
+
+
 static INLINE LLVMValueRef
 lp_build_const_int32(struct gallivm_state *gallivm, int i)
 {
@@ -142,5 +149,18 @@ lp_build_const_int_pointer(struct gallivm_state *gallivm, const void *ptr)
 }
 
 
+LLVMValueRef
+lp_build_const_string(struct gallivm_state *gallivm,
+                      const char *str);
+
+
+LLVMValueRef
+lp_build_const_func_pointer(struct gallivm_state *gallivm,
+                            const void *ptr,
+                            LLVMTypeRef ret_type,
+                            LLVMTypeRef *arg_types,
+                            unsigned num_args,
+                            const char *name);
+
 
 #endif /* !LP_BLD_CONST_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_flow.c b/src/gallium/auxiliary/gallivm/lp_bld_flow.c
index a9c9c7af10c..d2b3713ed2d 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_flow.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_flow.c
@@ -265,6 +265,73 @@ lp_build_loop_end(struct lp_build_loop_state *state,
    lp_build_loop_end_cond(state, end, step, LLVMIntNE);
 }
 
+/**
+ * Creates a C-style for loop.
+ * In contrast to lp_build_loop, the condition is checked on entry,
+ * e.g. for(i = start; i cmp_op end; i += step)
+ * \param state      the for loop state, initialized here
+ * \param gallivm    the gallivm state
+ * \param start      starting value of iterator
+ * \param cmp_op     comparison operator used for comparing current value with end value
+ * \param end        value used to compare against iterator
+ * \param step       value added to iterator at end of each loop
+ */
+void
+lp_build_for_loop_begin(struct lp_build_for_loop_state *state,
+                        struct gallivm_state *gallivm,
+                        LLVMValueRef start,
+                        LLVMIntPredicate cmp_op,
+                        LLVMValueRef end,
+                        LLVMValueRef step)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+
+   assert(LLVMTypeOf(start) == LLVMTypeOf(end));
+   assert(LLVMTypeOf(start) == LLVMTypeOf(step));
+
+   state->begin = lp_build_insert_new_block(gallivm, "loop_begin");
+   state->step  = step;
+   state->counter_var = lp_build_alloca(gallivm, LLVMTypeOf(start), "loop_counter");
+   state->gallivm = gallivm;
+   state->cond = cmp_op;
+   state->end = end;
+
+   LLVMBuildStore(builder, start, state->counter_var);
+   LLVMBuildBr(builder, state->begin);
+
+   LLVMPositionBuilderAtEnd(builder, state->begin);
+   state->counter = LLVMBuildLoad(builder, state->counter_var, "");
+
+   state->body = lp_build_insert_new_block(gallivm, "loop_body");
+   LLVMPositionBuilderAtEnd(builder, state->body);
+}
+
+/**
+ * End the for loop.
+ */
+void
+lp_build_for_loop_end(struct lp_build_for_loop_state *state)
+{
+   LLVMValueRef next, cond;
+   LLVMBuilderRef builder = state->gallivm->builder;
+
+   next = LLVMBuildAdd(builder, state->counter, state->step, "");
+   LLVMBuildStore(builder, next, state->counter_var);
+   LLVMBuildBr(builder, state->begin);
+
+   state->exit = lp_build_insert_new_block(state->gallivm, "loop_exit");
+
+   /*
+    * We build the comparison for the begin block here; if we built it
+    * earlier, the output LLVM IR would not be human readable, as the code
+    * produced would not be in the standard begin -> body -> end order.
+    */
+   LLVMPositionBuilderAtEnd(builder, state->begin);
+   cond = LLVMBuildICmp(builder, state->cond, state->counter, state->end, "");
+   LLVMBuildCondBr(builder, cond, state->body, state->exit);
+
+   LLVMPositionBuilderAtEnd(builder, state->exit);
+}
 
 
 /*
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_flow.h b/src/gallium/auxiliary/gallivm/lp_bld_flow.h
index 3cd5a9f42a5..0da849bfe0c 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_flow.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_flow.h
@@ -131,6 +131,33 @@ lp_build_loop_end_cond(struct lp_build_loop_state *state,
                        LLVMIntPredicate cond);
 
 
+/**
+ * Implementation of simple C-style for loops
+ */
+struct lp_build_for_loop_state
+{
+   LLVMBasicBlockRef begin;
+   LLVMBasicBlockRef body;
+   LLVMBasicBlockRef exit;
+   LLVMValueRef counter_var;
+   LLVMValueRef counter;
+   LLVMValueRef step;
+   LLVMIntPredicate cond;
+   LLVMValueRef end;
+   struct gallivm_state *gallivm;
+};
+
+void
+lp_build_for_loop_begin(struct lp_build_for_loop_state *state,
+                        struct gallivm_state *gallivm,
+                        LLVMValueRef start,
+                        LLVMIntPredicate llvm_cond,
+                        LLVMValueRef end,
+                        LLVMValueRef step);
+
+void
+lp_build_for_loop_end(struct lp_build_for_loop_state *state);
+
 
 /**
  * if/else/endif.
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
index 82ab19eda14..e4b8da6bcfd 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
@@ -643,28 +643,18 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
           */
          LLVMTypeRef ret_type;
          LLVMTypeRef arg_types[4];
-         LLVMTypeRef function_type;
 
          ret_type = LLVMVoidTypeInContext(gallivm->context);
          arg_types[0] = pf32t;
          arg_types[1] = pi8t;
          arg_types[2] = i32t;
          arg_types[3] = i32t;
-         function_type = LLVMFunctionType(ret_type, arg_types,
-                                          Elements(arg_types), 0);
 
-         /* Note: we're using this casting here instead of LLVMAddGlobalMapping()
-          * to work around a bug in LLVM 2.6, and for efficiency/simplicity.
-          */
-
-         /* make const pointer for the C fetch_rgba_float function */
-         function = lp_build_const_int_pointer(gallivm,
-            func_to_pointer((func_pointer) format_desc->fetch_rgba_float));
-
-         /* cast the callee pointer to the function's type */
-         function = LLVMBuildBitCast(builder, function,
-                                     LLVMPointerType(function_type, 0),
-                                     "cast callee");
+         function = lp_build_const_func_pointer(gallivm,
+                                                func_to_pointer((func_pointer) format_desc->fetch_rgba_float),
+                                                ret_type,
+                                                arg_types, Elements(arg_types),
+                                                format_desc->short_name);
       }
 
       tmp_ptr = lp_build_alloca(gallivm, f32x4t, "");
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.h b/src/gallium/auxiliary/gallivm/lp_bld_init.h
index f68bf75a851..5fc0f996c64 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.h
@@ -81,5 +81,12 @@ extern LLVMValueRef
 lp_build_load_volatile(LLVMBuilderRef B, LLVMValueRef PointerVal,
                        const char *Name);
 
+void
+lp_set_load_alignment(LLVMValueRef Inst,
+                       unsigned Align);
+
+void
+lp_set_store_alignment(LLVMValueRef Inst,
+		       unsigned Align);
 
 #endif /* !LP_BLD_INIT_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
index 68f8808f3ef..6c4586c4212 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
+++ b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
@@ -165,3 +165,18 @@ lp_build_load_volatile(LLVMBuilderRef B, LLVMValueRef PointerVal,
    return llvm::wrap(llvm::unwrap(B)->CreateLoad(llvm::unwrap(PointerVal), true, Name));
 }
 
+extern "C"
+void
+lp_set_load_alignment(LLVMValueRef Inst,
+                       unsigned Align)
+{
+   llvm::unwrap<llvm::LoadInst>(Inst)->setAlignment(Align);
+}
+
+extern "C"
+void
+lp_set_store_alignment(LLVMValueRef Inst,
+		       unsigned Align)
+{
+   llvm::unwrap<llvm::StoreInst>(Inst)->setAlignment(Align);
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_printf.c b/src/gallium/auxiliary/gallivm/lp_bld_printf.c
index 56ff4269588..5e359ceaa20 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_printf.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_printf.c
@@ -66,19 +66,6 @@ lp_get_printf_arg_count(const char *fmt)
    return count;
 }
 
-LLVMValueRef 
-lp_build_const_string_variable(LLVMModuleRef module,
-                               LLVMContextRef context,
-                               const char *str, int len)
-{
-   LLVMValueRef string = LLVMAddGlobal(module, LLVMArrayType(LLVMInt8TypeInContext(context), len + 1), "");
-   LLVMSetGlobalConstant(string, TRUE);
-   LLVMSetLinkage(string, LLVMInternalLinkage);
-   LLVMSetInitializer(string, LLVMConstStringInContext(context, str, len + 1, TRUE));
-   return string;
-}
- 
-
 /**
  * lp_build_printf.
  *
@@ -94,24 +81,22 @@ lp_build_printf(struct gallivm_state *gallivm, const char *fmt, ...)
    int argcount = lp_get_printf_arg_count(fmt);
    LLVMBuilderRef builder = gallivm->builder;
    LLVMContextRef context = gallivm->context;
-   LLVMModuleRef module = gallivm->module;
    LLVMValueRef params[50];
-   LLVMValueRef fmtarg = lp_build_const_string_variable(module, context,
-                                                        fmt, strlen(fmt) + 1);
-   LLVMValueRef int0 = lp_build_const_int32(gallivm, 0);
-   LLVMValueRef index[2];
-   LLVMValueRef func_printf = LLVMGetNamedFunction(module, "printf");
+   LLVMValueRef fmtarg = lp_build_const_string(gallivm, fmt);
+   LLVMTypeRef printf_type;
+   LLVMValueRef func_printf;
 
    assert(Elements(params) >= argcount + 1);
 
-   index[0] = index[1] = int0;
+   printf_type = LLVMFunctionType(LLVMIntTypeInContext(context, 32), NULL, 0, 1);
 
-   if (!func_printf) {
-      LLVMTypeRef printf_type = LLVMFunctionType(LLVMIntTypeInContext(context, 32), NULL, 0, 1);
-      func_printf = LLVMAddFunction(module, "printf", printf_type);
-   }
+   func_printf = lp_build_const_int_pointer(gallivm, func_to_pointer((func_pointer)debug_printf));
+
+   func_printf = LLVMBuildBitCast(gallivm->builder, func_printf,
+                                  LLVMPointerType(printf_type, 0),
+                                  "debug_printf");
 
-   params[0] = LLVMBuildGEP(builder, fmtarg, index, 2, "");
+   params[0] = fmtarg;
 
    va_start(arglist, fmt);
    for (i = 1; i <= argcount; i++) {
@@ -170,3 +155,30 @@ lp_build_print_ivec4(struct gallivm_state *gallivm,
    util_snprintf(format, sizeof(format), "%s %%i %%i %%i %%i\n", msg);
    return lp_build_printf(gallivm, format, x, y, z, w);
 }
+
+
+/**
+ * Print a uint8[16] vector.
+ */
+LLVMValueRef
+lp_build_print_uvec16(struct gallivm_state *gallivm,
+                    const char *msg, LLVMValueRef vec)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   char format[1000];
+   LLVMValueRef args[16];
+   int i;
+
+   for (i = 0; i < 16; ++i) {
+      args[i] = LLVMBuildExtractElement(builder, vec, lp_build_const_int32(gallivm, i), "");
+   }
+
+   util_snprintf(format, sizeof(format), "%s %%u %%u %%u %%u %%u %%u %%u %%u %%u %%u %%u %%u %%u %%u %%u %%u\n", msg);
+
+   return lp_build_printf(
+            gallivm, format,
+            args[ 0], args[ 1], args[ 2], args[ 3],
+            args[ 4], args[ 5], args[ 6], args[ 7],
+            args[ 8], args[ 9], args[10], args[11],
+            args[12], args[13], args[14], args[15]);
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_printf.h b/src/gallium/auxiliary/gallivm/lp_bld_printf.h
index 79db74d8886..7a2b26d41f4 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_printf.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_printf.h
@@ -34,12 +34,9 @@
 #include "lp_bld_init.h"
 
 
-LLVMValueRef lp_build_const_string_variable(LLVMModuleRef module,
-                                            LLVMContextRef context,
-                                            const char *str, int len);
-
-LLVMValueRef lp_build_printf(struct gallivm_state *gallivm,
-                             const char *fmt, ...);
+LLVMValueRef
+lp_build_printf(struct gallivm_state *gallivm,
+                const char *fmt, ...);
 
 LLVMValueRef
 lp_build_print_vec4(struct gallivm_state *gallivm,
@@ -49,5 +46,9 @@ LLVMValueRef
 lp_build_print_ivec4(struct gallivm_state *gallivm,
                      const char *msg, LLVMValueRef vec);
 
+LLVMValueRef
+lp_build_print_uvec16(struct gallivm_state *gallivm,
+                     const char *msg, LLVMValueRef vec);
+
 #endif
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index 07f4f3bf6b8..c6d4f1bcc28 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -349,7 +349,7 @@ lp_build_brilinear_lod(struct lp_build_context *bld,
  * Combined log2 and brilinear lod computation.
  *
  * It's in all identical to calling lp_build_fast_log2() and
- * lp_build_brilinear_lod() above, but by combining we can compute the interger
+ * lp_build_brilinear_lod() above, but by combining we can compute the integer
  * and fractional part independently.
  */
 static void
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_struct.c b/src/gallium/auxiliary/gallivm/lp_bld_struct.c
index 0dc2f24d10a..cc248d15e97 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_struct.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_struct.c
@@ -146,6 +146,25 @@ lp_build_pointer_get(LLVMBuilderRef builder,
 }
 
 
+LLVMValueRef
+lp_build_pointer_get_unaligned(LLVMBuilderRef builder,
+                               LLVMValueRef ptr,
+                               LLVMValueRef index,
+                               unsigned alignment)
+{
+   LLVMValueRef element_ptr;
+   LLVMValueRef res;
+   assert(LLVMGetTypeKind(LLVMTypeOf(ptr)) == LLVMPointerTypeKind);
+   element_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
+   res = LLVMBuildLoad(builder, element_ptr, "");
+   lp_set_load_alignment(res, alignment);
+#ifdef DEBUG
+   lp_build_name(res, "%s[%s]", LLVMGetValueName(ptr), LLVMGetValueName(index));
+#endif
+   return res;
+}
+
+
 void
 lp_build_pointer_set(LLVMBuilderRef builder,
                      LLVMValueRef ptr,
@@ -156,3 +175,18 @@ lp_build_pointer_set(LLVMBuilderRef builder,
    element_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
    LLVMBuildStore(builder, value, element_ptr);
 }
+
+
+void
+lp_build_pointer_set_unaligned(LLVMBuilderRef builder,
+                               LLVMValueRef ptr,
+                               LLVMValueRef index,
+                               LLVMValueRef value,
+                               unsigned alignment)
+{
+   LLVMValueRef element_ptr;
+   LLVMValueRef instr;
+   element_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
+   instr = LLVMBuildStore(builder, value, element_ptr);
+   lp_set_store_alignment(instr, alignment);
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_struct.h b/src/gallium/auxiliary/gallivm/lp_bld_struct.h
index 11605c685f0..6b7b4f2a6bf 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_struct.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_struct.h
@@ -104,6 +104,18 @@ lp_build_pointer_get(LLVMBuilderRef builder,
                    LLVMValueRef ptr,
                    LLVMValueRef index);
 
+/**
+ * Get the value of an array element, with explicit alignment.
+ *
+ * If the element size is different from the alignment this will
+ * cause llvm to emit an unaligned load
+ */
+LLVMValueRef
+lp_build_pointer_get_unaligned(LLVMBuilderRef builder,
+                               LLVMValueRef ptr,
+                               LLVMValueRef index,
+                               unsigned alignment);
+
 /**
  * Set the value of an array element.
  */
@@ -113,4 +125,17 @@ lp_build_pointer_set(LLVMBuilderRef builder,
                      LLVMValueRef index,
                      LLVMValueRef value);
 
+/**
+ * Set the value of an array element, with explicit alignment.
+ *
+ * If the element size is different from the alignment this will
+ * cause llvm to emit an unaligned store
+ */
+void
+lp_build_pointer_set_unaligned(LLVMBuilderRef builder,
+                               LLVMValueRef ptr,
+                               LLVMValueRef index,
+                               LLVMValueRef value,
+                               unsigned alignment);
+
 #endif /* !LP_BLD_STRUCT_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
index 45bbf81eb29..680c85f843c 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
@@ -215,8 +215,6 @@ lp_build_tgsi_inst_llvm(
    case TGSI_OPCODE_PUSHA:
    case TGSI_OPCODE_POPA:
    case TGSI_OPCODE_SAD:
-   case TGSI_OPCODE_TXF:
-   case TGSI_OPCODE_TXQ:
       /* deprecated? */
       assert(0);
       return FALSE;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
index 888221d4d64..773c679a4d8 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
@@ -121,6 +121,11 @@ struct lp_tgsi_info
     */
    unsigned indirect_textures:1;
 
+   /*
+    * Whether any immediate values are outside the range [0, 1]
+    */
+   unsigned unclamped_immediates:1;
+
    /*
     * Texture opcode description. Aimed at detecting and described direct
     * texture opcodes.
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
index 7f011563264..d278444ce90 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
@@ -1007,6 +1007,17 @@ i2f_emit_cpu(
                                                               emit_data->args[0]);
 }
 
+/* TGSI_OPCODE_IABS (CPU Only) */
+static void
+iabs_emit_cpu(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   emit_data->output[emit_data->chan] = lp_build_abs(&bld_base->int_bld,
+                                                       emit_data->args[0]);
+}
+
 /* TGSI_OPCODE_IDIV (CPU Only) */
 static void
 idiv_emit_cpu(
@@ -1101,6 +1112,18 @@ islt_emit_cpu(
    iset_emit_cpu(action, bld_base, emit_data, PIPE_FUNC_LESS);
 }
 
+
+/* TGSI_OPCODE_ISSG (CPU Only) */
+static void
+issg_emit_cpu(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   emit_data->output[emit_data->chan] = lp_build_sgn(&bld_base->int_bld,
+                                                       emit_data->args[0]);
+}
+
 /* TGSI_OPCODE_LG2 (CPU Only) */
 static void
 lg2_emit_cpu(
@@ -1541,6 +1564,7 @@ lp_set_default_actions_cpu(
    bld_base->op_actions[TGSI_OPCODE_FLR].emit = flr_emit_cpu;
 
    bld_base->op_actions[TGSI_OPCODE_I2F].emit = i2f_emit_cpu;
+   bld_base->op_actions[TGSI_OPCODE_IABS].emit = iabs_emit_cpu;
    bld_base->op_actions[TGSI_OPCODE_IDIV].emit = idiv_emit_cpu;
    bld_base->op_actions[TGSI_OPCODE_INEG].emit = ineg_emit_cpu;
    bld_base->op_actions[TGSI_OPCODE_IMAX].emit = imax_emit_cpu;
@@ -1548,6 +1572,7 @@ lp_set_default_actions_cpu(
    bld_base->op_actions[TGSI_OPCODE_ISGE].emit = isge_emit_cpu;
    bld_base->op_actions[TGSI_OPCODE_ISHR].emit = ishr_emit_cpu;
    bld_base->op_actions[TGSI_OPCODE_ISLT].emit = islt_emit_cpu;
+   bld_base->op_actions[TGSI_OPCODE_ISSG].emit = issg_emit_cpu;
 
    bld_base->op_actions[TGSI_OPCODE_LG2].emit = lg2_emit_cpu;
    bld_base->op_actions[TGSI_OPCODE_LOG].emit = log_emit_cpu;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c
index 80c148124ee..24bc13a9be8 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c
@@ -325,8 +325,10 @@ lp_emit_store_aos(
    if (reg->Register.WriteMask != TGSI_WRITEMASK_XYZW) {
       LLVMValueRef writemask;
 
-      writemask = lp_build_const_mask_aos(bld->bld_base.base.gallivm, bld->bld_base.base.type,
-                                          reg->Register.WriteMask);
+      writemask = lp_build_const_mask_aos_swizzled(bld->bld_base.base.gallivm,
+                                                   bld->bld_base.base.type,
+                                                   reg->Register.WriteMask,
+                                                   bld->swizzles);
 
       if (mask) {
          mask = LLVMBuildAnd(builder, mask, writemask, "");
@@ -1089,6 +1091,7 @@ lp_build_tgsi_aos(struct gallivm_state *gallivm,
       debug_printf("2222222222222222222222222222 \n");
    }
    tgsi_parse_free(&parse);
+   FREE(bld.bld_base.instructions);
 
    if (0) {
       LLVMModuleRef module = LLVMGetGlobalParent(
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c
index 3373ed4426d..ab393ed942a 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c
@@ -442,8 +442,12 @@ lp_build_tgsi_info(const struct tgsi_token *tokens,
             assert(size <= 4);
             if (ctx.num_imms < Elements(ctx.imm)) {
                for (chan = 0; chan < size; ++chan) {
-                  ctx.imm[ctx.num_imms][chan] =
-                        parse.FullToken.FullImmediate.u[chan].Float;
+                  float value = parse.FullToken.FullImmediate.u[chan].Float;
+                  ctx.imm[ctx.num_imms][chan] = value;
+
+                  if (value < 0.0f || value > 1.0f) {
+                     info->unclamped_immediates = TRUE;
+                  }
                }
                ++ctx.num_imms;
             }
diff --git a/src/gallium/auxiliary/os/os_thread.h b/src/gallium/auxiliary/os/os_thread.h
index 3e1c273027b..3773be945dd 100644
--- a/src/gallium/auxiliary/os/os_thread.h
+++ b/src/gallium/auxiliary/os/os_thread.h
@@ -44,6 +44,7 @@
 
 #include <pthread.h> /* POSIX threads headers */
 #include <stdio.h> /* for perror() */
+#include <signal.h>
 
 
 /* pipe_thread
diff --git a/src/gallium/auxiliary/pipe-loader/Makefile.am b/src/gallium/auxiliary/pipe-loader/Makefile.am
new file mode 100644
index 00000000000..c63dce3fe4a
--- /dev/null
+++ b/src/gallium/auxiliary/pipe-loader/Makefile.am
@@ -0,0 +1,38 @@
+AUTOMAKE_OPTIONS = subdir-objects
+
+AM_CPPFLAGS = $(DEFINES) \
+	$(GALLIUM_PIPE_LOADER_DEFINES) \
+	-I$(top_srcdir)/include \
+	-I$(top_srcdir)/src/gallium/include \
+	-I$(top_srcdir)/src/gallium/auxiliary \
+	-I$(top_srcdir)/src/gallium/winsys
+
+AM_CFLAGS = $(PIC_FLAGS)
+
+noinst_LTLIBRARIES =
+
+if HAVE_LOADER_GALLIUM
+noinst_LTLIBRARIES += libpipe_loader.la
+
+libpipe_loader_la_SOURCES = \
+	pipe_loader.h \
+	pipe_loader_priv.h \
+	pipe_loader.c \
+	pipe_loader_sw.c
+
+if HAVE_DRM_LOADER_GALLIUM
+libpipe_loader_la_SOURCES += pipe_loader_drm.c
+AM_CFLAGS += $(LIBDRM_CFLAGS)
+endif
+
+# Provide compatibility with scripts for the old Mesa build system for
+# a while by putting a link to the library in the current directory.
+all-local: libpipe_loader.la
+	ln -f .libs/libpipe_loader.a .
+
+clean-local:
+	rm -f libpipe_loader.a
+endif
+
+# FIXME: Remove when the rest of Gallium is converted to automake.
+default: all
diff --git a/src/gallium/auxiliary/pipe-loader/pipe_loader.c b/src/gallium/auxiliary/pipe-loader/pipe_loader.c
new file mode 100644
index 00000000000..6a10ac3998d
--- /dev/null
+++ b/src/gallium/auxiliary/pipe-loader/pipe_loader.c
@@ -0,0 +1,102 @@
+/**************************************************************************
+ *
+ * Copyright 2012 Francisco Jerez
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "pipe_loader_priv.h"
+
+#include "util/u_inlines.h"
+#include "util/u_memory.h"
+#include "util/u_string.h"
+#include "util/u_dl.h"
+
+#define MODULE_PREFIX "pipe_"
+
+static int (*backends[])(struct pipe_loader_device **, int) = {
+#ifdef HAVE_PIPE_LOADER_DRM
+   &pipe_loader_drm_probe,
+#endif
+   &pipe_loader_sw_probe
+};
+
+int
+pipe_loader_probe(struct pipe_loader_device **devs, int ndev)
+{
+   int i, n = 0;
+
+   for (i = 0; i < Elements(backends); i++)
+      n += backends[i](&devs[n], MAX2(0, ndev - n));
+
+   return n;
+}
+
+void
+pipe_loader_release(struct pipe_loader_device **devs, int ndev)
+{
+   int i;
+
+   for (i = 0; i < ndev; i++)
+      devs[i]->ops->release(&devs[i]);
+}
+
+struct pipe_screen *
+pipe_loader_create_screen(struct pipe_loader_device *dev,
+                          const char *library_paths)
+{
+   return dev->ops->create_screen(dev, library_paths);
+}
+
+struct util_dl_library *
+pipe_loader_find_module(struct pipe_loader_device *dev,
+                        const char *library_paths)
+{
+   struct util_dl_library *lib;
+   const char *next;
+   char path[PATH_MAX];
+   int len, ret;
+
+   for (next = library_paths; *next; library_paths = next + 1) {
+      next = util_strchrnul(library_paths, ':');
+      len = next - library_paths;
+
+      if (len)
+         ret = util_snprintf(path, sizeof(path), "%.*s/%s%s%s",
+                             len, library_paths,
+                             MODULE_PREFIX, dev->driver_name, UTIL_DL_EXT);
+      else
+         ret = util_snprintf(path, sizeof(path), "%s%s%s",
+                             MODULE_PREFIX, dev->driver_name, UTIL_DL_EXT);
+
+      if (ret > 0 && ret < sizeof(path)) {
+         lib = util_dl_open(path);
+         if (lib) {
+            debug_printf("loaded %s\n", path);
+            return lib;
+         }
+      }
+   }
+
+   return NULL;
+}
diff --git a/src/gallium/auxiliary/pipe-loader/pipe_loader.h b/src/gallium/auxiliary/pipe-loader/pipe_loader.h
new file mode 100644
index 00000000000..e41969458dd
--- /dev/null
+++ b/src/gallium/auxiliary/pipe-loader/pipe_loader.h
@@ -0,0 +1,144 @@
+/**************************************************************************
+ *
+ * Copyright 2012 Francisco Jerez
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * \file Library that provides device enumeration and creation of
+ * winsys/pipe_screen instances.
+ */
+
+#ifndef PIPE_LOADER_H
+#define PIPE_LOADER_H
+
+#include "pipe/p_compiler.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct pipe_screen;
+
+enum pipe_loader_device_type {
+   PIPE_LOADER_DEVICE_SOFTWARE, /**< Device from the software backend */
+   PIPE_LOADER_DEVICE_PCI,      /**< Device with valid \a pci ids */
+   NUM_PIPE_LOADER_DEVICE_TYPES /**< Presumably a count, not a device type */
+};
+
+/**
+ * A device known to the pipe loader.
+ */
+struct pipe_loader_device {
+   enum pipe_loader_device_type type;
+
+   union {
+      struct {
+         int vendor_id; /**< PCI vendor id */
+         int chip_id;   /**< PCI device (chip) id */
+      } pci;
+   }; /**< Discriminated by \a type */
+
+   const char *driver_name; /**< Used to build the driver module file name */
+   const struct pipe_loader_ops *ops; /**< Backend callbacks */
+};
+
+/**
+ * Get a list of known devices.
+ *
+ * \param devs Array that will be filled with pointers to the devices
+ *             available in the system.
+ * \param ndev Maximum number of devices to return (capacity of \a devs).
+ * \return Number of devices available in the system; may exceed \a ndev.
+ */
+int
+pipe_loader_probe(struct pipe_loader_device **devs, int ndev);
+
+/**
+ * Create a pipe_screen for the specified device.
+ *
+ * \param dev Device the screen will be created for.
+ * \param library_paths Colon-separated list of filesystem paths searched
+ *                      for the pipe driver module that handles this device.
+ * \return The new screen, or NULL (e.g. no usable driver module was found).
+ */
+struct pipe_screen *
+pipe_loader_create_screen(struct pipe_loader_device *dev,
+                          const char *library_paths);
+
+/**
+ * Release resources allocated for a list of devices.
+ *
+ * Should be called when the specified devices are no longer in use to
+ * release any resources allocated by pipe_loader_probe.
+ *
+ * \param devs Devices to release; each entry is reset to NULL.
+ * \param ndev Number of devices to release.
+ */
+void
+pipe_loader_release(struct pipe_loader_device **devs, int ndev);
+
+#ifdef HAVE_PIPE_LOADER_SW
+
+/**
+ * Get a list of known software devices, one per software winsys backend.
+ *
+ * This function is platform-specific.
+ *
+ * \sa pipe_loader_probe
+ */
+int
+pipe_loader_sw_probe(struct pipe_loader_device **devs, int ndev);
+
+#endif
+
+#ifdef HAVE_PIPE_LOADER_DRM
+
+/**
+ * Get a list of known DRM devices by trying every DRM minor device node.
+ *
+ * This function is platform-specific.
+ *
+ * \sa pipe_loader_probe
+ */
+int
+pipe_loader_drm_probe(struct pipe_loader_device **devs, int ndev);
+
+/**
+ * Initialize a DRM device in an already opened fd.
+ *
+ * This function is platform-specific.  On success the device takes
+ * ownership of \a fd (release closes it); on failure it is left open.
+ * \sa pipe_loader_probe
+ */
+boolean
+pipe_loader_drm_probe_fd(struct pipe_loader_device **dev, int fd);
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* PIPE_LOADER_H */
diff --git a/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c b/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c
new file mode 100644
index 00000000000..7a7e9942f3b
--- /dev/null
+++ b/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c
@@ -0,0 +1,218 @@
+/**************************************************************************
+ *
+ * Copyright 2011 Intel Corporation
+ * Copyright 2012 Francisco Jerez
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Kristian Høgsberg <krh@bitplanet.net>
+ *    Benjamin Franzke <benjaminfranzke@googlemail.com>
+ *
+ **************************************************************************/
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <libudev.h>
+#include <xf86drm.h>
+
+#include "state_tracker/drm_driver.h"
+#include "pipe_loader_priv.h"
+
+#include "util/u_memory.h"
+#include "util/u_dl.h"
+#include "util/u_debug.h"
+
+#define DRIVER_MAP_GALLIUM_ONLY
+#include "pci_ids/pci_id_driver_map.h"
+
+struct pipe_loader_drm_device {
+   struct pipe_loader_device base; /* first member, so base casts to this */
+   struct util_dl_library *lib;    /* driver module, set by create_screen */
+   int fd;                         /* DRM fd, closed on release */
+};
+
+#define pipe_loader_drm_device(dev) ((struct pipe_loader_drm_device *)dev)
+
+/* Look up the PCI vendor/chip id of ddev->fd through udev. */
+static boolean
+find_drm_pci_id(struct pipe_loader_drm_device *ddev)
+{
+   struct udev *udev = NULL;
+   struct udev_device *parent, *device = NULL;
+   struct stat stat;
+   const char *pci_id;
+   boolean found = FALSE;
+
+   if (fstat(ddev->fd, &stat) < 0)
+      goto out;
+
+   udev = udev_new();
+   if (!udev)
+      goto out;
+
+   device = udev_device_new_from_devnum(udev, 'c', stat.st_rdev);
+   if (!device)
+      goto out;
+
+   parent = udev_device_get_parent(device);
+   if (!parent)
+      goto out;
+
+   pci_id = udev_device_get_property_value(parent, "PCI_ID");
+   found = pci_id && sscanf(pci_id, "%x:%x", &ddev->base.pci.vendor_id,
+                            &ddev->base.pci.chip_id) == 2;
+
+  out:
+   /* Shared by success and failure, so the udev objects never leak. */
+   if (device)
+      udev_device_unref(device);
+   if (udev)
+      udev_unref(udev);
+   if (!found)
+      debug_printf("pci id for fd %d not found\n", ddev->fd);
+   return found;
+}
+
+/* Pick the pipe driver for the device's PCI id from driver_map. */
+static boolean
+find_drm_driver_name(struct pipe_loader_drm_device *ddev)
+{
+   struct pipe_loader_device *dev = &ddev->base;
+   const char *name = NULL;
+   int m, c;
+
+   /* The first entry of the right vendor that either accepts any chip
+    * (num_chips_ids == -1) or lists this chip id wins. */
+   for (m = 0; !name && driver_map[m].driver; m++) {
+      if (driver_map[m].vendor_id != dev->pci.vendor_id)
+         continue;
+
+      if (driver_map[m].num_chips_ids == -1)
+         name = driver_map[m].driver;
+
+      for (c = 0; !name && c < driver_map[m].num_chips_ids; c++)
+         if (driver_map[m].chip_ids[c] == dev->pci.chip_id)
+            name = driver_map[m].driver;
+   }
+
+   if (!name)
+      return FALSE;
+
+   dev->driver_name = name;
+   debug_printf("driver for %04x:%04x: %s\n", dev->pci.vendor_id,
+                dev->pci.chip_id, dev->driver_name);
+   return TRUE;
+}
+
+static struct pipe_loader_ops pipe_loader_drm_ops;
+
+boolean
+pipe_loader_drm_probe_fd(struct pipe_loader_device **dev, int fd)
+{
+   struct pipe_loader_drm_device *ddev = CALLOC_STRUCT(pipe_loader_drm_device);
+
+   if (!ddev) /* out of memory */
+      return FALSE;
+
+   ddev->base.type = PIPE_LOADER_DEVICE_PCI;
+   ddev->base.ops = &pipe_loader_drm_ops;
+   ddev->fd = fd;
+
+   /* Both lookups must succeed.  On failure fd is not closed here: it
+    * still belongs to the caller. */
+   if (!find_drm_pci_id(ddev) || !find_drm_driver_name(ddev)) {
+      FREE(ddev);
+      return FALSE;
+   }
+
+   *dev = &ddev->base;
+   return TRUE;
+}
+
+static int
+open_drm_minor(int minor)
+{
+   char path[PATH_MAX]; /* device node path built from the DRM name macros */
+   snprintf(path, sizeof(path), DRM_DEV_NAME, DRM_DIR_NAME, minor);
+   return open(path, O_RDWR, 0); /* caller treats a negative fd as failure */
+}
+
+int
+pipe_loader_drm_probe(struct pipe_loader_device **devs, int ndev)
+{
+   int i, j, fd, stored;
+
+   for (i = 0, j = 0; i < DRM_MAX_MINOR; i++) {
+      fd = open_drm_minor(i);
+      if (fd < 0)
+         continue;
+      /* A failed probe must not consume slot j; past ndev, count only. */
+      stored = j < ndev && pipe_loader_drm_probe_fd(&devs[j], fd);
+      if (!stored)
+         close(fd); /* otherwise the new device owns fd */
+      j += stored || j >= ndev;
+   }
+
+   return j;
+}
+
+static void
+pipe_loader_drm_release(struct pipe_loader_device **dev)
+{
+   struct pipe_loader_drm_device *drm = pipe_loader_drm_device(*dev);
+
+   *dev = NULL; /* the caller's handle is dead from here on */
+   if (drm->lib != NULL)
+      util_dl_close(drm->lib); /* module opened by create_screen */
+
+   close(drm->fd);
+   FREE(drm);
+}
+
+static struct pipe_screen *
+pipe_loader_drm_create_screen(struct pipe_loader_device *dev,
+                              const char *library_paths)
+{
+   struct pipe_loader_drm_device *drm = pipe_loader_drm_device(dev);
+   const struct drm_driver_descriptor *desc;
+
+   if (drm->lib == NULL) { /* opened on first use, closed on release */
+      drm->lib = pipe_loader_find_module(dev, library_paths);
+      if (drm->lib == NULL)
+         return NULL;
+   }
+
+   desc = (const struct drm_driver_descriptor *)
+      util_dl_get_proc_address(drm->lib, "driver_descriptor");
+
+   /* Reject a module without descriptor or one built for another driver. */
+   if (desc == NULL || strcmp(desc->name, drm->base.driver_name) != 0)
+      return NULL;
+   return desc->create_screen(drm->fd);
+}
+
+static struct pipe_loader_ops pipe_loader_drm_ops = { /* DRM backend hooks */
+   .create_screen = pipe_loader_drm_create_screen,
+   .release = pipe_loader_drm_release
+};
diff --git a/src/gallium/auxiliary/pipe-loader/pipe_loader_priv.h b/src/gallium/auxiliary/pipe-loader/pipe_loader_priv.h
new file mode 100644
index 00000000000..0be833a0f3f
--- /dev/null
+++ b/src/gallium/auxiliary/pipe-loader/pipe_loader_priv.h
@@ -0,0 +1,47 @@
+/**************************************************************************
+ *
+ * Copyright 2012 Francisco Jerez
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef PIPE_LOADER_PRIV_H
+#define PIPE_LOADER_PRIV_H
+
+#include "pipe_loader.h"
+
+struct pipe_loader_ops { /* callbacks each loader backend provides */
+   struct pipe_screen *(*create_screen)(struct pipe_loader_device *dev,
+                                        const char *library_paths);
+   /* Frees the device; the DRM and software backends also NULL *dev. */
+   void (*release)(struct pipe_loader_device **dev);
+};
+
+/**
+ * Open the pipe driver module for \a dev; NULL if no search path has one.
+ */
+struct util_dl_library *
+pipe_loader_find_module(struct pipe_loader_device *dev,
+                        const char *library_paths);
+
+#endif /* PIPE_LOADER_PRIV_H */
diff --git a/src/gallium/auxiliary/pipe-loader/pipe_loader_sw.c b/src/gallium/auxiliary/pipe-loader/pipe_loader_sw.c
new file mode 100644
index 00000000000..c2b78c636a7
--- /dev/null
+++ b/src/gallium/auxiliary/pipe-loader/pipe_loader_sw.c
@@ -0,0 +1,107 @@
+/**************************************************************************
+ *
+ * Copyright 2012 Francisco Jerez
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "pipe_loader_priv.h"
+
+#include "util/u_memory.h"
+#include "util/u_dl.h"
+#include "sw/null/null_sw_winsys.h"
+#include "target-helpers/inline_sw_helper.h"
+#include "state_tracker/xlib_sw_winsys.h"
+
+struct pipe_loader_sw_device {
+   struct pipe_loader_device base; /* first member, so base casts to this */
+   struct util_dl_library *lib;    /* driver module, set by create_screen */
+   struct sw_winsys *ws;           /* winsys handed to the screen factory */
+};
+
+#define pipe_loader_sw_device(dev) ((struct pipe_loader_sw_device *)dev)
+
+static struct pipe_loader_ops pipe_loader_sw_ops;
+
+static struct sw_winsys *(*backends[])() = { /* one sw device per entry */
+#ifdef HAVE_WINSYS_XLIB
+   x11_sw_create,
+#endif
+   null_sw_create
+};
+
+int
+pipe_loader_sw_probe(struct pipe_loader_device **devs, int ndev)
+{
+   int i;
+
+   for (i = 0; i < Elements(backends) && i < ndev; i++) {
+      struct pipe_loader_sw_device *sdev = CALLOC_STRUCT(pipe_loader_sw_device);
+
+      if (!sdev)
+         return i; /* out of memory: devs[0..i-1] are still valid */
+      sdev->base.type = PIPE_LOADER_DEVICE_SOFTWARE;
+      sdev->base.driver_name = "swrast";
+      sdev->base.ops = &pipe_loader_sw_ops;
+      sdev->ws = backends[i]();
+      devs[i] = &sdev->base;
+   }
+
+   return Elements(backends); /* devices available; may exceed ndev */
+}
+
+static void
+pipe_loader_sw_release(struct pipe_loader_device **dev)
+{
+   struct pipe_loader_sw_device *sw = pipe_loader_sw_device(*dev);
+
+   *dev = NULL; /* the caller's handle is dead from here on */
+   if (sw->lib != NULL)
+      util_dl_close(sw->lib); /* module opened by create_screen */
+
+   FREE(sw); /* sw->ws is not destroyed here */
+}
+
+static struct pipe_screen *
+pipe_loader_sw_create_screen(struct pipe_loader_device *dev,
+                             const char *library_paths)
+{
+   struct pipe_loader_sw_device *sw = pipe_loader_sw_device(dev);
+   struct pipe_screen *(*create)(struct sw_winsys *);
+
+   if (sw->lib == NULL) { /* opened on first use, closed on release */
+      sw->lib = pipe_loader_find_module(dev, library_paths);
+      if (sw->lib == NULL)
+         return NULL;
+   }
+
+   /* The module's screen constructor is looked up by symbol name. */
+   create = (void *)util_dl_get_proc_address(sw->lib, "swrast_create_screen");
+
+   return create ? create(sw->ws) : NULL;
+}
+
+static struct pipe_loader_ops pipe_loader_sw_ops = { /* software backend */
+   .create_screen = pipe_loader_sw_create_screen,
+   .release = pipe_loader_sw_release
+};
diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.c b/src/gallium/auxiliary/tgsi/tgsi_build.c
index 6ec2b0d8f21..1c24b9bdbed 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_build.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_build.c
@@ -104,12 +104,11 @@ tgsi_default_declaration( void )
    declaration.NrTokens = 1;
    declaration.File = TGSI_FILE_NULL;
    declaration.UsageMask = TGSI_WRITEMASK_XYZW;
-   declaration.Interpolate = TGSI_INTERPOLATE_CONSTANT;
+   declaration.Interpolate = 0;
    declaration.Dimension = 0;
    declaration.Semantic = 0;
-   declaration.Centroid = 0;
    declaration.Invariant = 0;
-   declaration.CylindricalWrap = 0;
+   declaration.Local = 0;
 
    return declaration;
 }
@@ -121,9 +120,8 @@ tgsi_build_declaration(
    unsigned interpolate,
    unsigned dimension,
    unsigned semantic,
-   unsigned centroid,
    unsigned invariant,
-   unsigned cylindrical_wrap,
+   unsigned local,
    struct tgsi_header *header )
 {
    struct tgsi_declaration declaration;
@@ -137,9 +135,8 @@ tgsi_build_declaration(
    declaration.Interpolate = interpolate;
    declaration.Dimension = dimension;
    declaration.Semantic = semantic;
-   declaration.Centroid = centroid;
    declaration.Invariant = invariant;
-   declaration.CylindricalWrap = cylindrical_wrap;
+   declaration.Local = local;
 
    header_bodysize_grow( header );
 
@@ -194,6 +191,36 @@ tgsi_build_declaration_dimension(unsigned index_2d,
    return dd;
 }
 
+static struct tgsi_declaration_interp
+tgsi_default_declaration_interp( void )
+{
+   struct tgsi_declaration_interp di; /* NOTE(review): other fields unset */
+   /* Default: constant interpolation, no centroid, no cylindrical wrap. */
+   di.Interpolate = TGSI_INTERPOLATE_CONSTANT;
+   di.Centroid = 0;
+   di.CylindricalWrap = 0;
+
+   return di;
+}
+
+static struct tgsi_declaration_interp
+tgsi_build_declaration_interp(unsigned interpolate,
+                              unsigned centroid,
+                              unsigned cylindrical_wrap,
+                              struct tgsi_declaration *declaration,
+                              struct tgsi_header *header)
+{
+   struct tgsi_declaration_interp di;
+   /* Copy the three interpolation settings into the token verbatim. */
+   di.Interpolate = interpolate;
+   di.Centroid = centroid;
+   di.CylindricalWrap = cylindrical_wrap;
+
+   declaration_grow(declaration, header); /* NOTE(review): presumably counts this token */
+
+   return di;
+}
+
 static struct tgsi_declaration_semantic
 tgsi_default_declaration_semantic( void )
 {
@@ -227,42 +254,72 @@ tgsi_build_declaration_semantic(
    return ds;
 }
 
-
 static struct tgsi_declaration_resource
 tgsi_default_declaration_resource(void)
 {
-   struct tgsi_declaration_resource declaration_resource;
+   struct tgsi_declaration_resource dr;
 
-   declaration_resource.Resource = TGSI_TEXTURE_UNKNOWN;
-   declaration_resource.ReturnTypeX = PIPE_TYPE_UNORM;
-   declaration_resource.ReturnTypeY = PIPE_TYPE_UNORM;
-   declaration_resource.ReturnTypeZ = PIPE_TYPE_UNORM;
-   declaration_resource.ReturnTypeW = PIPE_TYPE_UNORM;
+   dr.Resource = TGSI_TEXTURE_BUFFER;
+   dr.Raw = 0;
+   dr.Writable = 0;
 
-   return declaration_resource;
+   return dr;
 }
 
 static struct tgsi_declaration_resource
 tgsi_build_declaration_resource(unsigned texture,
-                                unsigned return_type_x,
-                                unsigned return_type_y,
-                                unsigned return_type_z,
-                                unsigned return_type_w,
+                                unsigned raw,
+                                unsigned writable,
                                 struct tgsi_declaration *declaration,
                                 struct tgsi_header *header)
 {
-   struct tgsi_declaration_resource declaration_resource;
+   struct tgsi_declaration_resource dr;
+
+   dr = tgsi_default_declaration_resource();
+   dr.Resource = texture;
+   dr.Raw = raw;
+   dr.Writable = writable;
+
+   declaration_grow(declaration, header);
+
+   return dr;
+}
+
+static struct tgsi_declaration_sampler_view
+tgsi_default_declaration_sampler_view(void)
+{
+   struct tgsi_declaration_sampler_view dsv;
 
-   declaration_resource = tgsi_default_declaration_resource();
-   declaration_resource.Resource = texture;
-   declaration_resource.ReturnTypeX = return_type_x;
-   declaration_resource.ReturnTypeY = return_type_y;
-   declaration_resource.ReturnTypeZ = return_type_z;
-   declaration_resource.ReturnTypeW = return_type_w;
+   dsv.Resource = TGSI_TEXTURE_BUFFER;
+   dsv.ReturnTypeX = PIPE_TYPE_UNORM;
+   dsv.ReturnTypeY = PIPE_TYPE_UNORM;
+   dsv.ReturnTypeZ = PIPE_TYPE_UNORM;
+   dsv.ReturnTypeW = PIPE_TYPE_UNORM;
+
+   return dsv;
+}
+
+static struct tgsi_declaration_sampler_view
+tgsi_build_declaration_sampler_view(unsigned texture,
+                                    unsigned return_type_x,
+                                    unsigned return_type_y,
+                                    unsigned return_type_z,
+                                    unsigned return_type_w,
+                                    struct tgsi_declaration *declaration,
+                                    struct tgsi_header *header)
+{
+   struct tgsi_declaration_sampler_view dsv;
+
+   dsv = tgsi_default_declaration_sampler_view();
+   dsv.Resource = texture;
+   dsv.ReturnTypeX = return_type_x;
+   dsv.ReturnTypeY = return_type_y;
+   dsv.ReturnTypeZ = return_type_z;
+   dsv.ReturnTypeW = return_type_w;
 
    declaration_grow(declaration, header);
 
-   return declaration_resource;
+   return dsv;
 }
 
 
@@ -274,8 +331,10 @@ tgsi_default_full_declaration( void )
    full_declaration.Declaration  = tgsi_default_declaration();
    full_declaration.Range = tgsi_default_declaration_range();
    full_declaration.Semantic = tgsi_default_declaration_semantic();
+   full_declaration.Interp = tgsi_default_declaration_interp();
    full_declaration.ImmediateData.u = NULL;
    full_declaration.Resource = tgsi_default_declaration_resource();
+   full_declaration.SamplerView = tgsi_default_declaration_sampler_view();
 
    return full_declaration;
 }
@@ -302,9 +361,8 @@ tgsi_build_full_declaration(
       full_decl->Declaration.Interpolate,
       full_decl->Declaration.Dimension,
       full_decl->Declaration.Semantic,
-      full_decl->Declaration.Centroid,
       full_decl->Declaration.Invariant,
-      full_decl->Declaration.CylindricalWrap,
+      full_decl->Declaration.Local,
       header );
 
    if (maxsize <= size)
@@ -332,6 +390,22 @@ tgsi_build_full_declaration(
                                              header);
    }
 
+   if (full_decl->Declaration.Interpolate) {
+      struct tgsi_declaration_interp *di;
+
+      if (maxsize <= size) {
+         return 0;
+      }
+      di = (struct tgsi_declaration_interp *)&tokens[size];
+      size++;
+
+      *di = tgsi_build_declaration_interp(full_decl->Interp.Interpolate,
+                                          full_decl->Interp.Centroid,
+                                          full_decl->Interp.CylindricalWrap,
+                                          declaration,
+                                          header);
+   }
+
    if( full_decl->Declaration.Semantic ) {
       struct tgsi_declaration_semantic *ds;
 
@@ -375,14 +449,31 @@ tgsi_build_full_declaration(
       size++;
 
       *dr = tgsi_build_declaration_resource(full_decl->Resource.Resource,
-                                            full_decl->Resource.ReturnTypeX,
-                                            full_decl->Resource.ReturnTypeY,
-                                            full_decl->Resource.ReturnTypeZ,
-                                            full_decl->Resource.ReturnTypeW,
+                                            full_decl->Resource.Raw,
+                                            full_decl->Resource.Writable,
                                             declaration,
                                             header);
    }
 
+   if (full_decl->Declaration.File == TGSI_FILE_SAMPLER_VIEW) {
+      struct tgsi_declaration_sampler_view *dsv;
+
+      if (maxsize <= size) {
+         return  0;
+      }
+      dsv = (struct tgsi_declaration_sampler_view *)&tokens[size];
+      size++;
+
+      *dsv = tgsi_build_declaration_sampler_view(
+         full_decl->SamplerView.Resource,
+         full_decl->SamplerView.ReturnTypeX,
+         full_decl->SamplerView.ReturnTypeY,
+         full_decl->SamplerView.ReturnTypeZ,
+         full_decl->SamplerView.ReturnTypeW,
+         declaration,
+         header);
+   }
+
    return size;
 }
 
@@ -405,11 +496,13 @@ tgsi_default_immediate( void )
 
 static struct tgsi_immediate
 tgsi_build_immediate(
-   struct tgsi_header *header )
+   struct tgsi_header *header,
+   unsigned type )
 {
    struct tgsi_immediate immediate;
 
    immediate = tgsi_default_immediate();
+   immediate.DataType = type;
 
    header_bodysize_grow( header );
 
@@ -442,21 +535,6 @@ immediate_grow(
    header_bodysize_grow( header );
 }
 
-static union tgsi_immediate_data
-tgsi_build_immediate_float32(
-   float value,
-   struct tgsi_immediate *immediate,
-   struct tgsi_header *header )
-{
-   union tgsi_immediate_data immediate_data;
-
-   immediate_data.Float = value;
-
-   immediate_grow( immediate, header );
-
-   return immediate_data;
-}
-
 unsigned
 tgsi_build_full_immediate(
    const struct tgsi_full_immediate *full_imm,
@@ -472,7 +550,7 @@ tgsi_build_full_immediate(
    immediate = (struct tgsi_immediate *) &tokens[size];
    size++;
 
-   *immediate = tgsi_build_immediate( header );
+   *immediate = tgsi_build_immediate( header, full_imm->Immediate.DataType );
 
    assert( full_imm->Immediate.NrTokens <= 4 + 1 );
 
@@ -481,13 +559,12 @@ tgsi_build_full_immediate(
 
       if( maxsize <= size )
          return  0;
+
       data = (union tgsi_immediate_data *) &tokens[size];
-      size++;
+      *data = full_imm->u[i];
 
-      *data = tgsi_build_immediate_float32(
-         full_imm->u[i].Float,
-         immediate,
-         header );
+      immediate_grow( immediate, header );
+      size++;
    }
 
    return size;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.c b/src/gallium/auxiliary/tgsi/tgsi_dump.c
index 635212b7e86..383c54590af 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_dump.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_dump.c
@@ -271,6 +271,9 @@ iter_declaration(
       ctx,
       decl->Declaration.UsageMask );
 
+   if (decl->Declaration.Local)
+      TXT( ", LOCAL" );
+
    if (decl->Declaration.Semantic) {
       TXT( ", " );
       ENM( decl->Semantic.Name, tgsi_semantic_names );
@@ -285,53 +288,64 @@ iter_declaration(
    if (decl->Declaration.File == TGSI_FILE_RESOURCE) {
       TXT(", ");
       ENM(decl->Resource.Resource, tgsi_texture_names);
+      if (decl->Resource.Writable)
+         TXT(", WR");
+      if (decl->Resource.Raw)
+         TXT(", RAW");
+   }
+
+   if (decl->Declaration.File == TGSI_FILE_SAMPLER_VIEW) {
       TXT(", ");
-      if ((decl->Resource.ReturnTypeX == decl->Resource.ReturnTypeY) &&
-          (decl->Resource.ReturnTypeX == decl->Resource.ReturnTypeZ) &&
-          (decl->Resource.ReturnTypeX == decl->Resource.ReturnTypeW)) {
-         ENM(decl->Resource.ReturnTypeX, tgsi_type_names);
+      ENM(decl->SamplerView.Resource, tgsi_texture_names);
+      TXT(", ");
+      if ((decl->SamplerView.ReturnTypeX == decl->SamplerView.ReturnTypeY) &&
+          (decl->SamplerView.ReturnTypeX == decl->SamplerView.ReturnTypeZ) &&
+          (decl->SamplerView.ReturnTypeX == decl->SamplerView.ReturnTypeW)) {
+         ENM(decl->SamplerView.ReturnTypeX, tgsi_type_names);
       } else {
-         ENM(decl->Resource.ReturnTypeX, tgsi_type_names);
+         ENM(decl->SamplerView.ReturnTypeX, tgsi_type_names);
          TXT(", ");
-         ENM(decl->Resource.ReturnTypeY, tgsi_type_names);
+         ENM(decl->SamplerView.ReturnTypeY, tgsi_type_names);
          TXT(", ");
-         ENM(decl->Resource.ReturnTypeZ, tgsi_type_names);
+         ENM(decl->SamplerView.ReturnTypeZ, tgsi_type_names);
          TXT(", ");
-         ENM(decl->Resource.ReturnTypeW, tgsi_type_names);
+         ENM(decl->SamplerView.ReturnTypeW, tgsi_type_names);
       }
-
    }
 
-   if (iter->processor.Processor == TGSI_PROCESSOR_FRAGMENT &&
-       decl->Declaration.File == TGSI_FILE_INPUT)
-   {
-      TXT( ", " );
-      ENM( decl->Declaration.Interpolate, tgsi_interpolate_names );
-   }
+   if (decl->Declaration.Interpolate) {
+      if (iter->processor.Processor == TGSI_PROCESSOR_FRAGMENT &&
+          decl->Declaration.File == TGSI_FILE_INPUT)
+      {
+         TXT( ", " );
+         ENM( decl->Interp.Interpolate, tgsi_interpolate_names );
+      }
+
+      if (decl->Interp.Centroid) {
+         TXT( ", CENTROID" );
+      }
 
-   if (decl->Declaration.Centroid) {
-      TXT( ", CENTROID" );
+      if (decl->Interp.CylindricalWrap) {
+         TXT(", CYLWRAP_");
+         if (decl->Interp.CylindricalWrap & TGSI_CYLINDRICAL_WRAP_X) {
+            CHR('X');
+         }
+         if (decl->Interp.CylindricalWrap & TGSI_CYLINDRICAL_WRAP_Y) {
+            CHR('Y');
+         }
+         if (decl->Interp.CylindricalWrap & TGSI_CYLINDRICAL_WRAP_Z) {
+            CHR('Z');
+         }
+         if (decl->Interp.CylindricalWrap & TGSI_CYLINDRICAL_WRAP_W) {
+            CHR('W');
+         }
+      }
    }
 
    if (decl->Declaration.Invariant) {
       TXT( ", INVARIANT" );
    }
 
-   if (decl->Declaration.CylindricalWrap) {
-      TXT(", CYLWRAP_");
-      if (decl->Declaration.CylindricalWrap & TGSI_CYLINDRICAL_WRAP_X) {
-         CHR('X');
-      }
-      if (decl->Declaration.CylindricalWrap & TGSI_CYLINDRICAL_WRAP_Y) {
-         CHR('Y');
-      }
-      if (decl->Declaration.CylindricalWrap & TGSI_CYLINDRICAL_WRAP_Z) {
-         CHR('Z');
-      }
-      if (decl->Declaration.CylindricalWrap & TGSI_CYLINDRICAL_WRAP_W) {
-         CHR('W');
-      }
-   }
 
    if (decl->Declaration.File == TGSI_FILE_IMMEDIATE_ARRAY) {
       unsigned i;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index c4ad34b1e61..5e23f5da65b 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -2121,7 +2121,7 @@ exec_sample(struct tgsi_exec_machine *mach,
       control = tgsi_sampler_lod_bias;
    }
 
-   switch (mach->Resources[resource_unit].Resource) {
+   switch (mach->SamplerViews[resource_unit].Resource) {
    case TGSI_TEXTURE_1D:
    case TGSI_TEXTURE_SHADOW1D:
       FETCH(&r[0], 0, TGSI_CHAN_X);
@@ -2215,7 +2215,7 @@ exec_sample_d(struct tgsi_exec_machine *mach,
     * XXX: This is fake SAMPLE_D -- the derivatives are not taken into account, yet.
     */
 
-   switch (mach->Resources[resource_unit].Resource) {
+   switch (mach->SamplerViews[resource_unit].Resource) {
    case TGSI_TEXTURE_1D:
    case TGSI_TEXTURE_SHADOW1D:
 
@@ -2338,8 +2338,8 @@ static void
 exec_declaration(struct tgsi_exec_machine *mach,
                  const struct tgsi_full_declaration *decl)
 {
-   if (decl->Declaration.File == TGSI_FILE_RESOURCE) {
-      mach->Resources[decl->Range.First] = decl->Resource;
+   if (decl->Declaration.File == TGSI_FILE_SAMPLER_VIEW) {
+      mach->SamplerViews[decl->Range.First] = decl->SamplerView;
       return;
    }
 
@@ -2371,7 +2371,7 @@ exec_declaration(struct tgsi_exec_machine *mach,
             eval_coef_func eval;
             uint i, j;
 
-            switch (decl->Declaration.Interpolate) {
+            switch (decl->Interp.Interpolate) {
             case TGSI_INTERPOLATE_CONSTANT:
                eval = eval_constant_coef;
                break;
@@ -4154,11 +4154,11 @@ exec_instruction(
       exec_endswitch(mach);
       break;
 
-   case TGSI_OPCODE_LOAD:
+   case TGSI_OPCODE_SAMPLE_I:
       assert(0);
       break;
 
-   case TGSI_OPCODE_LOAD_MS:
+   case TGSI_OPCODE_SAMPLE_I_MS:
       assert(0);
       break;
 
@@ -4190,7 +4190,7 @@ exec_instruction(
       assert(0);
       break;
 
-   case TGSI_OPCODE_RESINFO:
+   case TGSI_OPCODE_SVIEWINFO:
       assert(0);
       break;
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.h b/src/gallium/auxiliary/tgsi/tgsi_exec.h
index d9e93ce138d..0ecb4e952bb 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.h
@@ -331,7 +331,8 @@ struct tgsi_exec_machine
    struct tgsi_full_declaration *Declarations;
    uint NumDeclarations;
 
-   struct tgsi_declaration_resource Resources[PIPE_MAX_SHADER_RESOURCES];
+   struct tgsi_declaration_sampler_view
+      SamplerViews[PIPE_MAX_SHADER_SAMPLER_VIEWS];
 
    boolean UsedGeometryShader;
 };
diff --git a/src/gallium/auxiliary/tgsi/tgsi_info.c b/src/gallium/auxiliary/tgsi/tgsi_info.c
index 81df96b3c7a..8bf9aeb4284 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_info.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_info.c
@@ -183,22 +183,39 @@ static const struct tgsi_opcode_info opcode_info[TGSI_OPCODE_LAST] =
    { 0, 0, 0, 0, 0, 0, NONE, "DEFAULT", TGSI_OPCODE_DEFAULT },
    { 0, 0, 0, 0, 0, 0, NONE, "ENDSWITCH", TGSI_OPCODE_ENDSWITCH },
 
-   { 1, 2, 0, 0, 0, 0, OTHR, "LOAD",        TGSI_OPCODE_LOAD },
-   { 1, 2, 0, 0, 0, 0, OTHR, "LOAD_MS",     TGSI_OPCODE_LOAD_MS },
    { 1, 3, 0, 0, 0, 0, OTHR, "SAMPLE",      TGSI_OPCODE_SAMPLE },
+   { 1, 2, 0, 0, 0, 0, OTHR, "SAMPLE_I",    TGSI_OPCODE_SAMPLE_I },
+   { 1, 2, 0, 0, 0, 0, OTHR, "SAMPLE_I_MS", TGSI_OPCODE_SAMPLE_I_MS },
    { 1, 4, 0, 0, 0, 0, OTHR, "SAMPLE_B",    TGSI_OPCODE_SAMPLE_B },
    { 1, 4, 0, 0, 0, 0, OTHR, "SAMPLE_C",    TGSI_OPCODE_SAMPLE_C },
    { 1, 4, 0, 0, 0, 0, OTHR, "SAMPLE_C_LZ", TGSI_OPCODE_SAMPLE_C_LZ },
    { 1, 5, 0, 0, 0, 0, OTHR, "SAMPLE_D",    TGSI_OPCODE_SAMPLE_D },
    { 1, 3, 0, 0, 0, 0, OTHR, "SAMPLE_L",    TGSI_OPCODE_SAMPLE_L },
    { 1, 3, 0, 0, 0, 0, OTHR, "GATHER4",     TGSI_OPCODE_GATHER4 },
-   { 1, 2, 0, 0, 0, 0, OTHR, "RESINFO",     TGSI_OPCODE_RESINFO },
+   { 1, 2, 0, 0, 0, 0, OTHR, "SVIEWINFO",   TGSI_OPCODE_SVIEWINFO },
    { 1, 2, 0, 0, 0, 0, OTHR, "SAMPLE_POS",  TGSI_OPCODE_SAMPLE_POS },
    { 1, 2, 0, 0, 0, 0, OTHR, "SAMPLE_INFO", TGSI_OPCODE_SAMPLE_INFO },
    { 1, 1, 0, 0, 0, 0, COMP, "UARL", TGSI_OPCODE_UARL },
    { 1, 3, 0, 0, 0, 0, COMP, "UCMP", TGSI_OPCODE_UCMP },
    { 1, 1, 0, 0, 0, 0, COMP, "IABS", TGSI_OPCODE_IABS },
    { 1, 1, 0, 0, 0, 0, COMP, "ISSG", TGSI_OPCODE_ISSG },
+   { 1, 2, 0, 0, 0, 0, OTHR, "LOAD", TGSI_OPCODE_LOAD },
+   { 1, 2, 0, 0, 0, 0, OTHR, "STORE", TGSI_OPCODE_STORE },
+   { 1, 0, 0, 0, 0, 0, OTHR, "MFENCE", TGSI_OPCODE_MFENCE },
+   { 1, 0, 0, 0, 0, 0, OTHR, "LFENCE", TGSI_OPCODE_LFENCE },
+   { 1, 0, 0, 0, 0, 0, OTHR, "SFENCE", TGSI_OPCODE_SFENCE },
+   { 0, 0, 0, 0, 0, 0, OTHR, "BARRIER", TGSI_OPCODE_BARRIER },
+
+   { 1, 3, 0, 0, 0, 0, OTHR, "ATOMUADD", TGSI_OPCODE_ATOMUADD },
+   { 1, 3, 0, 0, 0, 0, OTHR, "ATOMXCHG", TGSI_OPCODE_ATOMXCHG },
+   { 1, 4, 0, 0, 0, 0, OTHR, "ATOMCAS", TGSI_OPCODE_ATOMCAS },
+   { 1, 3, 0, 0, 0, 0, OTHR, "ATOMAND", TGSI_OPCODE_ATOMAND },
+   { 1, 3, 0, 0, 0, 0, OTHR, "ATOMOR", TGSI_OPCODE_ATOMOR },
+   { 1, 3, 0, 0, 0, 0, OTHR, "ATOMXOR", TGSI_OPCODE_ATOMXOR },
+   { 1, 3, 0, 0, 0, 0, OTHR, "ATOMUMIN", TGSI_OPCODE_ATOMUMIN },
+   { 1, 3, 0, 0, 0, 0, OTHR, "ATOMUMAX", TGSI_OPCODE_ATOMUMAX },
+   { 1, 3, 0, 0, 0, 0, OTHR, "ATOMIMIN", TGSI_OPCODE_ATOMIMIN },
+   { 1, 3, 0, 0, 0, 0, OTHR, "ATOMIMAX", TGSI_OPCODE_ATOMIMAX }
 };
 
 const struct tgsi_opcode_info *
diff --git a/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h b/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h
index b5d4504425b..96b864f50d0 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h
@@ -168,16 +168,16 @@ OP01(CASE)
 OP00(DEFAULT)
 OP00(ENDSWITCH)
 
-OP12(LOAD)
-OP12(LOAD_MS)
 OP13(SAMPLE)
+OP12(SAMPLE_I)
+OP12(SAMPLE_I_MS)
 OP14(SAMPLE_B)
 OP14(SAMPLE_C)
 OP14(SAMPLE_C_LZ)
 OP15(SAMPLE_D)
 OP13(SAMPLE_L)
 OP13(GATHER4)
-OP12(RESINFO)
+OP12(SVIEWINFO)
 OP13(SAMPLE_POS)
 OP12(SAMPLE_INFO)
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.c b/src/gallium/auxiliary/tgsi/tgsi_parse.c
index e1902eb1862..45c5c41ec82 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_parse.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_parse.c
@@ -113,6 +113,10 @@ tgsi_parse_token(
          next_token(ctx, &decl->Dim);
       }
 
+      if( decl->Declaration.Interpolate ) {
+         next_token( ctx, &decl->Interp );
+      }
+
       if( decl->Declaration.Semantic ) {
          next_token( ctx, &decl->Semantic );
       }
@@ -132,6 +136,10 @@ tgsi_parse_token(
          next_token(ctx, &decl->Resource);
       }
 
+      if (decl->Declaration.File == TGSI_FILE_SAMPLER_VIEW) {
+         next_token(ctx, &decl->SamplerView);
+      }
+
       break;
    }
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.h b/src/gallium/auxiliary/tgsi/tgsi_parse.h
index f7b7e6edc98..3f8bf99e3c1 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_parse.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_parse.h
@@ -67,9 +67,11 @@ struct tgsi_full_declaration
    struct tgsi_declaration Declaration;
    struct tgsi_declaration_range Range;
    struct tgsi_declaration_dimension Dim;
+   struct tgsi_declaration_interp Interp;
    struct tgsi_declaration_semantic Semantic;
    struct tgsi_immediate_array_data ImmediateData;
    struct tgsi_declaration_resource Resource;
+   struct tgsi_declaration_sampler_view SamplerView;
 };
 
 struct tgsi_full_immediate
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
index 1e00e2e54ed..ce728ecee06 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -1170,7 +1170,7 @@ emit_declaration(
       for( i = first; i <= last; i++ ) {
          for( j = 0; j < NUM_CHANNELS; j++ ) {
             if( mask & (1 << j) ) {
-               switch( decl->Declaration.Interpolate ) {
+               switch( decl->Interp.Interpolate ) {
                case TGSI_INTERPOLATE_CONSTANT:
                   emit_coef_a0( func, 0, i, j );
                   emit_inputs( func, 0, i, j );
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c
index e4e9c032e02..df299baa9c1 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c
@@ -68,7 +68,8 @@ tgsi_scan_shader(const struct tgsi_token *tokens,
    procType = parse.FullHeader.Processor.Processor;
    assert(procType == TGSI_PROCESSOR_FRAGMENT ||
           procType == TGSI_PROCESSOR_VERTEX ||
-          procType == TGSI_PROCESSOR_GEOMETRY);
+          procType == TGSI_PROCESSOR_GEOMETRY ||
+          procType == TGSI_PROCESSOR_COMPUTE);
 
 
    /**
@@ -157,9 +158,9 @@ tgsi_scan_shader(const struct tgsi_token *tokens,
                if (file == TGSI_FILE_INPUT) {
                   info->input_semantic_name[reg] = (ubyte)fulldecl->Semantic.Name;
                   info->input_semantic_index[reg] = (ubyte)fulldecl->Semantic.Index;
-                  info->input_interpolate[reg] = (ubyte)fulldecl->Declaration.Interpolate;
-                  info->input_centroid[reg] = (ubyte)fulldecl->Declaration.Centroid;
-                  info->input_cylindrical_wrap[reg] = (ubyte)fulldecl->Declaration.CylindricalWrap;
+                  info->input_interpolate[reg] = (ubyte)fulldecl->Interp.Interpolate;
+                  info->input_centroid[reg] = (ubyte)fulldecl->Interp.Centroid;
+                  info->input_cylindrical_wrap[reg] = (ubyte)fulldecl->Interp.CylindricalWrap;
                   info->num_inputs++;
 
                   if (procType == TGSI_PROCESSOR_FRAGMENT &&
diff --git a/src/gallium/auxiliary/tgsi/tgsi_strings.c b/src/gallium/auxiliary/tgsi/tgsi_strings.c
index 520452ce1b8..b5fd1fc0a65 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_strings.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_strings.c
@@ -32,11 +32,12 @@
 #include "tgsi_strings.h"
 
 
-const char *tgsi_processor_type_names[3] =
+const char *tgsi_processor_type_names[4] =
 {
    "FRAG",
    "VERT",
-   "GEOM"
+   "GEOM",
+   "COMP"
 };
 
 const char *tgsi_file_names[TGSI_FILE_COUNT] =
@@ -53,7 +54,8 @@ const char *tgsi_file_names[TGSI_FILE_COUNT] =
    "SV",
    "IMMX",
    "TEMPX",
-   "RES"
+   "RES",
+   "SVIEW"
 };
 
 const char *tgsi_semantic_names[TGSI_SEMANTIC_COUNT] =
@@ -72,12 +74,16 @@ const char *tgsi_semantic_names[TGSI_SEMANTIC_COUNT] =
    "VERTEXID",
    "STENCIL",
    "CLIPDIST",
-   "CLIPVERTEX"
+   "CLIPVERTEX",
+   "GRID_SIZE",
+   "BLOCK_ID",
+   "BLOCK_SIZE",
+   "THREAD_ID"
 };
 
 const char *tgsi_texture_names[TGSI_TEXTURE_COUNT] =
 {
-   "UNKNOWN",
+   "BUFFER",
    "1D",
    "2D",
    "3D",
@@ -90,7 +96,8 @@ const char *tgsi_texture_names[TGSI_TEXTURE_COUNT] =
    "2DARRAY",
    "SHADOW1DARRAY",
    "SHADOW2DARRAY",
-   "SHADOWCUBE"
+   "SHADOWCUBE",
+   "UNKNOWN"
 };
 
 const char *tgsi_property_names[TGSI_PROPERTY_COUNT] =
diff --git a/src/gallium/auxiliary/tgsi/tgsi_strings.h b/src/gallium/auxiliary/tgsi/tgsi_strings.h
index 0946a58f135..5c57e229c28 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_strings.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_strings.h
@@ -38,7 +38,7 @@ extern "C" {
 #endif
 
 
-extern const char *tgsi_processor_type_names[3];
+extern const char *tgsi_processor_type_names[4];
 
 extern const char *tgsi_file_names[TGSI_FILE_COUNT];
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_text.c b/src/gallium/auxiliary/tgsi/tgsi_text.c
index 4b3d22c3072..279a046e202 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_text.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_text.c
@@ -132,6 +132,23 @@ static boolean parse_uint( const char **pcur, uint *val )
    return FALSE;
 }
 
+/* Parse [+-]digits at *pcur into *val; *pcur advances only on success. */
+static boolean parse_int( const char **pcur, int *val )
+{
+   const char *cur = *pcur;
+   boolean negate = (*cur == '-');
+   uint magnitude;
+
+   if (*cur == '+' || *cur == '-')
+      cur++;
+   if (!parse_uint( &cur, &magnitude ))
+      return FALSE;
+   /* Negate as unsigned so that "-2147483648" cannot overflow an int. */
+   *val = (int) (negate ? 0u - magnitude : magnitude);
+   *pcur = cur;
+   return TRUE;
+}
+
 static boolean parse_identifier( const char **pcur, char *ret )
 {
    const char *cur = *pcur;
@@ -238,6 +255,8 @@ static boolean parse_header( struct translate_ctx *ctx )
       processor = TGSI_PROCESSOR_VERTEX;
    else if (str_match_no_case( &ctx->cur, "GEOM" ))
       processor = TGSI_PROCESSOR_GEOMETRY;
+   else if (str_match_no_case( &ctx->cur, "COMP" ))
+      processor = TGSI_PROCESSOR_COMPUTE;
    else {
       report_error( ctx, "Unknown header" );
       return FALSE;
@@ -447,24 +466,10 @@ parse_register_bracket(
          eat_opt_white(&ctx->cur);
       }
 
-      if (*ctx->cur == '+' || *ctx->cur == '-') {
-         boolean negate;
-
-         negate = *ctx->cur == '-';
-         ctx->cur++;
-         eat_opt_white( &ctx->cur );
-         if (!parse_uint( &ctx->cur, &uindex )) {
-            report_error( ctx, "Expected literal unsigned integer" );
-            return FALSE;
-         }
-         if (negate)
-            brackets->index = -(int) uindex;
-         else
-            brackets->index = (int) uindex;
-      }
-      else {
+      if (*ctx->cur == '+' || *ctx->cur == '-')
+         parse_int( &ctx->cur, &brackets->index );
+      else
          brackets->index = 0;
-      }
    }
    else {
       if (!parse_uint( &ctx->cur, &uindex )) {
@@ -819,6 +824,7 @@ parse_instruction(
    uint saturate = TGSI_SAT_NONE;
    const struct tgsi_opcode_info *info;
    struct tgsi_full_instruction inst;
+   const char *cur;
    uint advance;
 
    inst = tgsi_default_full_instruction();
@@ -864,7 +870,7 @@ parse_instruction(
     */
    eat_opt_white( &ctx->cur );
    for (i = 0; i < TGSI_OPCODE_LAST; i++) {
-      const char *cur = ctx->cur;
+      cur = ctx->cur;
 
       info = tgsi_get_opcode_info( i );
       if (match_inst_mnemonic(&cur, info)) {
@@ -938,22 +944,20 @@ parse_instruction(
       }
    }
 
-   if (info->is_branch) {
+   cur = ctx->cur;
+   eat_opt_white( &cur );
+   if (info->is_branch && *cur == ':') {
       uint target;
 
-      eat_opt_white( &ctx->cur );
-      if (*ctx->cur != ':') {
-         report_error( ctx, "Expected `:'" );
-         return FALSE;
-      }
-      ctx->cur++;
-      eat_opt_white( &ctx->cur );
-      if (!parse_uint( &ctx->cur, &target )) {
+      cur++;
+      eat_opt_white( &cur );
+      if (!parse_uint( &cur, &target )) {
          report_error( ctx, "Expected a label" );
          return FALSE;
       }
       inst.Instruction.Label = 1;
       inst.Label.Label = target;
+      ctx->cur = cur;
    }
 
    advance = tgsi_build_full_instruction(
@@ -970,10 +974,11 @@ parse_instruction(
 
 /* parses a 4-touple of the form {x, y, z, w}
  * where x, y, z, w are numbers */
-static boolean parse_immediate_data(struct translate_ctx *ctx,
-                                    float *values)
+static boolean parse_immediate_data(struct translate_ctx *ctx, unsigned type,
+                                    union tgsi_immediate_data *values)
 {
    unsigned i;
+   int ret = FALSE;  /* no case may match 'type'; treat that as failure */
 
    eat_opt_white( &ctx->cur );
    if (*ctx->cur != '{') {
@@ -991,8 +996,21 @@ static boolean parse_immediate_data(struct translate_ctx *ctx,
          ctx->cur++;
          eat_opt_white( &ctx->cur );
       }
-      if (!parse_float( &ctx->cur, &values[i] )) {
-         report_error( ctx, "Expected literal floating point" );
+
+      switch (type) {
+      case TGSI_IMM_FLOAT32:
+         ret = parse_float(&ctx->cur, &values[i].Float);
+         break;
+      case TGSI_IMM_UINT32:
+         ret = parse_uint(&ctx->cur, &values[i].Uint);
+         break;
+      case TGSI_IMM_INT32:
+         ret = parse_int(&ctx->cur, &values[i].Int);
+         break;
+      }
+
+      if (!ret) {
+         report_error( ctx, "Expected immediate constant" );
          return FALSE;
       }
    }
@@ -1013,7 +1031,7 @@ static boolean parse_declaration( struct translate_ctx *ctx )
    struct parsed_dcl_bracket brackets[2];
    int num_brackets;
    uint writemask;
-   const char *cur;
+   const char *cur, *cur2;
    uint advance;
    boolean is_vs_input;
    boolean is_imm_array;
@@ -1066,6 +1084,42 @@ static boolean parse_declaration( struct translate_ctx *ctx )
             report_error(ctx, "Expected texture target");
             return FALSE;
          }
+
+         cur2 = cur;
+         eat_opt_white(&cur2);
+         while (*cur2 == ',') {
+            cur2++;
+            eat_opt_white(&cur2);
+            if (str_match_no_case(&cur2, "RAW") &&
+                !is_digit_alpha_underscore(cur2)) {
+               decl.Resource.Raw = 1;
+
+            } else if (str_match_no_case(&cur2, "WR") &&
+                !is_digit_alpha_underscore(cur2)) {
+               decl.Resource.Writable = 1;
+
+            } else {
+               break;
+            }
+            cur = cur2;
+            eat_opt_white(&cur2);
+         }
+
+         ctx->cur = cur;
+
+      } else if (file == TGSI_FILE_SAMPLER_VIEW) {
+         for (i = 0; i < TGSI_TEXTURE_COUNT; i++) {
+            if (str_match_no_case(&cur, tgsi_texture_names[i])) {
+               if (!is_digit_alpha_underscore(cur)) {
+                  decl.SamplerView.Resource = i;
+                  break;
+               }
+            }
+         }
+         if (i == TGSI_TEXTURE_COUNT) {
+            report_error(ctx, "Expected texture target");
+            return FALSE;
+         }
          eat_opt_white( &cur );
          if (*cur != ',') {
             report_error( ctx, "Expected `,'" );
@@ -1079,16 +1133,16 @@ static boolean parse_declaration( struct translate_ctx *ctx )
                   if (!is_digit_alpha_underscore(cur)) {
                      switch (j) {
                      case 0:
-                        decl.Resource.ReturnTypeX = i;
+                        decl.SamplerView.ReturnTypeX = i;
                         break;
                      case 1:
-                        decl.Resource.ReturnTypeY = i;
+                        decl.SamplerView.ReturnTypeY = i;
                         break;
                      case 2:
-                        decl.Resource.ReturnTypeZ = i;
+                        decl.SamplerView.ReturnTypeZ = i;
                         break;
                      case 3:
-                        decl.Resource.ReturnTypeW = i;
+                        decl.SamplerView.ReturnTypeW = i;
                         break;
                      default:
                         assert(0);
@@ -1104,7 +1158,7 @@ static boolean parse_declaration( struct translate_ctx *ctx )
                }
                break;
             } else {
-               const char *cur2 = cur;
+               cur2 = cur;
                eat_opt_white( &cur2 );
                if (*cur2 == ',') {
                   cur2++;
@@ -1116,51 +1170,64 @@ static boolean parse_declaration( struct translate_ctx *ctx )
             }
          }
          if (j < 4) {
-            decl.Resource.ReturnTypeY =
-               decl.Resource.ReturnTypeZ =
-               decl.Resource.ReturnTypeW =
-               decl.Resource.ReturnTypeX;
+            decl.SamplerView.ReturnTypeY =
+               decl.SamplerView.ReturnTypeZ =
+               decl.SamplerView.ReturnTypeW =
+               decl.SamplerView.ReturnTypeX;
          }
          ctx->cur = cur;
       } else {
-         for (i = 0; i < TGSI_SEMANTIC_COUNT; i++) {
-            if (str_match_no_case( &cur, tgsi_semantic_names[i] )) {
-               const char *cur2 = cur;
-               uint index;
+         if (str_match_no_case(&cur, "LOCAL") &&
+             !is_digit_alpha_underscore(cur)) {
+            decl.Declaration.Local = 1;
+            ctx->cur = cur;
+         }
 
-               if (is_digit_alpha_underscore( cur ))
-                  continue;
-               eat_opt_white( &cur2 );
-               if (*cur2 == '[') {
-                  cur2++;
-                  eat_opt_white( &cur2 );
-                  if (!parse_uint( &cur2, &index )) {
-                     report_error( ctx, "Expected literal integer" );
-                     return FALSE;
-                  }
+         cur = ctx->cur;
+         eat_opt_white( &cur );
+         if (*cur == ',') {
+            cur++;
+            eat_opt_white( &cur );
+
+            for (i = 0; i < TGSI_SEMANTIC_COUNT; i++) {
+               if (str_match_no_case( &cur, tgsi_semantic_names[i] )) {
+                  uint index;
+
+                  if (is_digit_alpha_underscore( cur ))
+                     continue;
+                  cur2 = cur;
                   eat_opt_white( &cur2 );
-                  if (*cur2 != ']') {
-                     report_error( ctx, "Expected `]'" );
-                     return FALSE;
-                  }
-                  cur2++;
+                  if (*cur2 == '[') {
+                     cur2++;
+                     eat_opt_white( &cur2 );
+                     if (!parse_uint( &cur2, &index )) {
+                        report_error( ctx, "Expected literal integer" );
+                        return FALSE;
+                     }
+                     eat_opt_white( &cur2 );
+                     if (*cur2 != ']') {
+                        report_error( ctx, "Expected `]'" );
+                        return FALSE;
+                     }
+                     cur2++;
 
-                  decl.Semantic.Index = index;
+                     decl.Semantic.Index = index;
 
-                  cur = cur2;
-               }
+                     cur = cur2;
+                  }
 
-               decl.Declaration.Semantic = 1;
-               decl.Semantic.Name = i;
+                  decl.Declaration.Semantic = 1;
+                  decl.Semantic.Name = i;
 
-               ctx->cur = cur;
-               break;
+                  ctx->cur = cur;
+                  break;
+               }
             }
          }
       }
    } else if (is_imm_array) {
       unsigned i;
-      float *vals_itr;
+      union tgsi_immediate_data *vals_itr;
       /* we have our immediate data */
       if (*cur != '{') {
          report_error( ctx, "Immediate array without data" );
@@ -1172,9 +1239,9 @@ static boolean parse_declaration( struct translate_ctx *ctx )
       decl.ImmediateData.u =
          MALLOC(sizeof(union tgsi_immediate_data) * 4 *
                 (decl.Range.Last + 1));
-      vals_itr = (float*)decl.ImmediateData.u;
+      vals_itr = decl.ImmediateData.u;
       for (i = 0; i <= decl.Range.Last; ++i) {
-         if (!parse_immediate_data(ctx, vals_itr)) {
+         if (!parse_immediate_data(ctx, TGSI_IMM_FLOAT32, vals_itr)) {
             FREE(decl.ImmediateData.u);
             return FALSE;
          }
@@ -1209,7 +1276,8 @@ static boolean parse_declaration( struct translate_ctx *ctx )
          if (str_match_no_case( &cur, tgsi_interpolate_names[i] )) {
             if (is_digit_alpha_underscore( cur ))
                continue;
-            decl.Declaration.Interpolate = i;
+            decl.Declaration.Interpolate = 1;
+            decl.Interp.Interpolate = i;
 
             ctx->cur = cur;
             break;
@@ -1240,28 +1308,27 @@ static boolean parse_declaration( struct translate_ctx *ctx )
 static boolean parse_immediate( struct translate_ctx *ctx )
 {
    struct tgsi_full_immediate imm;
-   float values[4];
    uint advance;
+   int type;
 
    if (!eat_white( &ctx->cur )) {
       report_error( ctx, "Syntax error" );
       return FALSE;
    }
-   if (!str_match_no_case( &ctx->cur, "FLT32" ) ||
-       is_digit_alpha_underscore( ctx->cur )) {
-      report_error( ctx, "Expected `FLT32'" );
+   for (type = 0; type < Elements(tgsi_immediate_type_names); ++type) {
+      if (str_match_no_case(&ctx->cur, tgsi_immediate_type_names[type]) &&
+          !is_digit_alpha_underscore(ctx->cur))
+         break;
+   }
+   if (type == Elements(tgsi_immediate_type_names)) {
+      report_error( ctx, "Expected immediate type" );
       return FALSE;
    }
 
-   parse_immediate_data(ctx, values);
-
    imm = tgsi_default_full_immediate();
    imm.Immediate.NrTokens += 4;
-   imm.Immediate.DataType = TGSI_IMM_FLOAT32;
-   imm.u[0].Float = values[0];
-   imm.u[1].Float = values[1];
-   imm.u[2].Float = values[2];
-   imm.u[3].Float = values[3];
+   imm.Immediate.DataType = type;
+   parse_immediate_data(ctx, type, imm.u);
 
    advance = tgsi_build_full_immediate(
       &imm,
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.c b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
index 0f9aa3ab43a..e427585db19 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
@@ -36,6 +36,7 @@
 #include "util/u_debug.h"
 #include "util/u_memory.h"
 #include "util/u_math.h"
+#include "util/u_bitmask.h"
 
 union tgsi_any_token {
    struct tgsi_header header;
@@ -46,8 +47,9 @@ union tgsi_any_token {
    struct tgsi_declaration decl;
    struct tgsi_declaration_range decl_range;
    struct tgsi_declaration_dimension decl_dim;
+   struct tgsi_declaration_interp decl_interp;
    struct tgsi_declaration_semantic decl_semantic;
-   struct tgsi_declaration_resource decl_resource;
+   struct tgsi_declaration_sampler_view decl_sampler_view;
    struct tgsi_immediate imm;
    union  tgsi_immediate_data imm_data;
    struct tgsi_instruction insn;
@@ -74,7 +76,6 @@ struct ureg_tokens {
 #define UREG_MAX_OUTPUT PIPE_MAX_ATTRIBS
 #define UREG_MAX_CONSTANT_RANGE 32
 #define UREG_MAX_IMMEDIATE 256
-#define UREG_MAX_TEMP 256
 #define UREG_MAX_ADDR 2
 #define UREG_MAX_PRED 1
 
@@ -147,10 +148,11 @@ struct ureg_program
       unsigned return_type_y;
       unsigned return_type_z;
       unsigned return_type_w;
-   } resource[PIPE_MAX_SHADER_RESOURCES];
-   unsigned nr_resources;
+   } sampler_view[PIPE_MAX_SHADER_SAMPLER_VIEWS];
+   unsigned nr_sampler_views;
 
-   unsigned temps_active[UREG_MAX_TEMP / 32];
+   struct util_bitmask *free_temps;
+   struct util_bitmask *local_temps;
    unsigned nr_temps;
 
    struct const_decl const_decls;
@@ -529,43 +531,48 @@ out:
    return ureg_src_register(TGSI_FILE_CONSTANT, index);
 }
 
-
-/* Allocate a new temporary.  Temporaries greater than UREG_MAX_TEMP
- * are legal, but will not be released.
- */
-struct ureg_dst ureg_DECL_temporary( struct ureg_program *ureg )
+static struct ureg_dst alloc_temporary( struct ureg_program *ureg,
+                                        boolean local )
 {
    unsigned i;
 
-   for (i = 0; i < UREG_MAX_TEMP; i += 32) {
-      int bit = ffs(~ureg->temps_active[i/32]);
-      if (bit != 0) {
-         i += bit - 1;
-         goto out;
-      }
+   /* Reuse a released temporary, but only one whose local_temps bit equals
+    * 'local': emit_decls() derives each register's LOCAL flag from it. */
+   for (i = util_bitmask_get_first_index(ureg->free_temps);
+        i != UTIL_BITMASK_INVALID_INDEX;
+        i = util_bitmask_get_next_index(ureg->free_temps, i + 1)) {
+      if (util_bitmask_get(ureg->local_temps, i) == local)
+         break;
    }
 
-   /* No reusable temps, so allocate a new one:
+   /* No released temporary qualified: hand out the next unused index.
     */
-   i = ureg->nr_temps++;
+   if (i == UTIL_BITMASK_INVALID_INDEX)
+      i = ureg->nr_temps++;
 
-out:
-   if (i < UREG_MAX_TEMP)
-      ureg->temps_active[i/32] |= 1 << (i % 32);
+   util_bitmask_clear(ureg->free_temps, i);
 
-   if (i >= ureg->nr_temps)
-      ureg->nr_temps = i + 1;
+   if (local)
+      util_bitmask_set(ureg->local_temps, i);
 
    return ureg_dst_register( TGSI_FILE_TEMPORARY, i );
 }
 
+struct ureg_dst ureg_DECL_temporary( struct ureg_program *ureg )
+{
+   return alloc_temporary(ureg, FALSE);
+}
+/* As ureg_DECL_temporary(), but the register is emitted with Local set. */
+struct ureg_dst ureg_DECL_local_temporary( struct ureg_program *ureg )
+{
+   return alloc_temporary(ureg, TRUE);
+}
 
 void ureg_release_temporary( struct ureg_program *ureg,
                              struct ureg_dst tmp )
 {
    if(tmp.File == TGSI_FILE_TEMPORARY)
-      if (tmp.Index < UREG_MAX_TEMP)
-         ureg->temps_active[tmp.Index/32] &= ~(1 << (tmp.Index % 32));
+      util_bitmask_set(ureg->free_temps, tmp.Index);
 }
 
 
@@ -615,34 +622,34 @@ struct ureg_src ureg_DECL_sampler( struct ureg_program *ureg,
 }
 
 /*
- * Allocate a new shader resource.
+ * Allocate a new shader sampler view.
  */
 struct ureg_src
-ureg_DECL_resource(struct ureg_program *ureg,
-                   unsigned index,
-                   unsigned target,
-                   unsigned return_type_x,
-                   unsigned return_type_y,
-                   unsigned return_type_z,
-                   unsigned return_type_w)
+ureg_DECL_sampler_view(struct ureg_program *ureg,
+                       unsigned index,
+                       unsigned target,
+                       unsigned return_type_x,
+                       unsigned return_type_y,
+                       unsigned return_type_z,
+                       unsigned return_type_w)
 {
-   struct ureg_src reg = ureg_src_register(TGSI_FILE_RESOURCE, index);
+   struct ureg_src reg = ureg_src_register(TGSI_FILE_SAMPLER_VIEW, index);
    uint i;
 
-   for (i = 0; i < ureg->nr_resources; i++) {
-      if (ureg->resource[i].index == index) {
+   for (i = 0; i < ureg->nr_sampler_views; i++) {
+      if (ureg->sampler_view[i].index == index) {
          return reg;
       }
    }
 
-   if (i < PIPE_MAX_SHADER_RESOURCES) {
-      ureg->resource[i].index = index;
-      ureg->resource[i].target = target;
-      ureg->resource[i].return_type_x = return_type_x;
-      ureg->resource[i].return_type_y = return_type_y;
-      ureg->resource[i].return_type_z = return_type_z;
-      ureg->resource[i].return_type_w = return_type_w;
-      ureg->nr_resources++;
+   if (i < PIPE_MAX_SHADER_SAMPLER_VIEWS) {
+      ureg->sampler_view[i].index = index;
+      ureg->sampler_view[i].target = target;
+      ureg->sampler_view[i].return_type_x = return_type_x;
+      ureg->sampler_view[i].return_type_y = return_type_y;
+      ureg->sampler_view[i].return_type_z = return_type_z;
+      ureg->sampler_view[i].return_type_w = return_type_w;
+      ureg->nr_sampler_views++;
       return reg;
    }
 
@@ -891,7 +898,7 @@ ureg_emit_dst( struct ureg_program *ureg,
    assert(dst.File != TGSI_FILE_CONSTANT);
    assert(dst.File != TGSI_FILE_INPUT);
    assert(dst.File != TGSI_FILE_SAMPLER);
-   assert(dst.File != TGSI_FILE_RESOURCE);
+   assert(dst.File != TGSI_FILE_SAMPLER_VIEW);
    assert(dst.File != TGSI_FILE_IMMEDIATE);
    assert(dst.File < TGSI_FILE_COUNT);
 
@@ -1229,28 +1236,50 @@ emit_decl_fs(struct ureg_program *ureg,
              unsigned cylindrical_wrap,
              unsigned centroid)
 {
-   union tgsi_any_token *out = get_tokens(ureg, DOMAIN_DECL, 3);
+   union tgsi_any_token *out = get_tokens(ureg, DOMAIN_DECL, 4);
 
    out[0].value = 0;
    out[0].decl.Type = TGSI_TOKEN_TYPE_DECLARATION;
-   out[0].decl.NrTokens = 3;
+   out[0].decl.NrTokens = 4;
    out[0].decl.File = file;
    out[0].decl.UsageMask = TGSI_WRITEMASK_XYZW; /* FIXME! */
-   out[0].decl.Interpolate = interpolate;
+   out[0].decl.Interpolate = 1;
    out[0].decl.Semantic = 1;
-   out[0].decl.CylindricalWrap = cylindrical_wrap;
-   out[0].decl.Centroid = centroid;
 
    out[1].value = 0;
    out[1].decl_range.First = index;
    out[1].decl_range.Last = index;
 
    out[2].value = 0;
-   out[2].decl_semantic.Name = semantic_name;
-   out[2].decl_semantic.Index = semantic_index;
+   out[2].decl_interp.Interpolate = interpolate;
+   out[2].decl_interp.CylindricalWrap = cylindrical_wrap;
+   out[2].decl_interp.Centroid = centroid;
+
+   out[3].value = 0;
+   out[3].decl_semantic.Name = semantic_name;
+   out[3].decl_semantic.Index = semantic_index;
 }
 
 
+static void emit_decl( struct ureg_program *ureg,
+                       unsigned file,
+                       unsigned index,
+                       boolean local )
+{
+   union tgsi_any_token *out = get_tokens( ureg, DOMAIN_DECL, 2 );
+   /* Declares register 'index' of 'file' alone.  Token 0: the header. */
+   out[0].value = 0;
+   out[0].decl.Type = TGSI_TOKEN_TYPE_DECLARATION;
+   out[0].decl.NrTokens = 2;
+   out[0].decl.File = file;
+   out[0].decl.UsageMask = TGSI_WRITEMASK_XYZW;
+   out[0].decl.Local = local;
+   /* Token 1: the range, First == Last == index. */
+   out[1].value = 0;
+   out[1].decl_range.First = index;
+   out[1].decl_range.Last = index;
+}
+
 static void emit_decl_range( struct ureg_program *ureg,
                              unsigned file,
                              unsigned first,
@@ -1263,7 +1292,6 @@ static void emit_decl_range( struct ureg_program *ureg,
    out[0].decl.NrTokens = 2;
    out[0].decl.File = file;
    out[0].decl.UsageMask = TGSI_WRITEMASK_XYZW;
-   out[0].decl.Interpolate = TGSI_INTERPOLATE_CONSTANT;
    out[0].decl.Semantic = 0;
 
    out[1].value = 0;
@@ -1285,7 +1313,6 @@ emit_decl_range2D(struct ureg_program *ureg,
    out[0].decl.NrTokens = 3;
    out[0].decl.File = file;
    out[0].decl.UsageMask = TGSI_WRITEMASK_XYZW;
-   out[0].decl.Interpolate = TGSI_INTERPOLATE_CONSTANT;
    out[0].decl.Dimension = 1;
 
    out[1].value = 0;
@@ -1297,33 +1324,32 @@ emit_decl_range2D(struct ureg_program *ureg,
 }
 
 static void
-emit_decl_resource(struct ureg_program *ureg,
-                   unsigned index,
-                   unsigned target,
-                   unsigned return_type_x,
-                   unsigned return_type_y,
-                   unsigned return_type_z,
-                   unsigned return_type_w )
+emit_decl_sampler_view(struct ureg_program *ureg,
+                       unsigned index,
+                       unsigned target,
+                       unsigned return_type_x,
+                       unsigned return_type_y,
+                       unsigned return_type_z,
+                       unsigned return_type_w )
 {
    union tgsi_any_token *out = get_tokens(ureg, DOMAIN_DECL, 3);
 
    out[0].value = 0;
    out[0].decl.Type = TGSI_TOKEN_TYPE_DECLARATION;
    out[0].decl.NrTokens = 3;
-   out[0].decl.File = TGSI_FILE_RESOURCE;
+   out[0].decl.File = TGSI_FILE_SAMPLER_VIEW;
    out[0].decl.UsageMask = 0xf;
-   out[0].decl.Interpolate = TGSI_INTERPOLATE_CONSTANT;
 
    out[1].value = 0;
    out[1].decl_range.First = index;
    out[1].decl_range.Last = index;
 
    out[2].value = 0;
-   out[2].decl_resource.Resource    = target;
-   out[2].decl_resource.ReturnTypeX = return_type_x;
-   out[2].decl_resource.ReturnTypeY = return_type_y;
-   out[2].decl_resource.ReturnTypeZ = return_type_z;
-   out[2].decl_resource.ReturnTypeW = return_type_w;
+   out[2].decl_sampler_view.Resource    = target;
+   out[2].decl_sampler_view.ReturnTypeX = return_type_x;
+   out[2].decl_sampler_view.ReturnTypeY = return_type_y;
+   out[2].decl_sampler_view.ReturnTypeZ = return_type_z;
+   out[2].decl_sampler_view.ReturnTypeW = return_type_w;
 }
 
 static void
@@ -1473,14 +1499,14 @@ static void emit_decls( struct ureg_program *ureg )
                        ureg->sampler[i].Index, 1 );
    }
 
-   for (i = 0; i < ureg->nr_resources; i++) {
-      emit_decl_resource(ureg,
-                         ureg->resource[i].index,
-                         ureg->resource[i].target,
-                         ureg->resource[i].return_type_x,
-                         ureg->resource[i].return_type_y,
-                         ureg->resource[i].return_type_z,
-                         ureg->resource[i].return_type_w);
+   for (i = 0; i < ureg->nr_sampler_views; i++) {
+      emit_decl_sampler_view(ureg,
+                             ureg->sampler_view[i].index,
+                             ureg->sampler_view[i].target,
+                             ureg->sampler_view[i].return_type_x,
+                             ureg->sampler_view[i].return_type_y,
+                             ureg->sampler_view[i].return_type_z,
+                             ureg->sampler_view[i].return_type_w);
    }
 
    if (ureg->const_decls.nr_constant_ranges) {
@@ -1508,10 +1534,9 @@ static void emit_decls( struct ureg_program *ureg )
       }
    }
 
-   if (ureg->nr_temps) {
-      emit_decl_range( ureg,
-                       TGSI_FILE_TEMPORARY,
-                       0, ureg->nr_temps );
+   for (i = 0; i < ureg->nr_temps; i++) {
+      emit_decl( ureg, TGSI_FILE_TEMPORARY, i,
+                 util_bitmask_get(ureg->local_temps, i) );
    }
 
    if (ureg->nr_addrs) {
@@ -1668,7 +1693,21 @@ struct ureg_program *ureg_create( unsigned processor )
    ureg->property_gs_input_prim = ~0;
    ureg->property_gs_output_prim = ~0;
    ureg->property_gs_max_vertices = ~0;
+
+   ureg->free_temps = util_bitmask_create();
+   if (ureg->free_temps == NULL)
+      goto fail;
+
+   ureg->local_temps = util_bitmask_create();
+   if (ureg->local_temps == NULL)
+      goto fail;
+
    return ureg;
+
+fail:
+   FREE(ureg->free_temps);
+   FREE(ureg);
+   return NULL;
 }
 
 
@@ -1681,6 +1720,9 @@ void ureg_destroy( struct ureg_program *ureg )
           ureg->domain[i].tokens != error_tokens)
          FREE(ureg->domain[i].tokens);
    }
-   
+
+   util_bitmask_destroy(ureg->free_temps);
+   util_bitmask_destroy(ureg->local_temps);
+
    FREE(ureg);
 }
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.h b/src/gallium/auxiliary/tgsi/tgsi_ureg.h
index 07ab8cba0ba..e6131f25aae 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.h
@@ -272,6 +272,14 @@ ureg_DECL_constant( struct ureg_program *,
 struct ureg_dst
 ureg_DECL_temporary( struct ureg_program * );
 
+/**
+ * Emit a temporary with the LOCAL declaration flag set.  For use when
+ * the register value is not required to be preserved across
+ * subroutine boundaries.
+ */
+struct ureg_dst
+ureg_DECL_local_temporary( struct ureg_program * );
+
 void 
 ureg_release_temporary( struct ureg_program *ureg,
                         struct ureg_dst tmp );
@@ -292,13 +300,13 @@ ureg_DECL_sampler( struct ureg_program *,
                    unsigned index );
 
 struct ureg_src
-ureg_DECL_resource(struct ureg_program *,
-                   unsigned index,
-                   unsigned target,
-                   unsigned return_type_x,
-                   unsigned return_type_y,
-                   unsigned return_type_z,
-                   unsigned return_type_w );
+ureg_DECL_sampler_view(struct ureg_program *,
+                       unsigned index,
+                       unsigned target,
+                       unsigned return_type_x,
+                       unsigned return_type_y,
+                       unsigned return_type_z,
+                       unsigned return_type_w );
 
 
 static INLINE struct ureg_src
diff --git a/src/gallium/auxiliary/util/u_format.h b/src/gallium/auxiliary/util/u_format.h
index bd4e51d279f..1718fb5e240 100644
--- a/src/gallium/auxiliary/util/u_format.h
+++ b/src/gallium/auxiliary/util/u_format.h
@@ -549,6 +549,19 @@ util_format_colormask(const struct util_format_description *desc)
 }
 
 
+/**
+ * Checks if color mask covers every channel for the specified format
+ *
+ * @param desc       a format description to check colormask with
+ * @param colormask  a bit mask for channels, matches format of PIPE_MASK_RGBA
+ */
+static INLINE boolean
+util_format_colormask_full(const struct util_format_description *desc, unsigned colormask)
+{
+   return (~colormask & util_format_colormask(desc)) == 0;
+}
+
+
 boolean
 util_format_is_float(enum pipe_format format);
 
diff --git a/src/gallium/auxiliary/util/u_pstipple.c b/src/gallium/auxiliary/util/u_pstipple.c
index ac0df8c1a9c..3a91b1da138 100644
--- a/src/gallium/auxiliary/util/u_pstipple.c
+++ b/src/gallium/auxiliary/util/u_pstipple.c
@@ -298,12 +298,13 @@ pstip_transform_inst(struct tgsi_transform_context *ctx,
          /* declare new position input reg */
          decl = tgsi_default_full_declaration();
          decl.Declaration.File = TGSI_FILE_INPUT;
-         decl.Declaration.Interpolate = TGSI_INTERPOLATE_LINEAR;
+         decl.Declaration.Interpolate = 1;
          decl.Declaration.Semantic = 1;
          decl.Semantic.Name = TGSI_SEMANTIC_POSITION;
          decl.Semantic.Index = 0;
          decl.Range.First = 
             decl.Range.Last = wincoordInput;
+         decl.Interp.Interpolate = TGSI_INTERPOLATE_LINEAR;
          ctx->emit_declaration(ctx, &decl);
       }
 
diff --git a/src/gallium/auxiliary/util/u_string.h b/src/gallium/auxiliary/util/u_string.h
index cc7992d7391..ed15981f1a5 100644
--- a/src/gallium/auxiliary/util/u_string.h
+++ b/src/gallium/auxiliary/util/u_string.h
@@ -48,6 +48,21 @@
 extern "C" {
 #endif
 
+#ifdef _GNU_SOURCE
+
+#define util_strchrnul strchrnul
+
+#else
+
+static INLINE char *
+util_strchrnul(const char *s, char c)
+{
+   for (; *s && *s != c; ++s);
+
+   return (char *)s;
+}
+
+#endif
 
 #ifdef WIN32
 
@@ -72,12 +87,9 @@ util_sprintf(char *str, const char *format, ...)
 static INLINE char *
 util_strchr(const char *s, char c)
 {
-   while(*s) {
-      if(*s == c)
-	 return (char *)s;
-      ++s;
-   }
-   return NULL;
+   char *p = util_strchrnul(s, c);
+
+   return *p ? p : NULL;
 }
 
 static INLINE char*
diff --git a/src/gallium/docs/source/context.rst b/src/gallium/docs/source/context.rst
index b2872cd282f..d17ea4289da 100644
--- a/src/gallium/docs/source/context.rst
+++ b/src/gallium/docs/source/context.rst
@@ -120,6 +120,23 @@ to the array index which is used for sampling.
 * ``sampler_view_destroy`` destroys a sampler view and releases its reference
   to associated texture.
 
+Shader Resources
+^^^^^^^^^^^^^^^^
+
+Shader resources are textures or buffers that may be read or written
+from a shader without an associated sampler.  This means that they
+have no support for floating point coordinates, address wrap modes or
+filtering.
+
+Shader resources are specified for all the shader stages at once using
+the ``set_shader_resources`` method.  When binding texture resources,
+the ``level``, ``first_layer`` and ``last_layer`` pipe_surface fields
+specify the mipmap level and the range of layers the texture will be
+constrained to.  In the case of buffers, ``first_element`` and
+``last_element`` specify the range within the buffer that will be used
+by the shader resource.  Writes to a shader resource are only allowed
+when the ``writable`` flag is set.
+
 Surfaces
 ^^^^^^^^
 
@@ -542,3 +559,44 @@ These flags control the behavior of a transfer object.
 ``PIPE_TRANSFER_FLUSH_EXPLICIT``
   Written ranges will be notified later with :ref:`transfer_flush_region`.
   Cannot be used with ``PIPE_TRANSFER_READ``.
+
+
+Compute kernel execution
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+A compute program can be defined, bound or destroyed using
+``create_compute_state``, ``bind_compute_state`` or
+``destroy_compute_state`` respectively.
+
+Any of the subroutines contained within the compute program can be
+executed on the device using the ``launch_grid`` method.  This method
+will execute as many instances of the program as elements in the
+specified N-dimensional grid, hopefully in parallel.
+
+The compute program has access to four special resources:
+
+* ``GLOBAL`` represents a memory space shared among all the threads
+  running on the device.  An arbitrary buffer created with the
+  ``PIPE_BIND_GLOBAL`` flag can be mapped into it using the
+  ``set_global_binding`` method.
+
+* ``LOCAL`` represents a memory space shared among all the threads
+  running in the same working group.  The initial contents of this
+  resource are undefined.
+
+* ``PRIVATE`` represents a memory space local to a single thread.
+  The initial contents of this resource are undefined.
+
+* ``INPUT`` represents a read-only memory space that can be
+  initialized at ``launch_grid`` time.
+
+These resources use a byte-based addressing scheme, and they can be
+accessed from the compute program by means of the LOAD/STORE TGSI
+opcodes.  Additional resources to be accessed using the same opcodes
+may be specified by the user with the ``set_compute_resources``
+method.
+
+In addition, normal texture sampling is allowed from the compute
+program: ``bind_compute_sampler_states`` may be used to set up texture
+samplers for the compute stage and ``set_compute_sampler_views`` may
+be used to bind a number of sampler views to it.
diff --git a/src/gallium/docs/source/screen.rst b/src/gallium/docs/source/screen.rst
index f6c6f3fd119..ff63ce83bea 100644
--- a/src/gallium/docs/source/screen.rst
+++ b/src/gallium/docs/source/screen.rst
@@ -110,6 +110,8 @@ The integer capabilities:
 * ``PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY``: This CAP describes
   a hw limitation.  If true, pipe_vertex_element::src_offset must always be
   aligned to 4.  If false, there are no restrictions on src_offset.
+* ``PIPE_CAP_COMPUTE``: Whether the implementation supports the
+  compute entry points defined in pipe_context and pipe_screen.
 * ``PIPE_CAP_USER_INDEX_BUFFERS``: Whether user index buffers are supported.
   If not, the state tracker must upload all indices which are not in hw
   resources.
@@ -192,8 +194,33 @@ to be 0.
   If unsupported, only float opcodes are supported.
 * ``PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS``: THe maximum number of texture
   samplers.
+* ``PIPE_SHADER_CAP_PREFERRED_IR``: Preferred representation of the
+  program.  It should be one of the ``pipe_shader_ir`` enum values.
 
 
+.. _pipe_compute_cap:
+
+PIPE_COMPUTE_CAP_*
+^^^^^^^^^^^^^^^^^^
+
+Compute-specific capabilities. They can be queried using
+pipe_screen::get_compute_param.
+
+* ``PIPE_COMPUTE_CAP_GRID_DIMENSION``: Number of supported dimensions
+  for grid and block coordinates.  Value type: ``uint64_t``.
+* ``PIPE_COMPUTE_CAP_MAX_GRID_SIZE``: Maximum grid size in block
+  units.  Value type: ``uint64_t []``.
+* ``PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE``: Maximum block size in thread
+  units.  Value type: ``uint64_t []``.
+* ``PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE``: Maximum size of the GLOBAL
+  resource.  Value type: ``uint64_t``.
+* ``PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE``: Maximum size of the LOCAL
+  resource.  Value type: ``uint64_t``.
+* ``PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE``: Maximum size of the PRIVATE
+  resource.  Value type: ``uint64_t``.
+* ``PIPE_COMPUTE_CAP_MAX_INPUT_SIZE``: Maximum size of the INPUT
+  resource.  Value type: ``uint64_t``.
+
 .. _pipe_bind:
 
 PIPE_BIND_*
@@ -231,6 +258,12 @@ resources might be created and handled quite differently.
 * ``PIPE_BIND_SCANOUT``: A front color buffer or scanout buffer.
 * ``PIPE_BIND_SHARED``: A sharable buffer that can be given to another
   process.
+* ``PIPE_BIND_GLOBAL``: A buffer that can be mapped into the global
+  address space of a compute program.
+* ``PIPE_BIND_SHADER_RESOURCE``: A buffer or texture that can be
+  bound to the graphics pipeline as a shader resource.
+* ``PIPE_BIND_COMPUTE_RESOURCE``: A buffer or texture that can be
+  bound to the compute program as a shader resource.
 
 .. _pipe_usage:
 
diff --git a/src/gallium/docs/source/tgsi.rst b/src/gallium/docs/source/tgsi.rst
index 48e68968346..548a9a39855 100644
--- a/src/gallium/docs/source/tgsi.rst
+++ b/src/gallium/docs/source/tgsi.rst
@@ -1312,28 +1312,36 @@ This opcode is the inverse of :opcode:`DFRACEXP`.
    dst.zw = \sqrt{src.zw}
 
 
-.. _resourceopcodes:
+.. _samplingopcodes:
 
-Resource Access Opcodes
-^^^^^^^^^^^^^^^^^^^^^^^^
+Resource Sampling Opcodes
+^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Those opcodes follow very closely semantics of the respective Direct3D
 instructions. If in doubt double check Direct3D documentation.
 
-.. opcode:: LOAD - Simplified alternative to the "SAMPLE" instruction.
-               Using the provided integer address, LOAD fetches data
-               from the specified buffer/texture without any filtering.
+.. opcode:: SAMPLE - Using provided address, sample data from the
+               specified texture using the filtering mode identified
+               by the given sampler. The source data may come from
+               any resource type other than buffers.
+               SAMPLE dst, address, sampler_view, sampler
+               e.g.
+               SAMPLE TEMP[0], TEMP[1], SVIEW[0], SAMP[0]
+
+.. opcode:: SAMPLE_I - Simplified alternative to the SAMPLE instruction.
+               Using the provided integer address, SAMPLE_I fetches data
+               from the specified sampler view without any filtering.
                The source data may come from any resource type other
                than CUBE.
-               LOAD dst, address, resource
+               SAMPLE_I dst, address, sampler_view
                e.g.
-               LOAD TEMP[0], TEMP[1], RES[0]
+               SAMPLE_I TEMP[0], TEMP[1], SVIEW[0]
                The 'address' is specified as unsigned integers. If the
                'address' is out of range [0...(# texels - 1)] the
                result of the fetch is always 0 in all components.
                As such the instruction doesn't honor address wrap
                modes, in cases where that behavior is desirable
-               'sample' instruction should be used.
+               'SAMPLE' instruction should be used.
                address.w always provides an unsigned integer mipmap
                level. If the value is out of the range then the
                instruction always returns 0 in all components.
@@ -1348,7 +1356,7 @@ instructions. If in doubt double check Direct3D documentation.
                For 2D texture arrays address.z provides the array
                index, otherwise it exhibits the same behavior as in
                the case for 1D texture arrays.
-               The exeact semantics of the source address are presented
+               The exact semantics of the source address are presented
                in the table below:
                resource type         X     Y     Z       W
                -------------         ------------------------
@@ -1364,25 +1372,16 @@ instructions. If in doubt double check Direct3D documentation.
                Where 'mpl' is a mipmap level and 'idx' is the
                array index.
 
-
-.. opcode:: LOAD_MS - Just like LOAD but allows fetch data from
+.. opcode:: SAMPLE_I_MS - Just like SAMPLE_I but allows fetching data from
                multi-sampled surfaces.
 
-.. opcode:: SAMPLE - Using provided address, sample data from the
-               specified texture using the filtering mode identified
-               by the gven sampler. The source data may come from
-               any resource type other than buffers.
-               SAMPLE dst, address, resource, sampler
-               e.g.
-               SAMPLE TEMP[0], TEMP[1], RES[0], SAMP[0]
-
 .. opcode:: SAMPLE_B - Just like the SAMPLE instruction with the
                exception that an additiona bias is applied to the
                level of detail computed as part of the instruction
                execution.
-               SAMPLE_B dst, address, resource, sampler, lod_bias
+               SAMPLE_B dst, address, sampler_view, sampler, lod_bias
                e.g.
-               SAMPLE_B TEMP[0], TEMP[1], RES[0], SAMP[0], TEMP[2].x
+               SAMPLE_B TEMP[0], TEMP[1], SVIEW[0], SAMP[0], TEMP[2].x
 
 .. opcode:: SAMPLE_C - Similar to the SAMPLE instruction but it
                performs a comparison filter. The operands to SAMPLE_C
@@ -1394,33 +1393,32 @@ instructions. If in doubt double check Direct3D documentation.
                reference value against the red component value for the
                surce resource at each texel that the currently configured
                texture filter covers based on the provided coordinates.
-               SAMPLE_C dst, address, resource.r, sampler, ref_value
+               SAMPLE_C dst, address, sampler_view.r, sampler, ref_value
                e.g.
-               SAMPLE_C TEMP[0], TEMP[1], RES[0].r, SAMP[0], TEMP[2].x
+               SAMPLE_C TEMP[0], TEMP[1], SVIEW[0].r, SAMP[0], TEMP[2].x
 
 .. opcode:: SAMPLE_C_LZ - Same as SAMPLE_C, but LOD is 0 and derivatives
                are ignored. The LZ stands for level-zero.
-               SAMPLE_C_LZ dst, address, resource.r, sampler, ref_value
+               SAMPLE_C_LZ dst, address, sampler_view.r, sampler, ref_value
                e.g.
-               SAMPLE_C_LZ TEMP[0], TEMP[1], RES[0].r, SAMP[0], TEMP[2].x
+               SAMPLE_C_LZ TEMP[0], TEMP[1], SVIEW[0].r, SAMP[0], TEMP[2].x
 
 
 .. opcode:: SAMPLE_D - SAMPLE_D is identical to the SAMPLE opcode except
                that the derivatives for the source address in the x
                direction and the y direction are provided by extra
                parameters.
-               SAMPLE_D dst, address, resource, sampler, der_x, der_y
+               SAMPLE_D dst, address, sampler_view, sampler, der_x, der_y
                e.g.
-               SAMPLE_D TEMP[0], TEMP[1], RES[0], SAMP[0], TEMP[2], TEMP[3]
+               SAMPLE_D TEMP[0], TEMP[1], SVIEW[0], SAMP[0], TEMP[2], TEMP[3]
 
 .. opcode:: SAMPLE_L - SAMPLE_L is identical to the SAMPLE opcode except
                that the LOD is provided directly as a scalar value,
                representing no anisotropy. Source addresses A channel
                is used as the LOD.
-               SAMPLE_L dst, address, resource, sampler
+               SAMPLE_L dst, address, sampler_view, sampler
                e.g.
-               SAMPLE_L TEMP[0], TEMP[1], RES[0], SAMP[0]
-
+               SAMPLE_L TEMP[0], TEMP[1], SVIEW[0], SAMP[0]
 
 .. opcode:: GATHER4 - Gathers the four texels to be used in a bi-linear
                filtering operation and packs them into a single register.
@@ -1435,18 +1433,18 @@ instructions. If in doubt double check Direct3D documentation.
                the magnitude of the deltas are half a texel.
 
 
-.. opcode:: RESINFO - query the dimensions of a given input buffer.
+.. opcode:: SVIEWINFO - query the dimensions of a given sampler view.
                dst receives width, height, depth or array size and
                number of mipmap levels. The dst can have a writemask
                which will specify what info is the caller interested
                in.
-               RESINFO dst, src_mip_level, resource
+               SVIEWINFO dst, src_mip_level, sampler_view
                e.g.
-               RESINFO TEMP[0], TEMP[1].x, RES[0]
+               SVIEWINFO TEMP[0], TEMP[1].x, SVIEW[0]
                src_mip_level is an unsigned integer scalar. If it's
                out of range then returns 0 for width, height and
                depth/array size but the total number of mipmap is
-               still returned correctly for the given resource.
+               still returned correctly for the given sampler view.
                The returned width, height and depth values are for
                the mipmap level selected by the src_mip_level and
                are in the number of texels.
@@ -1463,6 +1461,272 @@ instructions. If in doubt double check Direct3D documentation.
                not a render target, the result is 0.
 
 
+.. _resourceopcodes:
+
+Resource Access Opcodes
+^^^^^^^^^^^^^^^^^^^^^^^
+
+.. opcode:: LOAD - Fetch data from a shader resource
+
+               Syntax: ``LOAD dst, resource, address``
+
+               Example: ``LOAD TEMP[0], RES[0], TEMP[1]``
+
+               Using the provided integer address, LOAD fetches data
+               from the specified buffer or texture without any
+               filtering.
+
+               The 'address' is specified as a vector of unsigned
+               integers.  If the 'address' is out of range the result
+               is unspecified.
+
+               Only the first mipmap level of a resource can be read
+               from using this instruction.
+
+               For 1D or 2D texture arrays, the array index is
+               provided as an unsigned integer in address.y or
+               address.z, respectively.  address.yz are ignored for
+               buffers and 1D textures.  address.z is ignored for 1D
+               texture arrays and 2D textures.  address.w is always
+               ignored.
+
+.. opcode:: STORE - Write data to a shader resource
+
+               Syntax: ``STORE resource, address, src``
+
+               Example: ``STORE RES[0], TEMP[0], TEMP[1]``
+
+               Using the provided integer address, STORE writes data
+               to the specified buffer or texture.
+
+               The 'address' is specified as a vector of unsigned
+               integers.  If the 'address' is out of range the result
+               is unspecified.
+
+               Only the first mipmap level of a resource can be
+               written to using this instruction.
+
+               For 1D or 2D texture arrays, the array index is
+               provided as an unsigned integer in address.y or
+               address.z, respectively.  address.yz are ignored for
+               buffers and 1D textures.  address.z is ignored for 1D
+               texture arrays and 2D textures.  address.w is always
+               ignored.
+
+
+.. _threadsyncopcodes:
+
+Inter-thread synchronization opcodes
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+These opcodes are intended for communication between threads running
+within the same compute grid.  For now they're only valid in compute
+programs.
+
+.. opcode:: MFENCE - Memory fence
+
+  Syntax: ``MFENCE resource``
+
+  Example: ``MFENCE RES[0]``
+
+  This opcode forces strong ordering between any memory access
+  operations that affect the specified resource.  This means that
+  previous loads and stores (and only those) will be performed and
+  visible to other threads before the program execution continues.
+
+
+.. opcode:: LFENCE - Load memory fence
+
+  Syntax: ``LFENCE resource``
+
+  Example: ``LFENCE RES[0]``
+
+  Similar to MFENCE, but it only affects the ordering of memory loads.
+
+
+.. opcode:: SFENCE - Store memory fence
+
+  Syntax: ``SFENCE resource``
+
+  Example: ``SFENCE RES[0]``
+
+  Similar to MFENCE, but it only affects the ordering of memory stores.
+
+
+.. opcode:: BARRIER - Thread group barrier
+
+  ``BARRIER``
+
+  This opcode suspends the execution of the current thread until all
+  the remaining threads in the working group reach the same point of
+  the program.  Results are unspecified if any of the remaining
+  threads terminates or never reaches an executed BARRIER instruction.
+
+
+.. _atomopcodes:
+
+Atomic opcodes
+^^^^^^^^^^^^^^
+
+These opcodes provide atomic variants of some common arithmetic and
+logical operations.  In this context atomicity means that another
+concurrent memory access operation that affects the same memory
+location is guaranteed to be performed strictly before or after the
+entire execution of the atomic operation.
+
+For the moment they're only valid in compute programs.
+
+.. opcode:: ATOMUADD - Atomic integer addition
+
+  Syntax: ``ATOMUADD dst, resource, offset, src``
+
+  Example: ``ATOMUADD TEMP[0], RES[0], TEMP[1], TEMP[2]``
+
+  The following operation is performed atomically on each component:
+
+.. math::
+
+  dst_i = resource[offset]_i
+
+  resource[offset]_i = dst_i + src_i
+
+
+.. opcode:: ATOMXCHG - Atomic exchange
+
+  Syntax: ``ATOMXCHG dst, resource, offset, src``
+
+  Example: ``ATOMXCHG TEMP[0], RES[0], TEMP[1], TEMP[2]``
+
+  The following operation is performed atomically on each component:
+
+.. math::
+
+  dst_i = resource[offset]_i
+
+  resource[offset]_i = src_i
+
+
+.. opcode:: ATOMCAS - Atomic compare-and-exchange
+
+  Syntax: ``ATOMCAS dst, resource, offset, cmp, src``
+
+  Example: ``ATOMCAS TEMP[0], RES[0], TEMP[1], TEMP[2], TEMP[3]``
+
+  The following operation is performed atomically on each component:
+
+.. math::
+
+  dst_i = resource[offset]_i
+
+  resource[offset]_i = (dst_i == cmp_i ? src_i : dst_i)
+
+
+.. opcode:: ATOMAND - Atomic bitwise And
+
+  Syntax: ``ATOMAND dst, resource, offset, src``
+
+  Example: ``ATOMAND TEMP[0], RES[0], TEMP[1], TEMP[2]``
+
+  The following operation is performed atomically on each component:
+
+.. math::
+
+  dst_i = resource[offset]_i
+
+  resource[offset]_i = dst_i \& src_i
+
+
+.. opcode:: ATOMOR - Atomic bitwise Or
+
+  Syntax: ``ATOMOR dst, resource, offset, src``
+
+  Example: ``ATOMOR TEMP[0], RES[0], TEMP[1], TEMP[2]``
+
+  The following operation is performed atomically on each component:
+
+.. math::
+
+  dst_i = resource[offset]_i
+
+  resource[offset]_i = dst_i | src_i
+
+
+.. opcode:: ATOMXOR - Atomic bitwise Xor
+
+  Syntax: ``ATOMXOR dst, resource, offset, src``
+
+  Example: ``ATOMXOR TEMP[0], RES[0], TEMP[1], TEMP[2]``
+
+  The following operation is performed atomically on each component:
+
+.. math::
+
+  dst_i = resource[offset]_i
+
+  resource[offset]_i = dst_i \oplus src_i
+
+
+.. opcode:: ATOMUMIN - Atomic unsigned minimum
+
+  Syntax: ``ATOMUMIN dst, resource, offset, src``
+
+  Example: ``ATOMUMIN TEMP[0], RES[0], TEMP[1], TEMP[2]``
+
+  The following operation is performed atomically on each component:
+
+.. math::
+
+  dst_i = resource[offset]_i
+
+  resource[offset]_i = (dst_i < src_i ? dst_i : src_i)
+
+
+.. opcode:: ATOMUMAX - Atomic unsigned maximum
+
+  Syntax: ``ATOMUMAX dst, resource, offset, src``
+
+  Example: ``ATOMUMAX TEMP[0], RES[0], TEMP[1], TEMP[2]``
+
+  The following operation is performed atomically on each component:
+
+.. math::
+
+  dst_i = resource[offset]_i
+
+  resource[offset]_i = (dst_i > src_i ? dst_i : src_i)
+
+
+.. opcode:: ATOMIMIN - Atomic signed minimum
+
+  Syntax: ``ATOMIMIN dst, resource, offset, src``
+
+  Example: ``ATOMIMIN TEMP[0], RES[0], TEMP[1], TEMP[2]``
+
+  The following operation is performed atomically on each component:
+
+.. math::
+
+  dst_i = resource[offset]_i
+
+  resource[offset]_i = (dst_i < src_i ? dst_i : src_i)
+
+
+.. opcode:: ATOMIMAX - Atomic signed maximum
+
+  Syntax: ``ATOMIMAX dst, resource, offset, src``
+
+  Example: ``ATOMIMAX TEMP[0], RES[0], TEMP[1], TEMP[2]``
+
+  The following operation is performed atomically on each component:
+
+.. math::
+
+  dst_i = resource[offset]_i
+
+  resource[offset]_i = (dst_i > src_i ? dst_i : src_i)
+
+
+
 Explanation of symbols used
 ------------------------------
 
@@ -1531,19 +1795,17 @@ of TGSI_FILE.
 UsageMask field specifies which of the register components can be accessed
 and is one of TGSI_WRITEMASK.
 
-Interpolate field is only valid for fragment shader INPUT register files.
-It specifes the way input is being interpolated by the rasteriser and is one
-of TGSI_INTERPOLATE.
+The Local flag specifies that a given value isn't intended for
+subroutine parameter passing and, as a result, the implementation
+isn't required to give any guarantees of it being preserved across
+subroutine boundaries.  As it's merely a compiler hint, the
+implementation is free to ignore it.
 
 If Dimension flag is set to 1, a Declaration Dimension token follows.
 
 If Semantic flag is set to 1, a Declaration Semantic token follows.
 
-CylindricalWrap bitfield is only valid for fragment shader INPUT register
-files. It specifies which register components should be subject to cylindrical
-wrapping when interpolating by the rasteriser. If TGSI_CYLINDRICAL_WRAP_X
-is set to 1, the X component should be interpolated according to cylindrical
-wrapping rules.
+If Interpolate flag is set to 1, a Declaration Interpolate token follows.
 
 If file is TGSI_FILE_RESOURCE, a Declaration Resource token follows.
 
@@ -1690,12 +1952,42 @@ is a writable stencil reference value. Only the Y component is writable.
 This allows the fragment shader to change the fragments stencilref value.
 
 
-Declaration Resource
+Declaration Interpolate
+^^^^^^^^^^^^^^^^^^^^^^^
+
+This token is only valid for fragment shader INPUT declarations.
+
+The Interpolate field specifies the way input is being interpolated by
+the rasteriser and is one of TGSI_INTERPOLATE_*.
+
+The CylindricalWrap bitfield specifies which register components
+should be subject to cylindrical wrapping when interpolating by the
+rasteriser. If TGSI_CYLINDRICAL_WRAP_X is set to 1, the X component
+should be interpolated according to cylindrical wrapping rules.
+
+
+Declaration Sampler View
 ^^^^^^^^^^^^^^^^^^^^^^^^
 
+   Follows Declaration token if file is TGSI_FILE_SAMPLER_VIEW.
+
+   DCL SVIEW[#], resource, type(s)
+
+   Declares a shader input sampler view and assigns it to a SVIEW[#]
+   register.
+
+   resource can be one of BUFFER, 1D, 2D, 3D, 1DArray and 2DArray.
+
+   type must be 1 or 4 entries (if specifying on a per-component
+   level) out of UNORM, SNORM, SINT, UINT and FLOAT.
+
+
+Declaration Resource
+^^^^^^^^^^^^^^^^^^^^
+
    Follows Declaration token if file is TGSI_FILE_RESOURCE.
 
-   DCL RES[#], resource, type(s)
+   DCL RES[#], resource [, WR] [, RAW]
 
    Declares a shader input resource and assigns it to a RES[#]
    register.
@@ -1703,8 +1995,21 @@ Declaration Resource
    resource can be one of BUFFER, 1D, 2D, 3D, CUBE, 1DArray and
    2DArray.
 
-   type must be 1 or 4 entries (if specifying on a per-component
-   level) out of UNORM, SNORM, SINT, UINT and FLOAT.
+   If the RAW keyword is not specified, the texture data will be
+   subject to conversion, swizzling and scaling as required to yield
+   the specified data type from the physical data format of the bound
+   resource.
+
+   If the RAW keyword is specified, no channel conversion will be
+   performed: the values read for each of the channels (X,Y,Z,W) will
+   correspond to consecutive words in the same order and format
+   they're found in memory.  No element-to-address conversion will be
+   performed either: the value of the provided X coordinate will be
+   interpreted in byte units instead of texel units.  The result of
+   accessing a misaligned address is undefined.
+
+   Usage of the STORE opcode is only allowed if the WR (writable) flag
+   is set.
 
 
 Properties
diff --git a/src/gallium/drivers/i915/i915_fpc_translate.c b/src/gallium/drivers/i915/i915_fpc_translate.c
index 94c0c69fcae..def9a03d377 100644
--- a/src/gallium/drivers/i915/i915_fpc_translate.c
+++ b/src/gallium/drivers/i915/i915_fpc_translate.c
@@ -512,6 +512,22 @@ i915_translate_instruction(struct i915_fp_compile *p,
       emit_simple_arith(p, inst, A0_ADD, 2, fs);
       break;
 
+   case TGSI_OPCODE_CEIL:
+      src0 = src_vector(p, &inst->Src[0], fs);
+      tmp = i915_get_utemp(p);
+      flags = get_result_flags(inst);
+      i915_emit_arith(p,
+                      A0_FLR,
+                      tmp,
+                      flags & A0_DEST_CHANNEL_ALL, 0,
+                      negate(src0, 1, 1, 1, 1), 0, 0);
+      i915_emit_arith(p,
+                      A0_MOV,
+                      get_result_vector(p, &inst->Dst[0]),
+                      flags, 0,
+                      negate(tmp, 1, 1, 1, 1), 0, 0);
+      break;
+
    case TGSI_OPCODE_CMP:
       src0 = src_vector(p, &inst->Src[0], fs);
       src1 = src_vector(p, &inst->Src[1], fs);
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend.h b/src/gallium/drivers/llvmpipe/lp_bld_blend.h
index f82ae30bb7d..c0c95a27129 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_blend.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_blend.h
@@ -64,12 +64,14 @@ lp_build_blend_func(struct lp_build_context *bld,
 LLVMValueRef
 lp_build_blend_aos(struct gallivm_state *gallivm,
                    const struct pipe_blend_state *blend,
+                   const enum pipe_format *cbuf_format,
                    struct lp_type type,
                    unsigned rt,
                    LLVMValueRef src,
                    LLVMValueRef dst,
+                   LLVMValueRef mask,
                    LLVMValueRef const_,
-                   unsigned alpha_swizzle);
+                   const unsigned char swizzle[4]);
 
 
 void
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c b/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
index c342346a36e..59d5f545966 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
@@ -45,12 +45,14 @@
 
 #include "pipe/p_state.h"
 #include "util/u_debug.h"
+#include "util/u_format.h"
 
 #include "gallivm/lp_bld_type.h"
 #include "gallivm/lp_bld_const.h"
 #include "gallivm/lp_bld_arit.h"
 #include "gallivm/lp_bld_logic.h"
 #include "gallivm/lp_bld_swizzle.h"
+#include "gallivm/lp_bld_bitarit.h"
 #include "gallivm/lp_bld_debug.h"
 
 #include "lp_bld_blend.h"
@@ -300,25 +302,39 @@ lp_build_blend_func(struct lp_build_context *bld,
 }
 
 
+/**
+ * Performs blending of src and dst pixels
+ *
+ * @param blend         the blend state of the shader variant
+ * @param cbuf_format   format of the colour buffer
+ * @param type          data type of the pixel vector
+ * @param rt            rt number
+ * @param src           blend src
+ * @param dst           blend dst
+ * @param mask          optional mask to apply to the blending result
+ * @param const_        const blend color
+ * @param swizzle       swizzle values for RGBA
+ *
+ * @return the result of blending src and dst
+ */
 LLVMValueRef
 lp_build_blend_aos(struct gallivm_state *gallivm,
                    const struct pipe_blend_state *blend,
+                   const enum pipe_format *cbuf_format,
                    struct lp_type type,
                    unsigned rt,
                    LLVMValueRef src,
                    LLVMValueRef dst,
+                   LLVMValueRef mask,
                    LLVMValueRef const_,
-                   unsigned alpha_swizzle)
+                   const unsigned char swizzle[4])
 {
    struct lp_build_blend_aos_context bld;
    LLVMValueRef src_term;
    LLVMValueRef dst_term;
-
-   /* FIXME: color masking not implemented yet */
-   assert(blend->rt[rt].colormask == 0xf);
-
-   if(!blend->rt[rt].blend_enable)
-      return src;
+   LLVMValueRef result;
+   unsigned alpha_swizzle = swizzle[3];
+   boolean fullcolormask;
 
    /* Setup build context */
    memset(&bld, 0, sizeof bld);
@@ -327,30 +343,59 @@ lp_build_blend_aos(struct gallivm_state *gallivm,
    bld.dst = dst;
    bld.const_ = const_;
 
-   /* TODO: There are still a few optimization opportunities here. For certain
-    * combinations it is possible to reorder the operations and therefore saving
-    * some instructions. */
+   if (!blend->rt[rt].blend_enable) {
+      result = src;
+   } else {
+
+      /* TODO: There are still a few optimization opportunities here. For certain
+       * combinations it is possible to reorder the operations and thereby save
+       * some instructions. */
+
+      src_term = lp_build_blend_factor(&bld, src, blend->rt[rt].rgb_src_factor,
+                                       blend->rt[rt].alpha_src_factor, alpha_swizzle);
+      dst_term = lp_build_blend_factor(&bld, dst, blend->rt[rt].rgb_dst_factor,
+                                       blend->rt[rt].alpha_dst_factor, alpha_swizzle);
+
+      lp_build_name(src_term, "src_term");
+      lp_build_name(dst_term, "dst_term");
 
-   src_term = lp_build_blend_factor(&bld, src, blend->rt[rt].rgb_src_factor,
-                                    blend->rt[rt].alpha_src_factor, alpha_swizzle);
-   dst_term = lp_build_blend_factor(&bld, dst, blend->rt[rt].rgb_dst_factor,
-                                    blend->rt[rt].alpha_dst_factor, alpha_swizzle);
+      if(blend->rt[rt].rgb_func == blend->rt[rt].alpha_func) {
+         result = lp_build_blend_func(&bld.base, blend->rt[rt].rgb_func, src_term, dst_term);
+      }
+      else {
+         /* Separate RGB / A functions */
+
+         LLVMValueRef rgb;
+         LLVMValueRef alpha;
 
-   lp_build_name(src_term, "src_term");
-   lp_build_name(dst_term, "dst_term");
+         rgb   = lp_build_blend_func(&bld.base, blend->rt[rt].rgb_func,   src_term, dst_term);
+         alpha = lp_build_blend_func(&bld.base, blend->rt[rt].alpha_func, src_term, dst_term);
 
-   if(blend->rt[rt].rgb_func == blend->rt[rt].alpha_func) {
-      return lp_build_blend_func(&bld.base, blend->rt[rt].rgb_func, src_term, dst_term);
+         result = lp_build_blend_swizzle(&bld, rgb, alpha, LP_BUILD_BLEND_SWIZZLE_RGBA, alpha_swizzle);
+      }
    }
-   else {
-      /* Seperate RGB / A functions */
 
-      LLVMValueRef rgb;
-      LLVMValueRef alpha;
+   /* Check if color mask is necessary */
+   fullcolormask = util_format_colormask_full(util_format_description(cbuf_format[rt]), blend->rt[rt].colormask);
+
+   if (!fullcolormask) {
+      LLVMValueRef color_mask;
 
-      rgb   = lp_build_blend_func(&bld.base, blend->rt[rt].rgb_func,   src_term, dst_term);
-      alpha = lp_build_blend_func(&bld.base, blend->rt[rt].alpha_func, src_term, dst_term);
+      color_mask = lp_build_const_mask_aos_swizzled(gallivm, bld.base.type, blend->rt[rt].colormask, swizzle);
+      lp_build_name(color_mask, "color_mask");
 
-      return lp_build_blend_swizzle(&bld, rgb, alpha, LP_BUILD_BLEND_SWIZZLE_RGBA, alpha_swizzle);
+      /* Combine with input mask if necessary */
+      if (mask) {
+         mask = lp_build_and(&bld.base, color_mask, mask);
+      } else {
+         mask = color_mask;
+      }
+   }
+
+   /* Apply mask, if one exists */
+   if (mask) {
+      result = lp_build_select(&bld.base, mask, result, dst);
    }
+
+   return result;
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
index 71d0ddf5e75..230b80a945f 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
@@ -290,6 +290,10 @@ lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
    c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));
    rej4 = _mm_slli_epi32(rej4, 2);
 
+   /* Adjust so we can just check the sign bit (< 0 comparison), instead of having to do a less efficient <= 0 comparison */
+   c = _mm_sub_epi32(c, _mm_set1_epi32(1));
+   rej4 = _mm_add_epi32(rej4, _mm_set1_epi32(1));
+
    dcdx2 = _mm_add_epi32(dcdx, dcdx);
    dcdx3 = _mm_add_epi32(dcdx2, dcdx);
 
@@ -383,7 +387,7 @@ lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
    __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
    __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
    __m128i unused;
-   
+
    transpose4_epi32(&p0, &p1, &p2, &zero,
                     &c, &dcdx, &dcdy, &unused);
 
@@ -394,6 +398,9 @@ lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
    c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
    c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));
 
+   /* Adjust so we can just check the sign bit (< 0 comparison), instead of having to do a less efficient <= 0 comparison */
+   c = _mm_sub_epi32(c, _mm_set1_epi32(1));
+
    dcdx2 = _mm_add_epi32(dcdx, dcdx);
    dcdx3 = _mm_add_epi32(dcdx2, dcdx);
 
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
index b50c354fa9b..26d35debdaf 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
@@ -271,15 +271,13 @@ do_triangle_ccw(struct lp_setup_context *setup,
        */
       int adj = (setup->pixel_offset != 0) ? 1 : 0;
 
-      bbox.x0 = (MIN3(x[0], x[1], x[2]) + (FIXED_ONE-1)) >> FIXED_ORDER;
-      bbox.x1 = (MAX3(x[0], x[1], x[2]) + (FIXED_ONE-1)) >> FIXED_ORDER;
-      bbox.y0 = (MIN3(y[0], y[1], y[2]) + (FIXED_ONE-1) + adj) >> FIXED_ORDER;
-      bbox.y1 = (MAX3(y[0], y[1], y[2]) + (FIXED_ONE-1) + adj) >> FIXED_ORDER;
+      /* Inclusive x0, exclusive x1 */
+      bbox.x0 = MIN3(x[0], x[1], x[2]) >> FIXED_ORDER;
+      bbox.x1 = (MAX3(x[0], x[1], x[2]) - 1) >> FIXED_ORDER;
 
-      /* Inclusive coordinates:
-       */
-      bbox.x1--;
-      bbox.y1--;
+      /* Inclusive / exclusive depending upon adj (bottom-left or top-right) */
+      bbox.y0 = (MIN3(y[0], y[1], y[2]) + adj) >> FIXED_ORDER;
+      bbox.y1 = (MAX3(y[0], y[1], y[2]) - 1 + adj) >> FIXED_ORDER;
    }
 
    if (bbox.x1 < bbox.x0 ||
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index ec94190649c..2d2391e908c 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -924,6 +924,7 @@ generate_variant(struct llvmpipe_context *lp,
                  const struct lp_fragment_shader_variant_key *key)
 {
    struct lp_fragment_shader_variant *variant;
+   const struct util_format_description *cbuf0_format_desc;
    boolean fullcolormask;
 
    variant = CALLOC_STRUCT(lp_fragment_shader_variant);
@@ -942,12 +943,8 @@ generate_variant(struct llvmpipe_context *lp,
     */
    fullcolormask = FALSE;
    if (key->nr_cbufs == 1) {
-      const struct util_format_description *format_desc;
-      format_desc = util_format_description(key->cbuf_format[0]);
-      if ((~key->blend.rt[0].colormask &
-           util_format_colormask(format_desc)) == 0) {
-         fullcolormask = TRUE;
-      }
+      cbuf0_format_desc = util_format_description(key->cbuf_format[0]);
+      fullcolormask = util_format_colormask_full(cbuf0_format_desc, key->blend.rt[0].colormask);
    }
 
    variant->opaque =
diff --git a/src/gallium/drivers/llvmpipe/lp_test_blend.c b/src/gallium/drivers/llvmpipe/lp_test_blend.c
index b3ca134131d..51324cbb6a3 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_blend.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_blend.c
@@ -179,7 +179,9 @@ add_blend_test(struct gallivm_state *gallivm,
    LLVMValueRef res_ptr;
    LLVMBasicBlockRef block;
    LLVMBuilderRef builder;
+   const enum pipe_format format = PIPE_FORMAT_R8G8B8A8_UNORM;
    const unsigned rt = 0;
+   const unsigned char swizzle[4] = { 0, 1, 2, 3 };
 
    vec_type = lp_build_vec_type(gallivm, type);
 
@@ -205,7 +207,7 @@ add_blend_test(struct gallivm_state *gallivm,
       dst = LLVMBuildLoad(builder, dst_ptr, "dst");
       con = LLVMBuildLoad(builder, const_ptr, "const");
 
-      res = lp_build_blend_aos(gallivm, blend, type, rt, src, dst, con, 3);
+      res = lp_build_blend_aos(gallivm, blend, &format, type, rt, src, dst, NULL, con, swizzle);
 
       lp_build_name(res, "res");
 
diff --git a/src/gallium/drivers/nouveau/nouveau_buffer.c b/src/gallium/drivers/nouveau/nouveau_buffer.c
index ff199debd74..936e2bf246a 100644
--- a/src/gallium/drivers/nouveau/nouveau_buffer.c
+++ b/src/gallium/drivers/nouveau/nouveau_buffer.c
@@ -523,8 +523,10 @@ nouveau_scratch_runout_release(struct nouveau_context *nv)
 {
    if (!nv->scratch.nr_runout)
       return;
-   while (nv->scratch.nr_runout--)
+   do {
+      --nv->scratch.nr_runout;
       nouveau_bo_ref(NULL, &nv->scratch.runout[nv->scratch.nr_runout]);
+   } while (nv->scratch.nr_runout);
 
    FREE(nv->scratch.runout);
    nv->scratch.end = 0;
diff --git a/src/gallium/drivers/nv30/nvfx_fragprog.c b/src/gallium/drivers/nv30/nvfx_fragprog.c
index e562b454f92..592ad21c6c8 100644
--- a/src/gallium/drivers/nv30/nvfx_fragprog.c
+++ b/src/gallium/drivers/nv30/nvfx_fragprog.c
@@ -535,6 +535,11 @@ nvfx_fragprog_parse_instruction(struct nv30_context* nvfx, struct nvfx_fpc *fpc,
    case TGSI_OPCODE_ADD:
       nvfx_fp_emit(fpc, arith(sat, ADD, dst, mask, src[0], src[1], none));
       break;
+   case TGSI_OPCODE_CEIL:
+      tmp = nvfx_src(temp(fpc));
+      nvfx_fp_emit(fpc, arith(0, FLR, tmp.reg, mask, neg(src[0]), none, none));
+      nvfx_fp_emit(fpc, arith(sat, MOV, dst, mask, neg(tmp), none, none));
+      break;
    case TGSI_OPCODE_CMP:
       insn = arith(0, MOV, none.reg, mask, src[0], none, none);
       insn.cc_update = 1;
diff --git a/src/gallium/drivers/nv30/nvfx_vertprog.c b/src/gallium/drivers/nv30/nvfx_vertprog.c
index d7eb9fb0a63..82972b3943c 100644
--- a/src/gallium/drivers/nv30/nvfx_vertprog.c
+++ b/src/gallium/drivers/nv30/nvfx_vertprog.c
@@ -550,6 +550,11 @@ nvfx_vertprog_parse_instruction(struct nv30_context *nv30, struct nvfx_vpc *vpc,
    case TGSI_OPCODE_ARL:
       nvfx_vp_emit(vpc, arith(0, VEC, ARL, dst, mask, src[0], none, none));
       break;
+   case TGSI_OPCODE_CEIL:
+      tmp = nvfx_src(temp(vpc));
+      nvfx_vp_emit(vpc, arith(0, VEC, FLR, tmp.reg, mask, neg(src[0]), none, none));
+      nvfx_vp_emit(vpc, arith(sat, VEC, MOV, dst, mask, neg(tmp), none, none));
+      break;
    case TGSI_OPCODE_CMP:
       insn = arith(0, VEC, MOV, none.reg, mask, src[0], none, none);
       insn.cc_update = 1;
diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_build_util.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_build_util.cpp
index f7dac25c116..f713e6391c6 100644
--- a/src/gallium/drivers/nv50/codegen/nv50_ir_build_util.cpp
+++ b/src/gallium/drivers/nv50/codegen/nv50_ir_build_util.cpp
@@ -278,6 +278,31 @@ BuildUtil::mkSelect(Value *pred, Value *dst, Value *trSrc, Value *flSrc)
    return mkOp2(OP_UNION, typeOfSize(dst->reg.size), dst, def0, def1);
 }
 
+Instruction *
+BuildUtil::mkSplit(Value *h[2], uint8_t halfSize, Value *val)
+{ // Split val into two halves of size halfSize, stored in h[0] and h[1].
+   Instruction *insn = NULL; // returned as NULL when no OP_SPLIT is emitted
+
+   const DataType fTy = typeOfSize(halfSize * 2); // type of the full-width value
+
+   if (val->reg.file == FILE_IMMEDIATE) // move immediates into an SSA value first
+      val = mkMov(getSSA(halfSize * 2), val, fTy)->getDef(0);
+
+   if (isMemoryFile(val->reg.file)) {
+      h[0] = cloneShallow(getFunction(), val); // memory operand: no instruction, just two narrower clones
+      h[1] = cloneShallow(getFunction(), val);
+      h[0]->reg.size = halfSize;
+      h[1]->reg.size = halfSize;
+      h[1]->reg.data.offset += halfSize; // second half starts halfSize past the first
+   } else {
+      h[0] = getSSA(halfSize, val->reg.file);
+      h[1] = getSSA(halfSize, val->reg.file);
+      insn = mkOp1(OP_SPLIT, fTy, h[0], val); // h[0] becomes def 0 of the split
+      insn->setDef(1, h[1]); // h[1] becomes def 1 of the split
+   }
+   return insn;
+}
+
 FlowInstruction *
 BuildUtil::mkFlow(operation op, void *targ, CondCode cc, Value *pred)
 {
diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_build_util.h b/src/gallium/drivers/nv50/codegen/nv50_ir_build_util.h
index 9ee04dbcd12..dd7e491cb5c 100644
--- a/src/gallium/drivers/nv50/codegen/nv50_ir_build_util.h
+++ b/src/gallium/drivers/nv50/codegen/nv50_ir_build_util.h
@@ -81,6 +81,8 @@ public:
 
    Instruction *mkSelect(Value *pred, Value *dst, Value *trSrc, Value *flSrc);
 
+   Instruction *mkSplit(Value *half[2], uint8_t halfSize, Value *);
+
    void mkClobber(DataFile file, uint32_t regMask, int regUnitLog2);
 
    ImmediateValue *mkImm(float);
diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_from_tgsi.cpp
index 82e23602ca0..16f191da159 100644
--- a/src/gallium/drivers/nv50/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nv50/codegen/nv50_ir_from_tgsi.cpp
@@ -347,6 +347,7 @@ static nv50_ir::TexTarget translateTexture(uint tex)
    NV50_IR_TEX_TARG_CASE(SHADOW2D_ARRAY, 2D_ARRAY_SHADOW);
    NV50_IR_TEX_TARG_CASE(SHADOWCUBE, CUBE_SHADOW);
    NV50_IR_TEX_TARG_CASE(SHADOWRECT, RECT_SHADOW);
+   NV50_IR_TEX_TARG_CASE(BUFFER, BUFFER);
 
    case TGSI_TEXTURE_UNKNOWN:
    default:
@@ -548,7 +549,7 @@ static nv50_ir::operation translateOpcode(uint opcode)
    NV50_IR_OPCODE_CASE(SAMPLE_D, TXD);
    NV50_IR_OPCODE_CASE(SAMPLE_L, TXL);
    NV50_IR_OPCODE_CASE(GATHER4, TXG);
-   NV50_IR_OPCODE_CASE(RESINFO, TXQ);
+   NV50_IR_OPCODE_CASE(SVIEWINFO, TXQ);
 
    NV50_IR_OPCODE_CASE(END, EXIT);
 
@@ -597,8 +598,8 @@ public:
 
    int clipVertexOutput;
 
-   uint8_t *resourceTargets; // TGSI_TEXTURE_*
-   unsigned resourceCount;
+   uint8_t *samplerViewTargets; // TGSI_TEXTURE_*
+   unsigned samplerViewCount;
 
 private:
    int inferSysValDirection(unsigned sn) const;
@@ -617,7 +618,7 @@ Source::Source(struct nv50_ir_prog_info *prog) : info(prog)
    if (prog->dbgFlags & NV50_IR_DEBUG_BASIC)
       tgsi_dump(tokens, 0);
 
-   resourceTargets = NULL;
+   samplerViewTargets = NULL;
 
    mainTempsInLMem = FALSE;
 }
@@ -632,8 +633,8 @@ Source::~Source()
    if (info->immd.type)
       FREE(info->immd.type);
 
-   if (resourceTargets)
-      delete[] resourceTargets;
+   if (samplerViewTargets)
+      delete[] samplerViewTargets;
 }
 
 bool Source::scanSource()
@@ -650,8 +651,8 @@ bool Source::scanSource()
 
    clipVertexOutput = -1;
 
-   resourceCount = scan.file_max[TGSI_FILE_RESOURCE] + 1;
-   resourceTargets = new uint8_t[resourceCount];
+   samplerViewCount = scan.file_max[TGSI_FILE_SAMPLER_VIEW] + 1;
+   samplerViewTargets = new uint8_t[samplerViewCount];
 
    info->immd.bufSize = 0;
    tempArrayCount = 0;
@@ -805,7 +806,7 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
             info->in[i].si = si;
             if (info->type == PIPE_SHADER_FRAGMENT) {
                // translate interpolation mode
-               switch (decl->Declaration.Interpolate) {
+               switch (decl->Interp.Interpolate) {
                case TGSI_INTERPOLATE_CONSTANT:
                   info->in[i].flat = 1;
                   break;
@@ -818,7 +819,7 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
                default:
                   break;
                }
-               if (decl->Declaration.Centroid)
+               if (decl->Interp.Centroid)
                   info->in[i].centroid = 1;
             }
          }
@@ -874,9 +875,9 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
          info->sv[i].input = inferSysValDirection(sn);
       }
       break;
-   case TGSI_FILE_RESOURCE:
+   case TGSI_FILE_SAMPLER_VIEW:
       for (i = first; i <= last; ++i)
-         resourceTargets[i] = decl->Resource.Resource;
+         samplerViewTargets[i] = decl->SamplerView.Resource;
       break;
    case TGSI_FILE_IMMEDIATE_ARRAY:
    {
@@ -1000,13 +1001,15 @@ bool Source::scanInstruction(const struct tgsi_full_instruction *inst)
 nv50_ir::TexInstruction::Target
 Instruction::getTexture(const tgsi::Source *code, int s) const
 {
-   if (insn->Instruction.Texture) {
-      return translateTexture(insn->Texture.Texture);
-   } else {
+   switch (getSrc(s).getFile()) {
+   case TGSI_FILE_SAMPLER_VIEW: {
       // XXX: indirect access
       unsigned int r = getSrc(s).getIndex(0);
-      assert(r < code->resourceCount);
-      return translateTexture(code->resourceTargets[r]);
+      assert(r < code->samplerViewCount);
+      return translateTexture(code->samplerViewTargets[r]);
+   }
+   default:
+      return translateTexture(insn->Texture.Texture);
    }
 }
 
@@ -2042,7 +2045,7 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn)
       handleTXF(dst0, 1);
       break;
    case TGSI_OPCODE_TXQ:
-   case TGSI_OPCODE_RESINFO:
+   case TGSI_OPCODE_SVIEWINFO:
       handleTXQ(dst0, TXQ_DIMS);
       break;
    case TGSI_OPCODE_F2I:
diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_lowering_nv50.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_lowering_nv50.cpp
index 27373b4cc47..16bba0e1723 100644
--- a/src/gallium/drivers/nv50/codegen/nv50_ir_lowering_nv50.cpp
+++ b/src/gallium/drivers/nv50/codegen/nv50_ir_lowering_nv50.cpp
@@ -57,15 +57,17 @@ expandIntegerMUL(BuildUtil *bld, Instruction *mul)
 
    Instruction *i[9];
 
-   Value *a[2] = { bld->getSSA(halfSize), bld->getSSA(halfSize) };
-   Value *b[2] = { bld->getSSA(halfSize), bld->getSSA(halfSize) };
+   bld->setPosition(mul, true);
+
+   Value *a[2], *b[2];
    Value *c[2];
    Value *t[4];
    for (int j = 0; j < 4; ++j)
       t[j] = bld->getSSA(fullSize);
 
-   (i[0] = bld->mkOp1(OP_SPLIT, fTy, a[0], mul->getSrc(0)))->setDef(1, a[1]);
-   (i[1] = bld->mkOp1(OP_SPLIT, fTy, b[0], mul->getSrc(1)))->setDef(1, b[1]);
+   // split sources into halves
+   i[0] = bld->mkSplit(a, halfSize, mul->getSrc(0));
+   i[1] = bld->mkSplit(b, halfSize, mul->getSrc(1));
 
    i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0], b[1]);
    i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);
@@ -96,7 +98,8 @@ expandIntegerMUL(BuildUtil *bld, Instruction *mul)
    delete_Instruction(bld->getProgram(), mul);
 
    for (int j = 2; j <= (highResult ? 5 : 4); ++j)
-      i[j]->sType = hTy;
+      if (i[j])
+         i[j]->sType = hTy;
 
    return true;
 }
@@ -518,7 +521,6 @@ private:
 
    bool handleEXPORT(Instruction *);
 
-   bool handleMUL(Instruction *);
    bool handleDIV(Instruction *);
    bool handleSQRT(Instruction *);
    bool handlePOW(Instruction *);
@@ -587,7 +589,8 @@ NV50LoweringPreSSA::handleTEX(TexInstruction *i)
    if (i->tex.target.isArray()) {
       Value *layer = i->getSrc(arg - 1);
       LValue *src = new_LValue(func, FILE_GPR);
-      bld.mkCvt(OP_CVT, TYPE_U16, src, TYPE_F32, layer);
+      bld.mkCvt(OP_CVT, TYPE_U32, src, TYPE_F32, layer);
+      bld.mkOp2(OP_MIN, TYPE_U32, src, src, bld.loadImm(NULL, 511));
       i->setSrc(arg - 1, src);
 
       if (i->tex.target.isCube()) {
@@ -940,14 +943,6 @@ NV50LoweringPreSSA::handleRDSV(Instruction *i)
    return true;
 }
 
-bool
-NV50LoweringPreSSA::handleMUL(Instruction *i)
-{
-   if (!isFloatType(i->dType) && typeSizeof(i->sType) > 2)
-      return expandIntegerMUL(&bld, i);
-   return true;
-}
-
 bool
 NV50LoweringPreSSA::handleDIV(Instruction *i)
 {
@@ -1068,8 +1063,6 @@ NV50LoweringPreSSA::visit(Instruction *i)
       return handleSELP(i);
    case OP_POW:
       return handlePOW(i);
-   case OP_MUL:
-      return handleMUL(i);
    case OP_DIV:
       return handleDIV(i);
    case OP_SQRT:
diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_peephole.cpp
index db5195cd582..10382d9cac6 100644
--- a/src/gallium/drivers/nv50/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nv50/codegen/nv50_ir_peephole.cpp
@@ -564,7 +564,7 @@ ConstantFolding::tryCollapseChainedMULs(Instruction *mul2,
       insn = mul2->getSrc(t)->getInsn();
       if (!mul2->src(t).mod && insn->op == OP_MUL && insn->dType == TYPE_F32)
          mul1 = insn;
-      if (mul1) {
+      if (mul1 && !mul1->saturate) {
          int s1;
 
          if (mul1->src(s1 = 0).getImmediate(imm1) ||
@@ -584,10 +584,11 @@ ConstantFolding::tryCollapseChainedMULs(Instruction *mul2,
             if (f < 0)
                mul1->src(0).mod *= Modifier(NV50_IR_MOD_NEG);
          }
+         mul1->saturate = mul2->saturate;
          return;
       }
    }
-   if (mul2->getDef(0)->refCount() == 1) {
+   if (mul2->getDef(0)->refCount() == 1 && !mul2->saturate) {
       // b = mul a, imm
       // d = mul b, c   -> d = mul_x_imm a, c
       int s2, t2;
diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_ra.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_ra.cpp
index 77edaa6067a..726331e91e7 100644
--- a/src/gallium/drivers/nv50/codegen/nv50_ir_ra.cpp
+++ b/src/gallium/drivers/nv50/codegen/nv50_ir_ra.cpp
@@ -1819,8 +1819,8 @@ RegAlloc::InsertConstraintsPass::texConstraintNVE0(TexInstruction *tex)
    int n = tex->srcCount(0xff, true);
    if (n > 4) {
       condenseSrcs(tex, 0, 3);
-      if (n > 5)
-         condenseSrcs(tex, 4, n - 1);
+      if (n > 5) // NOTE: first call modified positions already
+         condenseSrcs(tex, 4 - (4 - 1), n - 1 - (4 - 1));
    } else
    if (n > 1) {
       condenseSrcs(tex, 0, n - 1);
@@ -1850,8 +1850,8 @@ RegAlloc::InsertConstraintsPass::texConstraintNVC0(TexInstruction *tex)
 
    if (s > 1)
       condenseSrcs(tex, 0, s - 1);
-   if (n > 1)
-      condenseSrcs(tex, s, s + (n - 1));
+   if (n > 1) // NOTE: first call modified positions already
+      condenseSrcs(tex, 1, n);
 
    condenseDefs(tex);
 }
diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_target_nv50.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_target_nv50.cpp
index 5e541e514cb..8b11c6a2fdd 100644
--- a/src/gallium/drivers/nv50/codegen/nv50_ir_target_nv50.cpp
+++ b/src/gallium/drivers/nv50/codegen/nv50_ir_target_nv50.cpp
@@ -310,7 +310,22 @@ TargetNV50::insnCanLoad(const Instruction *i, int s,
       return false;
    }
 
-   if (ld->getSrc(0)->reg.data.offset > (int32_t)(127 * typeSizeof(ld->dType)))
+   uint8_t ldSize;
+
+   if ((i->op == OP_MUL || i->op == OP_MAD) && !isFloatType(i->dType)) {
+      // 32-bit MUL will be split into 16-bit MULs
+      if (ld->src(0).isIndirect(0))
+         return false;
+      if (sf == FILE_IMMEDIATE)
+         return false;
+      ldSize = 2;
+   } else {
+      ldSize = typeSizeof(ld->dType);
+   }
+
+   if (ldSize < 4 && sf == FILE_SHADER_INPUT) // no < 4-byte aligned a[] access
+      return false;
+   if (ld->getSrc(0)->reg.data.offset > (int32_t)(127 * ldSize))
       return false;
 
    if (ld->src(0).isIndirect(0)) {
diff --git a/src/gallium/drivers/nv50/nv50_context.h b/src/gallium/drivers/nv50/nv50_context.h
index 1cee0e06c02..44a0ba0f561 100644
--- a/src/gallium/drivers/nv50/nv50_context.h
+++ b/src/gallium/drivers/nv50/nv50_context.h
@@ -48,6 +48,7 @@
 #define NV50_NEW_CONSTBUF     (1 << 18)
 #define NV50_NEW_TEXTURES     (1 << 19)
 #define NV50_NEW_SAMPLERS     (1 << 20)
+#define NV50_NEW_STRMOUT      (1 << 21)
 #define NV50_NEW_CONTEXT      (1 << 31)
 
 #define NV50_BIND_FB          0
@@ -56,9 +57,10 @@
 #define NV50_BIND_INDEX       3
 #define NV50_BIND_TEXTURES    4
 #define NV50_BIND_CB(s, i)   (5 + 16 * (s) + (i))
-#define NV50_BIND_SCREEN     53
-#define NV50_BIND_TLS        54
-#define NV50_BIND_COUNT      55
+#define NV50_BIND_SO         53
+#define NV50_BIND_SCREEN     54
+#define NV50_BIND_TLS        55
+#define NV50_BIND_COUNT      56
 #define NV50_BIND_2D          0
 #define NV50_BIND_M2MF        0
 #define NV50_BIND_FENCE       1
@@ -92,11 +94,13 @@ struct nv50_context {
       boolean point_sprite;
       boolean rt_serialize;
       boolean flushed;
+      boolean rasterizer_discard;
       uint8_t tls_required;
       uint8_t num_vtxbufs;
       uint8_t num_vtxelts;
       uint8_t num_textures[3];
       uint8_t num_samplers[3];
+      uint8_t prim_size;
       uint16_t scissor;
    } state;
 
@@ -126,6 +130,10 @@ struct nv50_context {
    struct nv50_tsc_entry *samplers[3][PIPE_MAX_SAMPLERS];
    unsigned num_samplers[3];
 
+   uint8_t num_so_targets;
+   uint8_t so_targets_dirty;
+   struct pipe_stream_output_target *so_target[4];
+
    struct pipe_framebuffer_state framebuffer;
    struct pipe_blend_color blend_colour;
    struct pipe_stencil_ref stencil_ref;
@@ -168,6 +176,14 @@ extern struct draw_stage *nv50_draw_render_stage(struct nv50_context *);
 
 /* nv50_query.c */
 void nv50_init_query_functions(struct nv50_context *);
+void nv50_query_pushbuf_submit(struct nouveau_pushbuf *,
+                               struct pipe_query *, unsigned result_offset);
+void nv84_query_fifo_wait(struct nouveau_pushbuf *, struct pipe_query *);
+void nva0_so_target_save_offset(struct pipe_context *,
+                                struct pipe_stream_output_target *,
+                                unsigned index, boolean seralize);
+
+#define NVA0_QUERY_STREAM_OUTPUT_BUFFER_OFFSET (PIPE_QUERY_TYPES + 0)
 
 /* nv50_shader_state.c */
 void nv50_vertprog_validate(struct nv50_context *);
@@ -177,6 +193,7 @@ void nv50_fp_linkage_validate(struct nv50_context *);
 void nv50_gp_linkage_validate(struct nv50_context *);
 void nv50_constbufs_validate(struct nv50_context *);
 void nv50_validate_derived_rs(struct nv50_context *);
+void nv50_stream_output_validate(struct nv50_context *);
 
 /* nv50_state.c */
 extern void nv50_init_state_functions(struct nv50_context *);
diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c
index 1b2e2934b79..ca40ac2dd43 100644
--- a/src/gallium/drivers/nv50/nv50_program.c
+++ b/src/gallium/drivers/nv50/nv50_program.c
@@ -68,6 +68,17 @@ nv50_vertprog_assign_slots(struct nv50_ir_prog_info *info)
          break;
       }
    }
+
+   /*
+    * Corner case: VP has no inputs, but we will still need to submit data to
+    * draw it. HW will shout at us and won't draw anything if we don't enable
+    * any input, so let's just pretend it's the first one.
+    */
+   if (prog->vp.attrs[0] == 0 &&
+       prog->vp.attrs[1] == 0 &&
+       prog->vp.attrs[2] == 0)
+      prog->vp.attrs[0] |= 0xf;
+
    /* VertexID before InstanceID */
    if (info->io.vertexId < info->numSysVals)
       info->sv[info->io.vertexId].slot[0] = n++;
@@ -235,6 +246,59 @@ nv50_program_assign_varying_slots(struct nv50_ir_prog_info *info)
    }
 }
 
+static struct nv50_stream_output_state *
+nv50_program_create_strmout_state(const struct nv50_ir_prog_info *info,
+                                  const struct pipe_stream_output_info *pso)
+{ /* Build the stream output state described by pso; returns NULL if allocation fails. */
+   struct nv50_stream_output_state *so;
+   unsigned b, i, c;
+   unsigned base[4]; /* first map[] index used by each of the 4 buffers */
+
+   so = MALLOC_STRUCT(nv50_stream_output_state);
+   if (!so)
+      return NULL;
+   memset(so->map, 0xff, sizeof(so->map)); /* 0xff is left in entries no output writes */
+
+   for (b = 0; b < 4; ++b)
+      so->num_attribs[b] = 0;
+   for (i = 0; i < pso->num_outputs; ++i) { /* find the extent used in each buffer */
+      unsigned end =  pso->output[i].dst_offset + pso->output[i].num_components;
+      b = pso->output[i].output_buffer;
+      assert(b < 4);
+      so->num_attribs[b] = MAX2(so->num_attribs[b], end);
+   }
+
+   so->ctrl = NV50_3D_STRMOUT_BUFFERS_CTRL_INTERLEAVED; /* default; overwritten below if any of buffers 1..3 is used */
+
+   so->stride[0] = pso->stride[0] * 4; /* NOTE(review): presumably converts dwords to bytes - confirm */
+   base[0] = 0;
+   for (b = 1; b < 4; ++b) {
+      assert(!so->num_attribs[b] || so->num_attribs[b] == pso->stride[b]);
+      so->stride[b] = so->num_attribs[b] * 4;
+      if (so->num_attribs[b]) /* SEPARATE field ends up as (highest used buffer index + 1) */
+         so->ctrl = (b + 1) << NV50_3D_STRMOUT_BUFFERS_CTRL_SEPARATE__SHIFT;
+      base[b] = align(base[b - 1] + so->num_attribs[b - 1], 4); /* each buffer's map range starts 4-aligned */
+   }
+   if (so->ctrl & NV50_3D_STRMOUT_BUFFERS_CTRL_INTERLEAVED) { /* only buffer 0 in use: encode its stride */
+      assert(so->stride[0] < NV50_3D_STRMOUT_BUFFERS_CTRL_STRIDE__MAX);
+      so->ctrl |= so->stride[0] << NV50_3D_STRMOUT_BUFFERS_CTRL_STRIDE__SHIFT;
+   }
+
+   so->map_size = base[3] + so->num_attribs[3]; /* end of the last buffer's map range */
+
+   for (i = 0; i < pso->num_outputs; ++i) { /* record the output slot of every captured component */
+      const unsigned s = pso->output[i].start_component; /* first component read from the register */
+      const unsigned p = pso->output[i].dst_offset;      /* position within the target buffer */
+      const unsigned r = pso->output[i].register_index;  /* index into info->out[] */
+      b = pso->output[i].output_buffer;
+
+      for (c = 0; c < pso->output[i].num_components; ++c)
+         so->map[base[b] + p + c] = info->out[r].slot[s + c];
+   }
+
+   return so;
+}
+
 boolean
 nv50_program_translate(struct nv50_program *prog, uint16_t chipset)
 {
@@ -293,6 +357,10 @@ nv50_program_translate(struct nv50_program *prog, uint16_t chipset)
          prog->fp.flags[0] |= NV50_3D_FP_CONTROL_USES_KIL;
    }
 
+   if (prog->pipe.stream_output.num_outputs)
+      prog->so = nv50_program_create_strmout_state(info,
+                                                   &prog->pipe.stream_output);
+
 out:
    FREE(info);
    return !ret;
diff --git a/src/gallium/drivers/nv50/nv50_program.h b/src/gallium/drivers/nv50/nv50_program.h
index 92361ad9946..f56268b5439 100644
--- a/src/gallium/drivers/nv50/nv50_program.h
+++ b/src/gallium/drivers/nv50/nv50_program.h
@@ -42,6 +42,15 @@ struct nv50_varying {
    ubyte si; /* semantic index */
 };
 
+struct nv50_stream_output_state
+{
+   uint32_t ctrl;          /* NV50_3D_STRMOUT_BUFFERS_CTRL_* value (INTERLEAVED + stride, or SEPARATE count) */
+   uint16_t stride[4];     /* per-buffer stride, 4 * component count; presumably bytes - confirm */
+   uint8_t num_attribs[4]; /* per-buffer extent: max(dst_offset + num_components) over its outputs */
+   uint8_t map_size;       /* end of the used part of map[] */
+   uint8_t map[128];       /* output slot per captured component; 0xff where nothing is written */
+};
+
 struct nv50_program {
    struct pipe_shader_state pipe;
 
@@ -88,6 +97,8 @@ struct nv50_program {
    void *fixups; /* relocation records */
 
    struct nouveau_heap *mem;
+
+   struct nv50_stream_output_state *so;
 };
 
 boolean nv50_program_translate(struct nv50_program *, uint16_t chipset);
diff --git a/src/gallium/drivers/nv50/nv50_push.c b/src/gallium/drivers/nv50/nv50_push.c
index 04e32b7e8b9..3abe189e7b5 100644
--- a/src/gallium/drivers/nv50/nv50_push.c
+++ b/src/gallium/drivers/nv50/nv50_push.c
@@ -210,7 +210,8 @@ nv50_push_vbo(struct nv50_context *nv50, const struct pipe_draw_info *info)
 {
    struct push_context ctx;
    unsigned i, index_size;
-   unsigned inst = info->instance_count;
+   unsigned inst_count = info->instance_count;
+   unsigned vert_count = info->count;
    boolean apply_bias = info->indexed && info->index_bias;
 
    ctx.push = nv50->base.pushbuf;
@@ -242,6 +243,17 @@ nv50_push_vbo(struct nv50_context *nv50, const struct pipe_draw_info *info)
       ctx.primitive_restart = info->primitive_restart;
       ctx.restart_index = info->restart_index;
    } else {
+      if (unlikely(info->count_from_stream_output)) {
+         struct pipe_context *pipe = &nv50->base.pipe;
+         struct nv50_so_target *targ;
+         targ = nv50_so_target(info->count_from_stream_output);
+         if (!targ->pq) {
+            NOUVEAU_ERR("draw_stream_output not supported on pre-NVA0 cards\n");
+            return;
+         }
+         pipe->get_query_result(pipe, targ->pq, TRUE, (void *)&vert_count);
+         vert_count /= targ->stride;
+      }
       ctx.idxbuf = NULL;
       index_size = 0;
       ctx.primitive_restart = FALSE;
@@ -262,21 +274,21 @@ nv50_push_vbo(struct nv50_context *nv50, const struct pipe_draw_info *info)
    }
    nv50->state.prim_restart = info->primitive_restart;
 
-   while (inst--) {
+   while (inst_count--) {
       BEGIN_NV04(ctx.push, NV50_3D(VERTEX_BEGIN_GL), 1);
       PUSH_DATA (ctx.push, ctx.prim);
       switch (index_size) {
       case 0:
-         emit_vertices_seq(&ctx, info->start, info->count);
+         emit_vertices_seq(&ctx, info->start, vert_count);
          break;
       case 1:
-         emit_vertices_i08(&ctx, info->start, info->count);
+         emit_vertices_i08(&ctx, info->start, vert_count);
          break;
       case 2:
-         emit_vertices_i16(&ctx, info->start, info->count);
+         emit_vertices_i16(&ctx, info->start, vert_count);
          break;
       case 4:
-         emit_vertices_i32(&ctx, info->start, info->count);
+         emit_vertices_i32(&ctx, info->start, vert_count);
          break;
       default:
          assert(0);
diff --git a/src/gallium/drivers/nv50/nv50_query.c b/src/gallium/drivers/nv50/nv50_query.c
index 5275e74964a..8e62c5f11bc 100644
--- a/src/gallium/drivers/nv50/nv50_query.c
+++ b/src/gallium/drivers/nv50/nv50_query.c
@@ -36,7 +36,8 @@
 
 struct nv50_query {
    uint32_t *data;
-   uint32_t type;
+   uint16_t type;
+   uint16_t index;
    uint32_t sequence;
    struct nouveau_bo *bo;
    uint32_t base;
@@ -170,21 +171,15 @@ nv50_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
       BEGIN_NV04(push, NV50_3D(SAMPLECNT_ENABLE), 1);
       PUSH_DATA (push, 1);
       break;
-   case PIPE_QUERY_PRIMITIVES_GENERATED: /* store before & after instead ? */
-      PUSH_SPACE(push, 2);
-      BEGIN_NV04(push, NV50_3D(COUNTER_RESET), 1);
-      PUSH_DATA (push, NV50_3D_COUNTER_RESET_GENERATED_PRIMITIVES);
+   case PIPE_QUERY_PRIMITIVES_GENERATED:
+      nv50_query_get(push, q, 0x10, 0x06805002);
       break;
    case PIPE_QUERY_PRIMITIVES_EMITTED:
-      PUSH_SPACE(push, 2);
-      BEGIN_NV04(push, NV50_3D(COUNTER_RESET), 1);
-      PUSH_DATA (push, NV50_3D_COUNTER_RESET_TRANSFORM_FEEDBACK);
+      nv50_query_get(push, q, 0x10, 0x05805002);
       break;
    case PIPE_QUERY_SO_STATISTICS:
-      PUSH_SPACE(push, 3);
-      BEGIN_NI04(push, NV50_3D(COUNTER_RESET), 2);
-      PUSH_DATA (push, NV50_3D_COUNTER_RESET_TRANSFORM_FEEDBACK);
-      PUSH_DATA (push, NV50_3D_COUNTER_RESET_GENERATED_PRIMITIVES);
+      nv50_query_get(push, q, 0x20, 0x05805002);
+      nv50_query_get(push, q, 0x30, 0x06805002);
       break;
    case PIPE_QUERY_TIMESTAMP_DISJOINT:
    case PIPE_QUERY_TIME_ELAPSED:
@@ -227,6 +222,9 @@ nv50_query_end(struct pipe_context *pipe, struct pipe_query *pq)
    case PIPE_QUERY_GPU_FINISHED:
       nv50_query_get(push, q, 0, 0x1000f010);
       break;
+   case NVA0_QUERY_STREAM_OUTPUT_BUFFER_OFFSET:
+      nv50_query_get(push, q, 0, 0x0d005002 | (q->index << 5));
+      break;
    default:
       assert(0);
       break;
@@ -247,6 +245,7 @@ nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq,
    struct nv50_context *nv50 = nv50_context(pipe);
    struct nv50_query *q = nv50_query(pq);
    uint64_t *res64 = (uint64_t *)result;
+   uint32_t *res32 = (uint32_t *)result;
    boolean *res8 = (boolean *)result;
    uint64_t *data64 = (uint64_t *)q->data;
 
@@ -275,11 +274,11 @@ nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq,
       break;
    case PIPE_QUERY_PRIMITIVES_GENERATED: /* u64 count, u64 time */
    case PIPE_QUERY_PRIMITIVES_EMITTED: /* u64 count, u64 time */
-      res64[0] = data64[0];
+      res64[0] = data64[0] - data64[2];
       break;
    case PIPE_QUERY_SO_STATISTICS:
-      res64[0] = data64[0];
-      res64[1] = data64[1];
+      res64[0] = data64[0] - data64[4];
+      res64[1] = data64[2] - data64[6];
       break;
    case PIPE_QUERY_TIMESTAMP_DISJOINT: /* u32 sequence, u32 0, u64 time */
       res64[0] = 1000000000;
@@ -288,6 +287,9 @@ nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq,
    case PIPE_QUERY_TIME_ELAPSED:
       res64[0] = data64[1] - data64[3];
       break;
+   case NVA0_QUERY_STREAM_OUTPUT_BUFFER_OFFSET:
+      res32[0] = q->data[1];
+      break;
    default:
       return FALSE;
    }
@@ -295,6 +297,21 @@ nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq,
    return TRUE;
 }
 
+void
+nv84_query_fifo_wait(struct nouveau_pushbuf *push, struct pipe_query *pq)
+{ /* Queue a semaphore ACQUIRE_EQUAL on the query's buffer slot, compared against q->sequence. */
+   struct nv50_query *q = nv50_query(pq);
+   unsigned offset = q->offset;
+
+   PUSH_SPACE(push, 5); /* method header + 4 data words */
+   PUSH_REFN (push, q->bo, NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+   BEGIN_NV04(push, SUBC_3D(NV84_SUBCHAN_SEMAPHORE_ADDRESS_HIGH), 4);
+   PUSH_DATAh(push, q->bo->offset + offset); /* high part of the address */
+   PUSH_DATA (push, q->bo->offset + offset); /* low part of the address */
+   PUSH_DATA (push, q->sequence); /* value the semaphore is compared with */
+   PUSH_DATA (push, NV84_SUBCHAN_SEMAPHORE_TRIGGER_ACQUIRE_EQUAL);
+}
+
 static void
 nv50_render_condition(struct pipe_context *pipe,
                       struct pipe_query *pq, uint mode)
@@ -324,6 +341,38 @@ nv50_render_condition(struct pipe_context *pipe,
    PUSH_DATA (push, NV50_3D_COND_MODE_RES_NON_ZERO);
 }
 
+void
+nv50_query_pushbuf_submit(struct nouveau_pushbuf *push,
+                          struct pipe_query *pq, unsigned result_offset)
+{ /* Submit 4 bytes of the query buffer, starting at result_offset, as push data. */
+   struct nv50_query *q = nv50_query(pq);
+
+   /* XXX: does this exist ? As defined below it is 0, so OR-ing it in changes nothing. */
+#define NV50_IB_ENTRY_1_NO_PREFETCH (0 << (31 - 8))
+
+   nouveau_pushbuf_space(push, 0, 0, 1);
+   nouveau_pushbuf_data(push, q->bo, q->offset + result_offset, 4 | /* 4 = length in bytes (presumably; confirm against libdrm) */
+                        NV50_IB_ENTRY_1_NO_PREFETCH);
+}
+
+void
+nva0_so_target_save_offset(struct pipe_context *pipe,
+                           struct pipe_stream_output_target *ptarg,
+                           unsigned index, boolean serialize)
+{ /* End the target's offset query for buffer slot `index`, optionally after a SERIALIZE. */
+   struct nv50_so_target *targ = nv50_so_target(ptarg);
+
+   if (serialize) { /* nv50_set_stream_output_targets passes TRUE only for the first target it saves */
+      struct nouveau_pushbuf *push = nv50_context(pipe)->base.pushbuf;
+      PUSH_SPACE(push, 2);
+      BEGIN_NV04(push, SUBC_3D(NV50_GRAPH_SERIALIZE), 1);
+      PUSH_DATA (push, 0);
+   }
+
+   nv50_query(targ->pq)->index = index; /* assumes targ->pq != NULL; callers check for NVA0+ first */
+   nv50_query_end(pipe, targ->pq); /* index is encoded into the query command by nv50_query_end */
+}
+
 void
 nv50_init_query_functions(struct nv50_context *nv50)
 {
diff --git a/src/gallium/drivers/nv50/nv50_screen.c b/src/gallium/drivers/nv50/nv50_screen.c
index a6dfbedf299..c96e028b2a2 100644
--- a/src/gallium/drivers/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nv50/nv50_screen.c
@@ -73,6 +73,8 @@ nv50_screen_is_format_supported(struct pipe_screen *pscreen,
 static int
 nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 {
+   const uint16_t class_3d = nouveau_screen(pscreen)->class_3d;
+
    switch (param) {
    case PIPE_CAP_MAX_COMBINED_SAMPLERS:
       return 64;
@@ -82,8 +84,8 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
       return 12;
    case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
       return 14;
-   case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS: /* shader support missing */
-      return 0;
+   case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS:
+      return 512;
    case PIPE_CAP_MIN_TEXEL_OFFSET:
       return -8;
    case PIPE_CAP_MAX_TEXEL_OFFSET:
@@ -95,7 +97,6 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_ANISOTROPIC_FILTER:
    case PIPE_CAP_SCALED_RESOLVE:
       return 1;
-   case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME:
    case PIPE_CAP_SEAMLESS_CUBE_MAP:
       return nv50_screen(pscreen)->tesla->oclass >= NVA0_3D_CLASS;
    case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
@@ -121,11 +122,12 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_OCCLUSION_QUERY:
       return 1;
    case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS:
-      return 0;
+      return 4;
    case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS:
-      return 128;
    case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS:
-      return 32;
+      return 64;
+   case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME:
+      return (class_3d >= NVA0_3D_CLASS) ? 1 : 0;
    case PIPE_CAP_BLEND_EQUATION_SEPARATE:
    case PIPE_CAP_INDEP_BLEND_ENABLE:
       return 1;
diff --git a/src/gallium/drivers/nv50/nv50_shader_state.c b/src/gallium/drivers/nv50/nv50_shader_state.c
index aef3f129c81..d070f07bbbc 100644
--- a/src/gallium/drivers/nv50/nv50_shader_state.c
+++ b/src/gallium/drivers/nv50/nv50_shader_state.c
@@ -207,6 +207,8 @@ nv50_gmtyprog_validate(struct nv50_context *nv50)
       PUSH_DATA (push, gp->gp.vert_count);
       BEGIN_NV04(push, NV50_3D(GP_START_ID), 1);
       PUSH_DATA (push, gp->code_base);
+
+      nv50->state.prim_size = gp->gp.prim_type; /* enum matches vertex count */
    }
    nv50_program_update_context_state(nv50, gp, 2);
 
@@ -278,6 +280,12 @@ nv50_validate_derived_rs(struct nv50_context *nv50)
 
    nv50_sprite_coords_validate(nv50);
 
+   if (nv50->state.rasterizer_discard != nv50->rast->pipe.rasterizer_discard) {
+      nv50->state.rasterizer_discard = nv50->rast->pipe.rasterizer_discard;
+      BEGIN_NV04(push, NV50_3D(RASTERIZE_ENABLE), 1);
+      PUSH_DATA (push, !nv50->rast->pipe.rasterizer_discard);
+   }
+
    if (nv50->dirty & NV50_NEW_FRAGPROG)
       return;
    psize = nv50->state.semantic_psize & ~NV50_3D_SEMANTIC_PTSZ_PTSZ_EN__MASK;
@@ -343,6 +351,7 @@ nv50_fp_linkage_validate(struct nv50_context *nv50)
    uint32_t colors = fp->fp.colors;
    uint32_t lin[4];
    uint8_t map[64];
+   uint8_t so_map[64];
 
    if (!(nv50->dirty & (NV50_NEW_VERTPROG |
                         NV50_NEW_FRAGPROG |
@@ -411,6 +420,30 @@ nv50_fp_linkage_validate(struct nv50_context *nv50)
    if (nv50->rast->pipe.clamp_vertex_color)
       colors |= NV50_3D_SEMANTIC_COLOR_CLMP_EN;
 
+   if (unlikely(vp->so)) {
+      /* Slot i in STRMOUT_MAP specifies the offset where slot i in RESULT_MAP
+       * gets written.
+       *
+       * TODO:
+       * Inverting vp->so->map (output -> offset) would probably speed this up.
+       */
+      memset(so_map, 0, sizeof(so_map));
+      for (i = 0; i < vp->so->map_size; ++i) {
+         if (vp->so->map[i] == 0xff)
+            continue;
+         for (c = 0; c < m; ++c)
+            if (map[c] == vp->so->map[i] && !so_map[c])
+               break;
+         if (c == m) {
+            c = m;
+            map[m++] = vp->so->map[i];
+         }
+         so_map[c] = 0x80 | i;
+      }
+      for (c = m; c & 3; ++c)
+         so_map[c] = 0;
+   }
+
    n = (m + 3) / 4;
    assert(m <= 64);
 
@@ -451,6 +484,11 @@ nv50_fp_linkage_validate(struct nv50_context *nv50)
 
    BEGIN_NV04(push, NV50_3D(GP_ENABLE), 1);
    PUSH_DATA (push, nv50->gmtyprog ? 1 : 0);
+
+   if (vp->so) {
+      BEGIN_NV04(push, NV50_3D(STRMOUT_MAP(0)), n);
+      PUSH_DATAp(push, so_map, n);
+   }
 }
 
 static int
@@ -509,3 +547,75 @@ nv50_gp_linkage_validate(struct nv50_context *nv50)
    BEGIN_NV04(push, NV50_3D(VP_RESULT_MAP(0)), n);
    PUSH_DATAp(push, map, n);
 }
+
+void
+nv50_stream_output_validate(struct nv50_context *nv50)
+{ /* Emit stream output state: disable it, or program every bound target and enable it. */
+   struct nouveau_pushbuf *push = nv50->base.pushbuf;
+   struct nv50_stream_output_state *so;
+   uint32_t ctrl;
+   unsigned i;
+   unsigned prims = ~0; /* ~0 = no primitive limit computed (stays so on NVA0+) */
+
+   so = nv50->gmtyprog ? nv50->gmtyprog->so : nv50->vertprog->so; /* a bound geometry program takes precedence */
+
+   if (!so || !nv50->num_so_targets) { /* nothing to capture: switch stream output off */
+      BEGIN_NV04(push, NV50_3D(STRMOUT_ENABLE), 1);
+      PUSH_DATA (push, 0);
+      if (nv50->screen->base.class_3d < NVA0_3D_CLASS) { /* pre-NVA0 only: also zero the primitive limit */
+         BEGIN_NV04(push, NV50_3D(STRMOUT_PRIMITIVE_LIMIT), 1);
+         PUSH_DATA (push, 0);
+      }
+      BEGIN_NV04(push, NV50_3D(STRMOUT_PARAMS_LATCH), 1);
+      PUSH_DATA (push, 1);
+      return;
+   }
+
+   ctrl = so->ctrl;
+   if (nv50->screen->base.class_3d >= NVA0_3D_CLASS)
+      ctrl |= NVA0_3D_STRMOUT_BUFFERS_CTRL_LIMIT_MODE_OFFSET; /* NVA0+ is given buffer sizes below instead of a primitive limit */
+
+   BEGIN_NV04(push, NV50_3D(STRMOUT_BUFFERS_CTRL), 1);
+   PUSH_DATA (push, ctrl);
+
+   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_SO); /* buffers are re-added per target via BCTX_REFN below */
+
+   for (i = 0; i < nv50->num_so_targets; ++i) { /* NOTE(review): so_target[i] is dereferenced unchecked; confirm it cannot be NULL */
+      struct nv50_so_target *targ = nv50_so_target(nv50->so_target[i]);
+      struct nv04_resource *buf = nv04_resource(targ->pipe.buffer);
+
+      const unsigned n = nv50->screen->base.class_3d >= NVA0_3D_CLASS ? 4 : 3; /* NVA0+ sends a 4th word: the buffer size */
+
+      if (n == 4 && !targ->clean) /* resuming: wait on the saved-offset query first */
+         nv84_query_fifo_wait(push, targ->pq);
+      BEGIN_NV04(push, NV50_3D(STRMOUT_ADDRESS_HIGH(i)), n);
+      PUSH_DATAh(push, buf->address + targ->pipe.buffer_offset);
+      PUSH_DATA (push, buf->address + targ->pipe.buffer_offset);
+      PUSH_DATA (push, so->num_attribs[i]);
+      if (n == 4) {
+         PUSH_DATA(push, targ->pipe.buffer_size);
+
+         BEGIN_NV04(push, NVA0_3D(STRMOUT_OFFSET(i)), 1);
+         if (!targ->clean) { /* continue from the offset stored at byte 4 of the query buffer */
+            assert(targ->pq);
+            nv50_query_pushbuf_submit(push, targ->pq, 0x4);
+         } else { /* fresh target: start at offset 0 */
+            PUSH_DATA(push, 0);
+            targ->clean = FALSE;
+         }
+      } else { /* pre-NVA0: limit = how many primitives fit in this buffer */
+         const unsigned limit = targ->pipe.buffer_size /
+            (so->stride[i] * nv50->state.prim_size); /* NOTE(review): divides by zero if stride[i] or prim_size is 0; confirm neither can be */
+         prims = MIN2(prims, limit); /* the smallest buffer bounds all of them */
+      }
+      BCTX_REFN(nv50->bufctx_3d, SO, buf, WR);
+   }
+   if (prims != ~0) { /* only changed by the pre-NVA0 branch above */
+      BEGIN_NV04(push, NV50_3D(STRMOUT_PRIMITIVE_LIMIT), 1);
+      PUSH_DATA (push, prims);
+   }
+   BEGIN_NV04(push, NV50_3D(STRMOUT_PARAMS_LATCH), 1);
+   PUSH_DATA (push, 1);
+   BEGIN_NV04(push, NV50_3D(STRMOUT_ENABLE), 1);
+   PUSH_DATA (push, 1);
+}
diff --git a/src/gallium/drivers/nv50/nv50_state.c b/src/gallium/drivers/nv50/nv50_state.c
index a17540a1492..7f840e2b42e 100644
--- a/src/gallium/drivers/nv50/nv50_state.c
+++ b/src/gallium/drivers/nv50/nv50_state.c
@@ -680,6 +680,9 @@ nv50_sp_state_create(struct pipe_context *pipe,
    prog->type = type;
    prog->pipe.tokens = tgsi_dup_tokens(cso->tokens);
 
+   if (cso->stream_output.num_outputs)
+      prog->pipe.stream_output = cso->stream_output;
+
    return (void *)prog;
 }
 
@@ -920,6 +923,90 @@ nv50_vertex_state_bind(struct pipe_context *pipe, void *hwcso)
    nv50->dirty |= NV50_NEW_VERTEX;
 }
 
+static struct pipe_stream_output_target *
+nv50_so_target_create(struct pipe_context *pipe,
+                      struct pipe_resource *res,
+                      unsigned offset, unsigned size)
+{ /* Create a stream output target for `res` with the given offset and size; returns NULL on failure. */
+   struct nv50_so_target *targ = MALLOC_STRUCT(nv50_so_target); /* NOTE(review): targ->stride is never assigned here, yet nv50_push_vbo divides by it; confirm it is set elsewhere */
+   if (!targ)
+      return NULL;
+
+   if (nouveau_context(pipe)->screen->class_3d >= NVA0_3D_CLASS) { /* NVA0+: one offset query per target */
+      targ->pq = pipe->create_query(pipe,
+                                    NVA0_QUERY_STREAM_OUTPUT_BUFFER_OFFSET);
+      if (!targ->pq) {
+         FREE(targ);
+         return NULL;
+      }
+   } else { /* pre-NVA0: no query; users of pq check for NULL or for the chipset class */
+      targ->pq = NULL;
+   }
+   targ->clean = TRUE; /* no saved offset yet: validation starts writing at offset 0 */
+
+   targ->pipe.buffer_size = size;
+   targ->pipe.buffer_offset = offset;
+   targ->pipe.context = pipe;
+   targ->pipe.buffer = NULL; /* cleared first so the reference call does not see an uninitialized pointer */
+   pipe_resource_reference(&targ->pipe.buffer, res); /* takes a reference on res */
+   pipe_reference_init(&targ->pipe.reference, 1); /* the caller owns the initial reference */
+
+   return &targ->pipe;
+}
+
+static void nv50_so_target_destroy(struct pipe_context *pipe,
+                                   struct pipe_stream_output_target *ptarg)
+{ /* Release everything nv50_so_target_create acquired: the query, the buffer reference, the struct. */
+   struct nv50_so_target *targ = nv50_so_target(ptarg);
+   if (targ->pq) /* NULL on pre-NVA0, where no offset query is created */
+      pipe->destroy_query(pipe, targ->pq);
+   pipe_resource_reference(&targ->pipe.buffer, NULL); /* drop the reference taken at creation so the buffer is not leaked */
+   FREE(targ);
+}
+
+static void
+nv50_set_stream_output_targets(struct pipe_context *pipe,
+                               unsigned num_targets,
+                               struct pipe_stream_output_target **targets,
+                               unsigned append_mask)
+{ /* Bind num_targets stream output targets; bit i of append_mask means target i continues where it left off. */
+   struct nv50_context *nv50 = nv50_context(pipe);
+   unsigned i;
+   boolean serialize = TRUE; /* only the first offset save in this call issues a SERIALIZE */
+   const boolean can_resume = nv50->screen->base.class_3d >= NVA0_3D_CLASS;
+
+   assert(num_targets <= 4);
+
+   for (i = 0; i < num_targets; ++i) {
+      const boolean changed = nv50->so_target[i] != targets[i];
+      if (!changed && (append_mask & (1 << i))) /* same target, appending: nothing to update */
+         continue;
+      nv50->so_targets_dirty |= 1 << i;
+
+      if (can_resume && changed && nv50->so_target[i]) { /* save the outgoing target's offset before replacing it */
+         nva0_so_target_save_offset(pipe, nv50->so_target[i], i, serialize);
+         serialize = FALSE;
+      }
+
+      if (targets[i] && !(append_mask & (1 << i))) /* not appending: restart the target at offset 0 */
+         nv50_so_target(targets[i])->clean = TRUE;
+
+      pipe_so_target_reference(&nv50->so_target[i], targets[i]);
+   }
+   for (; i < nv50->num_so_targets; ++i) { /* unbind slots beyond the new count */
+      if (can_resume && nv50->so_target[i]) {
+         nva0_so_target_save_offset(pipe, nv50->so_target[i], i, serialize);
+         serialize = FALSE;
+      }
+      pipe_so_target_reference(&nv50->so_target[i], NULL);
+      nv50->so_targets_dirty |= 1 << i;
+   }
+   nv50->num_so_targets = num_targets;
+
+   if (nv50->so_targets_dirty) /* schedules nv50_stream_output_validate via NV50_NEW_STRMOUT */
+      nv50->dirty |= NV50_NEW_STRMOUT;
+}
+
 void
 nv50_init_state_functions(struct nv50_context *nv50)
 {
@@ -975,5 +1062,8 @@ nv50_init_state_functions(struct nv50_context *nv50)
 
    pipe->set_vertex_buffers = nv50_set_vertex_buffers;
    pipe->set_index_buffer = nv50_set_index_buffer;
-}
 
+   pipe->create_stream_output_target = nv50_so_target_create;
+   pipe->stream_output_target_destroy = nv50_so_target_destroy;
+   pipe->set_stream_output_targets = nv50_set_stream_output_targets;
+}
diff --git a/src/gallium/drivers/nv50/nv50_state_validate.c b/src/gallium/drivers/nv50/nv50_state_validate.c
index c19acf6c426..a95e96d3c51 100644
--- a/src/gallium/drivers/nv50/nv50_state_validate.c
+++ b/src/gallium/drivers/nv50/nv50_state_validate.c
@@ -360,6 +360,8 @@ static struct state_validate {
     { nv50_constbufs_validate,     NV50_NEW_CONSTBUF },
     { nv50_validate_textures,      NV50_NEW_TEXTURES },
     { nv50_validate_samplers,      NV50_NEW_SAMPLERS },
+    { nv50_stream_output_validate, NV50_NEW_STRMOUT |
+                                   NV50_NEW_VERTPROG | NV50_NEW_GMTYPROG },
     { nv50_vertex_arrays_validate, NV50_NEW_VERTEX | NV50_NEW_ARRAYS }
 };
 #define validate_list_len (sizeof(validate_list) / sizeof(validate_list[0]))
diff --git a/src/gallium/drivers/nv50/nv50_stateobj.h b/src/gallium/drivers/nv50/nv50_stateobj.h
index 188406da600..8a9260c937e 100644
--- a/src/gallium/drivers/nv50/nv50_stateobj.h
+++ b/src/gallium/drivers/nv50/nv50_stateobj.h
@@ -51,4 +51,17 @@ struct nv50_vertex_stateobj {
    struct nv50_vertex_element element[0];
 };
 
+struct nv50_so_target { /* driver-side stream output target */
+   struct pipe_stream_output_target pipe; /* kept first: nv50_so_target() casts the base pointer to this struct */
+   struct pipe_query *pq; /* stream output buffer offset query; NULL on pre-NVA0 */
+   unsigned stride; /* nv50_push_vbo divides the queried offset by this to get a vertex count */
+   boolean clean; /* TRUE: the next validation starts writing at offset 0 */
+};
+
+static INLINE struct nv50_so_target *
+nv50_so_target(struct pipe_stream_output_target *ptarg)
+{ /* Plain downcast; relies on `pipe` being the first member of struct nv50_so_target. */
+   return (struct nv50_so_target *)ptarg;
+}
+
 #endif
diff --git a/src/gallium/drivers/nv50/nv50_surface.c b/src/gallium/drivers/nv50/nv50_surface.c
index b38e49ffcc1..15c88d5316d 100644
--- a/src/gallium/drivers/nv50/nv50_surface.c
+++ b/src/gallium/drivers/nv50/nv50_surface.c
@@ -711,7 +711,7 @@ nv50_blit_set_src(struct nv50_context *nv50,
    templ.swizzle_a = PIPE_SWIZZLE_ALPHA;
 
    nv50->textures[2][0] = nv50_create_sampler_view(pipe, res, &templ);
-   nv50->textures[2][0] = NULL;
+   nv50->textures[2][1] = NULL;
 
    nv50_blit_fixup_tic_entry(nv50->textures[2][0]);
 
diff --git a/src/gallium/drivers/nv50/nv50_vbo.c b/src/gallium/drivers/nv50/nv50_vbo.c
index bc01e69decf..323677eaf80 100644
--- a/src/gallium/drivers/nv50/nv50_vbo.c
+++ b/src/gallium/drivers/nv50/nv50_vbo.c
@@ -405,6 +405,25 @@ nv50_prim_gl(unsigned prim)
    }
 }
 
+/* For pre-nva0 transform feedback: prim_size per PIPE_PRIM_* mode, used to turn a buffer size into a primitive limit. */
+static const uint8_t nv50_pipe_prim_to_prim_size[PIPE_PRIM_MAX + 1] = /* entries not listed below are zero-initialized */
+{
+   [PIPE_PRIM_POINTS] = 1,
+   [PIPE_PRIM_LINES] = 2,
+   [PIPE_PRIM_LINE_LOOP] = 2,
+   [PIPE_PRIM_LINE_STRIP] = 2,
+   [PIPE_PRIM_TRIANGLES] = 3,
+   [PIPE_PRIM_TRIANGLE_STRIP] = 3,
+   [PIPE_PRIM_TRIANGLE_FAN] = 3,
+   [PIPE_PRIM_QUADS] = 3, /* quads, quad strips and polygons count as 3 (presumably captured as triangles) */
+   [PIPE_PRIM_QUAD_STRIP] = 3,
+   [PIPE_PRIM_POLYGON] = 3,
+   [PIPE_PRIM_LINES_ADJACENCY] = 2, /* adjacency modes use the size of the base primitive */
+   [PIPE_PRIM_LINE_STRIP_ADJACENCY] = 2,
+   [PIPE_PRIM_TRIANGLES_ADJACENCY] = 3,
+   [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = 3
+};
+
 static void
 nv50_draw_arrays(struct nv50_context *nv50,
                  unsigned mode, unsigned start, unsigned count,
@@ -623,6 +642,51 @@ nv50_draw_elements(struct nv50_context *nv50, boolean shorten,
    }
 }
 
+static void
+nva0_draw_stream_output(struct nv50_context *nv50,
+                        const struct pipe_draw_info *info)
+{ /* Draw with the byte count taken from the stream output target's offset query (NVA0+ only). */
+   struct nouveau_pushbuf *push = nv50->base.pushbuf;
+   struct nv50_so_target *so = nv50_so_target(info->count_from_stream_output);
+   struct nv04_resource *res = nv04_resource(so->pipe.buffer);
+   unsigned num_instances = info->instance_count;
+   unsigned mode = nv50_prim_gl(info->mode);
+
+   if (unlikely(nv50->screen->base.class_3d < NVA0_3D_CLASS)) {
+      /* A proper implementation without waiting doesn't seem possible,
+       * so don't bother.
+       */
+      NOUVEAU_ERR("draw_stream_output not supported on pre-NVA0 cards\n");
+      return;
+   }
+
+   if (res->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) { /* buffer was flagged as being written by the GPU */
+      res->status &= ~NOUVEAU_BUFFER_STATUS_GPU_WRITING;
+      PUSH_SPACE(push, 4);
+      BEGIN_NV04(push, SUBC_3D(NV50_GRAPH_SERIALIZE), 1);
+      PUSH_DATA (push, 0);
+      BEGIN_NV04(push, NV50_3D(VERTEX_ARRAY_FLUSH), 1);
+      PUSH_DATA (push, 0);
+   }
+
+   assert(num_instances);
+   while (num_instances--) { /* pre-tested like nv50_push_vbo: a zero count emits nothing instead of wrapping when assert() is compiled out */
+      PUSH_SPACE(push, 8);
+      BEGIN_NV04(push, NV50_3D(VERTEX_BEGIN_GL), 1);
+      PUSH_DATA (push, mode);
+      BEGIN_NV04(push, NVA0_3D(DRAW_TFB_BASE), 1);
+      PUSH_DATA (push, 0);
+      BEGIN_NV04(push, NVA0_3D(DRAW_TFB_STRIDE), 1);
+      PUSH_DATA (push, 0);
+      BEGIN_NV04(push, NVA0_3D(DRAW_TFB_BYTES), 1);
+      nv50_query_pushbuf_submit(push, so->pq, 0x4); /* the data word is read from byte 4 of the query buffer */
+      BEGIN_NV04(push, NV50_3D(VERTEX_END_GL), 1);
+      PUSH_DATA (push, 0);
+
+      mode |= NV50_3D_VERTEX_BEGIN_GL_INSTANCE_NEXT; /* applies to every instance after the first */
+   }
+}
+
 static void
 nv50_draw_vbo_kick_notify(struct nouveau_pushbuf *chan)
 {
@@ -655,6 +719,9 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
    if (nv50->vbo_user && !(nv50->dirty & (NV50_NEW_VERTEX | NV50_NEW_ARRAYS)))
       nv50_update_user_vbufs(nv50);
 
+   if (unlikely(nv50->num_so_targets && !nv50->gmtyprog))
+      nv50->state.prim_size = nv50_pipe_prim_to_prim_size[info->mode];
+
    nv50_state_validate(nv50, ~0, 8); /* 8 as minimum, we use flush_notify */
 
    push->kick_notify = nv50_draw_vbo_kick_notify;
@@ -679,11 +746,7 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
       nv50->base.vbo_dirty = FALSE;
    }
 
-   if (!info->indexed) {
-      nv50_draw_arrays(nv50,
-                       info->mode, info->start, info->count,
-                       info->instance_count);
-   } else {
+   if (info->indexed) {
       boolean shorten = info->max_index <= 65535;
 
       assert(nv50->idxbuf.buffer);
@@ -713,6 +776,13 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
       nv50_draw_elements(nv50, shorten,
                          info->mode, info->start, info->count,
                          info->instance_count, info->index_bias);
+   } else
+   if (unlikely(info->count_from_stream_output)) {
+      nva0_draw_stream_output(nv50, info);
+   } else {
+      nv50_draw_arrays(nv50,
+                       info->mode, info->start, info->count,
+                       info->instance_count);
    }
    push->kick_notify = nv50_default_kick_notify;
 
diff --git a/src/gallium/drivers/nv50/nv50_winsys.h b/src/gallium/drivers/nv50/nv50_winsys.h
index b36898dabe6..145ee70cb9f 100644
--- a/src/gallium/drivers/nv50/nv50_winsys.h
+++ b/src/gallium/drivers/nv50/nv50_winsys.h
@@ -49,6 +49,7 @@ PUSH_REFN(struct nouveau_pushbuf *push, struct nouveau_bo *bo, uint32_t flags)
 
 #define SUBC_3D(m) 3, (m)
 #define NV50_3D(n) SUBC_3D(NV50_3D_##n)
+#define NVA0_3D(n) SUBC_3D(NVA0_3D_##n)
 
 #define SUBC_2D(m) 4, (m)
 #define NV50_2D(n) SUBC_2D(NV50_2D_##n)
diff --git a/src/gallium/drivers/nvc0/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nvc0/codegen/nv50_ir_emit_nvc0.cpp
index 2ca4979dc74..fbd1aa5dfc9 100644
--- a/src/gallium/drivers/nvc0/codegen/nv50_ir_emit_nvc0.cpp
+++ b/src/gallium/drivers/nvc0/codegen/nv50_ir_emit_nvc0.cpp
@@ -1020,7 +1020,7 @@ CodeEmitterNVC0::emitTEX(const TexInstruction *i)
       code[1] |= 0x02000000;
    }
 
-   if (i->tex.derivAll)
+   if (i->op != OP_TXD && i->tex.derivAll)
       code[1] |= 1 << 13;
 
    defId(i->def(0), 14);
diff --git a/src/gallium/drivers/nvc0/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nvc0/codegen/nv50_ir_lowering_nvc0.cpp
index 02ae9fd5d0e..900e998df8d 100644
--- a/src/gallium/drivers/nvc0/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nvc0/codegen/nv50_ir_lowering_nvc0.cpp
@@ -749,21 +749,22 @@ bool
 NVC0LoweringPass::handleTXD(TexInstruction *txd)
 {
    int dim = txd->tex.target.getDim();
-   int arg = txd->tex.target.getDim() + txd->tex.target.isArray();
+   int arg = txd->tex.target.getArgCount();
 
    handleTEX(txd);
-   while (txd->src(arg).exists())
+   while (txd->srcExists(arg))
       ++arg;
 
    txd->tex.derivAll = true;
-   if (dim > 2 || txd->tex.target.isShadow())
+   if (dim > 2 ||
+       txd->tex.target.isCube() ||
+       arg > 4 ||
+       txd->tex.target.isShadow())
       return handleManualTXD(txd);
 
-   assert(arg <= 4); // at most s/t/array, x, y, offset
-
    for (int c = 0; c < dim; ++c) {
-      txd->src(arg + c * 2 + 0).set(txd->dPdx[c]);
-      txd->src(arg + c * 2 + 1).set(txd->dPdy[c]);
+      txd->setSrc(arg + c * 2 + 0, txd->dPdx[c]);
+      txd->setSrc(arg + c * 2 + 1, txd->dPdy[c]);
       txd->dPdx[c].set(NULL);
       txd->dPdy[c].set(NULL);
    }
diff --git a/src/gallium/drivers/nvc0/codegen/nv50_ir_target_nvc0.cpp b/src/gallium/drivers/nvc0/codegen/nv50_ir_target_nvc0.cpp
index 10c2d09d657..e4b9dc18311 100644
--- a/src/gallium/drivers/nvc0/codegen/nv50_ir_target_nvc0.cpp
+++ b/src/gallium/drivers/nvc0/codegen/nv50_ir_target_nvc0.cpp
@@ -223,6 +223,9 @@ static const struct opProperties _initProps[] =
    { OP_ABS,    0x0, 0x0, 0x0, 0x0, 0x1, 0x0 },
    { OP_NEG,    0x0, 0x1, 0x0, 0x0, 0x1, 0x0 },
    { OP_CVT,    0x1, 0x1, 0x0, 0x8, 0x1, 0x0 },
+   { OP_CEIL,   0x1, 0x1, 0x0, 0x8, 0x1, 0x0 },
+   { OP_FLOOR,  0x1, 0x1, 0x0, 0x8, 0x1, 0x0 },
+   { OP_TRUNC,  0x1, 0x1, 0x0, 0x8, 0x1, 0x0 },
    { OP_AND,    0x0, 0x0, 0x3, 0x0, 0x2, 0x2 | 0x8 },
    { OP_OR,     0x0, 0x0, 0x3, 0x0, 0x2, 0x2 | 0x8 },
    { OP_XOR,    0x0, 0x0, 0x3, 0x0, 0x2, 0x2 | 0x8 },
diff --git a/src/gallium/drivers/r300/r300_vs_draw.c b/src/gallium/drivers/r300/r300_vs_draw.c
index 69d67585d8b..b9e73dd514b 100644
--- a/src/gallium/drivers/r300/r300_vs_draw.c
+++ b/src/gallium/drivers/r300/r300_vs_draw.c
@@ -94,11 +94,12 @@ static void emit_output(struct tgsi_transform_context *ctx,
 
     decl = tgsi_default_full_declaration();
     decl.Declaration.File = TGSI_FILE_OUTPUT;
-    decl.Declaration.Interpolate = interp;
+    decl.Declaration.Interpolate = 1;
     decl.Declaration.Semantic = TRUE;
     decl.Semantic.Name = name;
     decl.Semantic.Index = index;
     decl.Range.First = decl.Range.Last = reg;
+    decl.Interp.Interpolate = interp;
     ctx->emit_declaration(ctx, &decl);
     ++vsctx->num_outputs;
 }
diff --git a/src/gallium/drivers/r600/Makefile.am b/src/gallium/drivers/r600/Makefile.am
index 3089a829e53..77d2674d262 100644
--- a/src/gallium/drivers/r600/Makefile.am
+++ b/src/gallium/drivers/r600/Makefile.am
@@ -29,7 +29,7 @@ libr600_a_SOURCES += \
 	$(LLVM_C_SOURCES)
 
 libr600_a_LIBADD = \
-	$(top_srcdir)/src/gallium/drivers/radeon/libradeon.a
+	$(top_builddir)/src/gallium/drivers/radeon/libradeon.a
 
 AM_CFLAGS += \
 	$(LLVM_CFLAGS) \
diff --git a/src/gallium/drivers/r600/eg_asm.c b/src/gallium/drivers/r600/eg_asm.c
index b6d03ef37de..d2c1679796a 100644
--- a/src/gallium/drivers/r600/eg_asm.c
+++ b/src/gallium/drivers/r600/eg_asm.c
@@ -133,6 +133,10 @@ int eg_bytecode_cf_build(struct r600_bytecode *bc, struct r600_bytecode_cf *cf)
 					S_SQ_CF_WORD1_COND(cf->cond) |
 					S_SQ_CF_WORD1_POP_COUNT(cf->pop_count);
 		break;
+	case CF_NATIVE:
+		bc->bytecode[id++] = cf->isa[0];
+		bc->bytecode[id++] = cf->isa[1];
+		break;
 	default:
 		R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
 		return -EINVAL;
diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c
index 517121dc288..81aedb5c0ac 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -796,11 +796,11 @@ static void *evergreen_create_dsa_state(struct pipe_context *ctx,
 		alpha_test_control |= S_028410_ALPHA_TEST_ENABLE(1);
 		alpha_ref = fui(state->alpha.ref_value);
 	}
+	dsa->sx_alpha_test_control = alpha_test_control & 0xff;
 	dsa->alpha_ref = alpha_ref;
 
 	/* misc */
 	db_render_control = 0;
-	r600_pipe_state_add_reg(rstate, R_028410_SX_ALPHA_TEST_CONTROL, alpha_test_control);
 	r600_pipe_state_add_reg(rstate, R_028800_DB_DEPTH_CONTROL, db_depth_control);
 	r600_pipe_state_add_reg(rstate, R_028000_DB_RENDER_CONTROL, db_render_control);
 	return rstate;
@@ -1428,6 +1428,11 @@ static void evergreen_cb(struct r600_context *rctx, struct r600_pipe_state *rsta
 		blend_bypass = 1;
 	}
 
+	if (ntype == V_028C70_NUMBER_UINT || ntype == V_028C70_NUMBER_SINT)
+		rctx->sx_alpha_test_control |= S_028410_ALPHA_TEST_BYPASS(1);
+	else
+		rctx->sx_alpha_test_control &= C_028410_ALPHA_TEST_BYPASS;
+
 	color_info |= S_028C70_FORMAT(format) |
 		S_028C70_COMP_SWAP(swap) |
 		S_028C70_BLEND_CLAMP(blend_clamp) |
diff --git a/src/gallium/drivers/r600/evergreend.h b/src/gallium/drivers/r600/evergreend.h
index 4009e91d4fc..105d80f061d 100644
--- a/src/gallium/drivers/r600/evergreend.h
+++ b/src/gallium/drivers/r600/evergreend.h
@@ -32,20 +32,16 @@
 #define EVERGREEN_CONTEXT_REG_OFFSET                0X00028000
 #define EVERGREEN_CONTEXT_REG_END                   0X00029000
 #define EVERGREEN_RESOURCE_OFFSET                   0x00030000
-#define EVERGREEN_RESOURCE_END                      0x00034000
-#define CAYMAN_RESOURCE_END                         0x00038000
+#define EVERGREEN_RESOURCE_END                      0x00038000
 #define EVERGREEN_LOOP_CONST_OFFSET                 0x0003A200
-#define EVERGREEN_LOOP_CONST_END                    0x0003A26C
+#define EVERGREEN_LOOP_CONST_END                    0x0003A500
 #define EVERGREEN_BOOL_CONST_OFFSET                 0x0003A500
-#define EVERGREEN_BOOL_CONST_END                    0x0003A506
-#define CAYMAN_BOOL_CONST_END                       0x0003A518
+#define EVERGREEN_BOOL_CONST_END                    0x0003A518
 #define EVERGREEN_SAMPLER_OFFSET                    0X0003C000
-#define EVERGREEN_SAMPLER_END                       0X0003CFF0
-#define CAYMAN_SAMPLER_END                          0X0003C600
+#define EVERGREEN_SAMPLER_END                       0X0003C600
 
 #define EVERGREEN_CTL_CONST_OFFSET                  0x0003CFF0
-#define EVERGREEN_CTL_CONST_END                     0x0003E200
-#define CAYMAN_CTL_CONST_END                        0x0003FF0C
+#define EVERGREEN_CTL_CONST_END                     0x0003FF0C
 
 #define EVENT_TYPE_PS_PARTIAL_FLUSH            0x10
 #define EVENT_TYPE_ZPASS_DONE                  0x15
diff --git a/src/gallium/drivers/r600/r600_asm.c b/src/gallium/drivers/r600/r600_asm.c
index 651933bf37c..5a10bd90776 100644
--- a/src/gallium/drivers/r600/r600_asm.c
+++ b/src/gallium/drivers/r600/r600_asm.c
@@ -94,6 +94,7 @@ static inline unsigned int r600_bytecode_get_num_operands(struct r600_bytecode *
 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_GPR_INT:
 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT:
 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT:
+		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CEIL:
 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR:
 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC:
 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE:
@@ -153,6 +154,7 @@ static inline unsigned int r600_bytecode_get_num_operands(struct r600_bytecode *
 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_INT:
 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT:
 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE:
+		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT:
 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT:
 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE:
 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE:
@@ -171,6 +173,7 @@ static inline unsigned int r600_bytecode_get_num_operands(struct r600_bytecode *
 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV:
 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT:
 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT:
+		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CEIL:
 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR:
 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC:
 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE:
@@ -1927,6 +1930,7 @@ int r600_bytecode_build(struct r600_bytecode *bc)
 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_RETURN:
 			case CM_V_SQ_CF_WORD1_SQ_CF_INST_END:
+			case CF_NATIVE:
 				break;
 			default:
 				R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
@@ -2025,13 +2029,12 @@ int r600_bytecode_build(struct r600_bytecode *bc)
 				}
 				break;
 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_TEX:
-				if (bc->chip_class == CAYMAN) {
-					LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
-						r = r600_bytecode_vtx_build(bc, vtx, addr);
-						if (r)
-							return r;
-						addr += 4;
-					}
+				LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
+					assert(bc->chip_class >= EVERGREEN);
+					r = r600_bytecode_vtx_build(bc, vtx, addr);
+					if (r)
+						return r;
+					addr += 4;
 				}
 				LIST_FOR_EACH_ENTRY(tex, &cf->tex, list) {
 					r = r600_bytecode_tex_build(bc, tex, addr);
@@ -2069,6 +2072,8 @@ int r600_bytecode_build(struct r600_bytecode *bc)
 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_RETURN:
 			case CM_V_SQ_CF_WORD1_SQ_CF_INST_END:
 				break;
+			case CF_NATIVE:
+				break;
 			default:
 				R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
 				return -EINVAL;
@@ -2341,6 +2346,10 @@ void r600_bytecode_dump(struct r600_bytecode *bc)
 				fprintf(stderr, "COND:%X ", cf->cond);
 				fprintf(stderr, "POP_COUNT:%X\n", cf->pop_count);
 				break;
+			case CF_NATIVE:
+				fprintf(stderr, "%04d %08X CF NATIVE\n", id, bc->bytecode[id]);
+				fprintf(stderr, "%04d %08X CF NATIVE\n", id + 1, bc->bytecode[id + 1]);
+				break;
 			default:
 				R600_ERR("Unknown instruction %0x\n", cf->inst);
 			}
@@ -2477,7 +2486,8 @@ void r600_bytecode_dump(struct r600_bytecode *bc)
 			if (alu->last) {
 				for (i = 0; i < nliteral; i++, id++) {
 					float *f = (float*)(bc->bytecode + id);
-					fprintf(stderr, "%04d %08X\t%f\n", id, bc->bytecode[id], *f);
+					fprintf(stderr, "%04d %08X\t%f (%d)\n", id, bc->bytecode[id], *f,
+							*(bc->bytecode + id));
 				}
 				id += nliteral & 1;
 				nliteral = 0;
diff --git a/src/gallium/drivers/r600/r600_asm.h b/src/gallium/drivers/r600/r600_asm.h
index 5790ead991f..a8a157b79e4 100644
--- a/src/gallium/drivers/r600/r600_asm.h
+++ b/src/gallium/drivers/r600/r600_asm.h
@@ -135,6 +135,14 @@ struct r600_bytecode_kcache {
 	unsigned			addr;
 };
 
+/* A value of CF_NATIVE in r600_bytecode_cf::inst means that this instruction
+ * has already been encoded, and the encoding has been stored in
+ * r600_bytecode::isa.  This is used by the LLVM backend to emit CF instructions
+ * e.g. RAT_WRITE_* that can't be properly represented by struct
+ * r600_bytecode_cf.
+ */
+#define CF_NATIVE ~0
+
 struct r600_bytecode_cf {
 	struct list_head		list;
 
@@ -157,6 +165,7 @@ struct r600_bytecode_cf {
 	struct r600_bytecode_alu		*curr_bs_head;
 	struct r600_bytecode_alu		*prev_bs_head;
 	struct r600_bytecode_alu		*prev2_bs_head;
+	unsigned isa[2];
 };
 
 #define FC_NONE				0
diff --git a/src/gallium/drivers/r600/r600_llvm.c b/src/gallium/drivers/r600/r600_llvm.c
index d467baf60fb..f916604db7b 100644
--- a/src/gallium/drivers/r600/r600_llvm.c
+++ b/src/gallium/drivers/r600/r600_llvm.c
@@ -21,10 +21,44 @@ static LLVMValueRef llvm_fetch_const(
 	enum tgsi_opcode_type type,
 	unsigned swizzle)
 {
-	return lp_build_intrinsic_unary(bld_base->base.gallivm->builder,
+	LLVMValueRef cval = lp_build_intrinsic_unary(bld_base->base.gallivm->builder,
 		"llvm.AMDGPU.load.const", bld_base->base.elem_type,
 		lp_build_const_int32(bld_base->base.gallivm,
 		radeon_llvm_reg_index_soa(reg->Register.Index, swizzle)));
+
+	return bitcast(bld_base, type, cval);
+}
+
+static void llvm_load_system_value(
+		struct radeon_llvm_context * ctx,
+		unsigned index,
+		const struct tgsi_full_declaration *decl)
+{
+	unsigned chan;
+
+	switch (decl->Semantic.Name) {
+	case TGSI_SEMANTIC_INSTANCEID: chan = 3; break;
+	case TGSI_SEMANTIC_VERTEXID: chan = 0; break;
+	default: assert(!"unknown system value");
+	}
+
+	LLVMValueRef reg = lp_build_const_int32(
+			ctx->soa.bld_base.base.gallivm, chan);
+	ctx->system_values[index] = lp_build_intrinsic_unary(
+			ctx->soa.bld_base.base.gallivm->builder,
+			"llvm.R600.load.input",
+			ctx->soa.bld_base.base.elem_type, reg);
+}
+
+static LLVMValueRef llvm_fetch_system_value(
+		struct lp_build_tgsi_context * bld_base,
+		const struct tgsi_full_src_register *reg,
+		enum tgsi_opcode_type type,
+		unsigned swizzle)
+{
+	struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base);
+	LLVMValueRef cval = ctx->system_values[reg->Register.Index];
+	return bitcast(bld_base, type, cval);
 }
 
 static void llvm_load_input(
@@ -59,17 +93,13 @@ static void llvm_emit_prologue(struct lp_build_tgsi_context * bld_base)
 	for (i = 0; i < ctx->reserved_reg_count; i++) {
 		unsigned chan;
 		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
-			LLVMValueRef reg;
 			LLVMValueRef reg_index = lp_build_const_int32(
 					base->gallivm,
 					radeon_llvm_reg_index_soa(i, chan));
-			reg = lp_build_intrinsic_unary(base->gallivm->builder,
-						"llvm.AMDGPU.reserve.reg",
-						base->elem_type, reg_index);
 			lp_build_intrinsic_unary(base->gallivm->builder,
-				"llvm.AMDGPU.export.reg",
+				"llvm.AMDGPU.reserve.reg",
 				LLVMVoidTypeInContext(base->gallivm->context),
-				reg);
+				reg_index);
 		}
 	}
 }
@@ -85,7 +115,6 @@ static void llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base)
 		unsigned chan;
 		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
 			LLVMValueRef output;
-			LLVMValueRef store_output;
 			unsigned adjusted_reg_idx = i +
 					ctx->reserved_reg_count;
 			LLVMValueRef reg_index = lp_build_const_int32(
@@ -95,16 +124,11 @@ static void llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base)
 			output = LLVMBuildLoad(base->gallivm->builder,
 				ctx->soa.outputs[i][chan], "");
 
-			store_output = lp_build_intrinsic_binary(
+			lp_build_intrinsic_binary(
 				base->gallivm->builder,
 				"llvm.AMDGPU.store.output",
-				base->elem_type,
-				output, reg_index);
-
-			lp_build_intrinsic_unary(base->gallivm->builder,
-				"llvm.AMDGPU.export.reg",
 				LLVMVoidTypeInContext(base->gallivm->context),
-				store_output);
+				output, reg_index);
 		}
 	}
 }
@@ -169,28 +193,7 @@ static struct lp_build_tgsi_action dot_action = {
 	.intr_name = "llvm.AMDGPU.dp4"
 };
 
-static void txp_fetch_args(
-	struct lp_build_tgsi_context * bld_base,
-	struct lp_build_emit_data * emit_data)
-{
-	LLVMValueRef src_w;
-	unsigned chan;
-	LLVMValueRef coords[4];
-
-	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
-	src_w = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_W);
 
-	for (chan = 0; chan < 3; chan++ ) {
-		LLVMValueRef arg = lp_build_emit_fetch(bld_base,
-						emit_data->inst, 0, chan);
-		coords[chan] = lp_build_emit_llvm_binary(bld_base,
-					TGSI_OPCODE_DIV, arg, src_w);
-	}
-	coords[3] = bld_base->base.one;
-	emit_data->args[0] = lp_build_gather_values(bld_base->base.gallivm,
-						coords, 4);
-	emit_data->arg_count = 1;
-}
 
 LLVMModuleRef r600_tgsi_llvm(
 	struct radeon_llvm_context * ctx,
@@ -204,20 +207,25 @@ LLVMModuleRef r600_tgsi_llvm(
 	bld_base->info = &shader_info;
 	bld_base->userdata = ctx;
 	bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = llvm_fetch_const;
+	bld_base->emit_fetch_funcs[TGSI_FILE_SYSTEM_VALUE] = llvm_fetch_system_value;
 	bld_base->emit_prologue = llvm_emit_prologue;
 	bld_base->emit_epilogue = llvm_emit_epilogue;
 	ctx->userdata = ctx;
 	ctx->load_input = llvm_load_input;
+	ctx->load_system_value = llvm_load_system_value;
 
 	bld_base->op_actions[TGSI_OPCODE_DP2] = dot_action;
 	bld_base->op_actions[TGSI_OPCODE_DP3] = dot_action;
 	bld_base->op_actions[TGSI_OPCODE_DP4] = dot_action;
 	bld_base->op_actions[TGSI_OPCODE_DPH] = dot_action;
+	bld_base->op_actions[TGSI_OPCODE_DDX].emit = llvm_emit_tex;
+	bld_base->op_actions[TGSI_OPCODE_DDY].emit = llvm_emit_tex;
 	bld_base->op_actions[TGSI_OPCODE_TEX].emit = llvm_emit_tex;
 	bld_base->op_actions[TGSI_OPCODE_TXB].emit = llvm_emit_tex;
 	bld_base->op_actions[TGSI_OPCODE_TXD].emit = llvm_emit_tex;
 	bld_base->op_actions[TGSI_OPCODE_TXL].emit = llvm_emit_tex;
-	bld_base->op_actions[TGSI_OPCODE_TXP].fetch_args = txp_fetch_args;
+	bld_base->op_actions[TGSI_OPCODE_TXF].emit = llvm_emit_tex;
+	bld_base->op_actions[TGSI_OPCODE_TXQ].emit = llvm_emit_tex;
 	bld_base->op_actions[TGSI_OPCODE_TXP].emit = llvm_emit_tex;
 
 	lp_build_tgsi_llvm(bld_base, tokens);
diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h
index 63fc27564d7..db455f021ad 100644
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -161,6 +161,7 @@ struct r600_pipe_dsa {
 	ubyte				valuemask[2];
 	ubyte				writemask[2];
 	bool				is_flush;
+	unsigned                        sx_alpha_test_control;
 };
 
 struct r600_vertex_element
@@ -250,6 +251,7 @@ struct r600_context {
 	struct pipe_framebuffer_state	framebuffer;
 	unsigned			cb_target_mask;
 	unsigned			fb_cb_shader_mask;
+	unsigned			sx_alpha_test_control;
 	unsigned			cb_shader_mask;
 	unsigned			cb_color_control;
 	unsigned			pa_sc_line_stipple;
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index 5e22b35ba48..cd78104a010 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -287,6 +287,7 @@ static void llvm_if(struct r600_shader_ctx *ctx, struct r600_bytecode_alu * alu,
 {
 	alu->inst = pred_inst; 
 	alu->predicate = 1;
+	alu->dst.write = 0;
 	alu->src[1].sel = V_SQ_ALU_SRC_0;
 	alu->src[1].chan = 0;
 	alu->last = 1;
@@ -362,6 +363,10 @@ static unsigned r600_fc_from_byte_stream(struct r600_shader_ctx *ctx,
 			tgsi_loop_brk_cont(ctx);
 		}
 		break;
+	case 8:
+		r600_break_from_byte_stream(ctx, &alu,
+			CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT));
+		break;
 	}
 
 	return bytes_read;
@@ -401,10 +406,43 @@ static unsigned r600_tex_from_byte_stream(struct r600_shader_ctx *ctx,
 	return bytes_read;
 }
 
+static int r600_vtx_from_byte_stream(struct r600_shader_ctx *ctx,
+	unsigned char * bytes, unsigned bytes_read)
+{
+	struct r600_bytecode_vtx vtx;
+	memset(&vtx, 0, sizeof(vtx));
+	vtx.inst = bytes[bytes_read++];
+	vtx.fetch_type = bytes[bytes_read++];
+	vtx.buffer_id = bytes[bytes_read++];
+	vtx.src_gpr = bytes[bytes_read++];
+	vtx.src_sel_x = bytes[bytes_read++];
+	vtx.mega_fetch_count = bytes[bytes_read++];
+	vtx.dst_gpr = bytes[bytes_read++];
+	vtx.dst_sel_x = bytes[bytes_read++];
+	vtx.dst_sel_y = bytes[bytes_read++];
+	vtx.dst_sel_z = bytes[bytes_read++];
+	vtx.dst_sel_w = bytes[bytes_read++];
+	vtx.use_const_fields = bytes[bytes_read++];
+	vtx.data_format = bytes[bytes_read++];
+	vtx.num_format_all = bytes[bytes_read++];
+	vtx.format_comp_all = bytes[bytes_read++];
+	vtx.srf_mode_all = bytes[bytes_read++];
+	vtx.offset = bytes[bytes_read++];
+	vtx.endian = bytes[bytes_read++];
+
+	if (r600_bytecode_add_vtx(ctx->bc, &vtx)) {
+		fprintf(stderr, "Error adding vtx\n");
+	}
+	/* Use the Texture Cache */
+	ctx->bc->cf_last->inst = EG_V_SQ_CF_WORD1_SQ_CF_INST_TEX;
+	return bytes_read;
+}
+
 static void r600_bytecode_from_byte_stream(struct r600_shader_ctx *ctx,
 				unsigned char * bytes,	unsigned num_bytes)
 {
 	unsigned bytes_read = 0;
+	unsigned i, byte;
 	while (bytes_read < num_bytes) {
 		char inst_type = bytes[bytes_read++];
 		switch (inst_type) {
@@ -420,6 +458,20 @@ static void r600_bytecode_from_byte_stream(struct r600_shader_ctx *ctx,
 			bytes_read = r600_fc_from_byte_stream(ctx, bytes,
 								bytes_read);
 			break;
+		case 3:
+			r600_bytecode_add_cfinst(ctx->bc, CF_NATIVE);
+			for (i = 0; i < 2; i++) {
+				for (byte = 0 ; byte < 4; byte++) {
+					ctx->bc->cf_last->isa[i] |=
+					(bytes[bytes_read++] << (byte * 8));
+				}
+			}
+			break;
+
+		case 4:
+			bytes_read = r600_vtx_from_byte_stream(ctx, bytes,
+								bytes_read);
+			break;
 		default:
 			/* XXX: Error here */
 			break;
@@ -670,8 +722,8 @@ static int tgsi_declaration(struct r600_shader_ctx *ctx)
 		ctx->shader->input[i].name = d->Semantic.Name;
 		ctx->shader->input[i].sid = d->Semantic.Index;
 		ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]);
-		ctx->shader->input[i].interpolate = d->Declaration.Interpolate;
-		ctx->shader->input[i].centroid = d->Declaration.Centroid;
+		ctx->shader->input[i].interpolate = d->Interp.Interpolate;
+		ctx->shader->input[i].centroid = d->Interp.Centroid;
 		ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First;
 		if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
 			switch (ctx->shader->input[i].name) {
@@ -697,7 +749,7 @@ static int tgsi_declaration(struct r600_shader_ctx *ctx)
 		ctx->shader->output[i].sid = d->Semantic.Index;
 		ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
 		ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First;
-		ctx->shader->output[i].interpolate = d->Declaration.Interpolate;
+		ctx->shader->output[i].interpolate = d->Interp.Interpolate;
 		ctx->shader->output[i].write_mask = d->Declaration.UsageMask;
 		if (ctx->type == TGSI_PROCESSOR_VERTEX) {
 			switch (d->Semantic.Name) {
@@ -5102,7 +5154,7 @@ static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
 	{80,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{TGSI_OPCODE_PUSHA,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{TGSI_OPCODE_POPA,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
-	{TGSI_OPCODE_CEIL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
+	{TGSI_OPCODE_CEIL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CEIL, tgsi_op2},
 	{TGSI_OPCODE_I2F,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT, tgsi_op2_trans},
 	{TGSI_OPCODE_NOT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOT_INT, tgsi_op2},
 	{TGSI_OPCODE_TRUNC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC, tgsi_op2},
@@ -5168,16 +5220,16 @@ static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
 	{TGSI_OPCODE_CASE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{TGSI_OPCODE_DEFAULT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{TGSI_OPCODE_ENDSWITCH,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
-	{TGSI_OPCODE_LOAD,      0, 0, tgsi_unsupported},
-	{TGSI_OPCODE_LOAD_MS,   0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_SAMPLE,    0, 0, tgsi_unsupported},
+	{TGSI_OPCODE_SAMPLE_I,  0, 0, tgsi_unsupported},
+	{TGSI_OPCODE_SAMPLE_I_MS, 0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_SAMPLE_B,  0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_SAMPLE_C,  0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_SAMPLE_C_LZ, 0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_SAMPLE_D,  0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_SAMPLE_L,  0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_GATHER4,   0, 0, tgsi_unsupported},
-	{TGSI_OPCODE_RESINFO,	0, 0, tgsi_unsupported},
+	{TGSI_OPCODE_SVIEWINFO,	0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_SAMPLE_POS, 0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_SAMPLE_INFO, 0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_UARL,      0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT, tgsi_r600_arl},
@@ -5276,7 +5328,7 @@ static struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
 	{80,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{TGSI_OPCODE_PUSHA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{TGSI_OPCODE_POPA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
-	{TGSI_OPCODE_CEIL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
+	{TGSI_OPCODE_CEIL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CEIL, tgsi_op2},
 	{TGSI_OPCODE_I2F,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT, tgsi_op2_trans},
 	{TGSI_OPCODE_NOT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOT_INT, tgsi_op2},
 	{TGSI_OPCODE_TRUNC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC, tgsi_op2},
@@ -5342,16 +5394,16 @@ static struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
 	{TGSI_OPCODE_CASE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{TGSI_OPCODE_DEFAULT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{TGSI_OPCODE_ENDSWITCH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
-	{TGSI_OPCODE_LOAD,      0, 0, tgsi_unsupported},
-	{TGSI_OPCODE_LOAD_MS,   0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_SAMPLE,    0, 0, tgsi_unsupported},
+	{TGSI_OPCODE_SAMPLE_I,      0, 0, tgsi_unsupported},
+	{TGSI_OPCODE_SAMPLE_I_MS,   0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_SAMPLE_B,  0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_SAMPLE_C,  0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_SAMPLE_C_LZ, 0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_SAMPLE_D,  0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_SAMPLE_L,  0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_GATHER4,   0, 0, tgsi_unsupported},
-	{TGSI_OPCODE_RESINFO,	0, 0, tgsi_unsupported},
+	{TGSI_OPCODE_SVIEWINFO,	0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_SAMPLE_POS, 0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_SAMPLE_INFO, 0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_UARL,      0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT, tgsi_eg_arl},
@@ -5450,7 +5502,7 @@ static struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = {
 	{80,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{TGSI_OPCODE_PUSHA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{TGSI_OPCODE_POPA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
-	{TGSI_OPCODE_CEIL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
+	{TGSI_OPCODE_CEIL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CEIL, tgsi_op2},
 	{TGSI_OPCODE_I2F,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT, tgsi_op2},
 	{TGSI_OPCODE_NOT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOT_INT, tgsi_op2},
 	{TGSI_OPCODE_TRUNC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC, tgsi_op2},
@@ -5516,16 +5568,16 @@ static struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = {
 	{TGSI_OPCODE_CASE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{TGSI_OPCODE_DEFAULT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{TGSI_OPCODE_ENDSWITCH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
-	{TGSI_OPCODE_LOAD,      0, 0, tgsi_unsupported},
-	{TGSI_OPCODE_LOAD_MS,   0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_SAMPLE,    0, 0, tgsi_unsupported},
+	{TGSI_OPCODE_SAMPLE_I,      0, 0, tgsi_unsupported},
+	{TGSI_OPCODE_SAMPLE_I_MS,   0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_SAMPLE_B,  0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_SAMPLE_C,  0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_SAMPLE_C_LZ, 0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_SAMPLE_D,  0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_SAMPLE_L,  0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_GATHER4,   0, 0, tgsi_unsupported},
-	{TGSI_OPCODE_RESINFO,	0, 0, tgsi_unsupported},
+	{TGSI_OPCODE_SVIEWINFO,	0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_SAMPLE_POS, 0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_SAMPLE_INFO, 0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_UARL,      0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT, tgsi_eg_arl},
diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c
index 3a83b613e58..acf59f80bf4 100644
--- a/src/gallium/drivers/r600/r600_state.c
+++ b/src/gallium/drivers/r600/r600_state.c
@@ -805,9 +805,9 @@ static void *r600_create_dsa_state(struct pipe_context *ctx,
 		alpha_test_control |= S_028410_ALPHA_TEST_ENABLE(1);
 		alpha_ref = fui(state->alpha.ref_value);
 	}
+	dsa->sx_alpha_test_control = alpha_test_control & 0xff;
 	dsa->alpha_ref = alpha_ref;
 
-	r600_pipe_state_add_reg(rstate, R_028410_SX_ALPHA_TEST_CONTROL, alpha_test_control);
 	r600_pipe_state_add_reg(rstate, R_028800_DB_DEPTH_CONTROL, db_depth_control);
 	return rstate;
 }
@@ -1466,6 +1466,11 @@ static void r600_cb(struct r600_context *rctx, struct r600_pipe_state *rstate,
 		blend_bypass = 1;
 	}
 
+	if (ntype == V_0280A0_NUMBER_UINT || ntype == V_0280A0_NUMBER_SINT)
+		rctx->sx_alpha_test_control |= S_028410_ALPHA_TEST_BYPASS(1);
+	else
+		rctx->sx_alpha_test_control &= C_028410_ALPHA_TEST_BYPASS;
+
 	color_info |= S_0280A0_FORMAT(format) |
 		S_0280A0_COMP_SWAP(swap) |
 		S_0280A0_BLEND_BYPASS(blend_bypass) |
diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c
index ccae7d91d43..d47383558d9 100644
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -244,6 +244,8 @@ void r600_bind_dsa_state(struct pipe_context *ctx, void *state)
 		return;
 	rstate = &dsa->rstate;
 	rctx->states[rstate->id] = rstate;
+	rctx->sx_alpha_test_control &= ~0xff;
+	rctx->sx_alpha_test_control |= dsa->sx_alpha_test_control;
 	rctx->alpha_ref = dsa->alpha_ref;
 	rctx->alpha_ref_dirty = true;
 	r600_context_pipe_state_set(rctx, rstate);
@@ -796,6 +798,7 @@ void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *dinfo)
 		r600_pipe_state_add_reg(&rctx->vgt, R_02823C_CB_SHADER_MASK, 0);
 		r600_pipe_state_add_reg(&rctx->vgt, R_028408_VGT_INDX_OFFSET, info.index_bias);
 		r600_pipe_state_add_reg(&rctx->vgt, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX, info.restart_index);
+		r600_pipe_state_add_reg(&rctx->vgt, R_028410_SX_ALPHA_TEST_CONTROL, 0);
 		r600_pipe_state_add_reg(&rctx->vgt, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN, info.primitive_restart);
 		r600_pipe_state_add_reg(&rctx->vgt, R_03CFF4_SQ_VTX_START_INST_LOC, info.start_instance);
 		r600_pipe_state_add_reg(&rctx->vgt, R_028A0C_PA_SC_LINE_STIPPLE, 0);
@@ -817,6 +820,7 @@ void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *dinfo)
 	r600_pipe_state_mod_reg(&rctx->vgt, rctx->cb_shader_mask);
 	r600_pipe_state_mod_reg(&rctx->vgt, info.index_bias);
 	r600_pipe_state_mod_reg(&rctx->vgt, info.restart_index);
+	r600_pipe_state_mod_reg(&rctx->vgt, rctx->sx_alpha_test_control);
 	r600_pipe_state_mod_reg(&rctx->vgt, info.primitive_restart);
 	r600_pipe_state_mod_reg(&rctx->vgt, info.start_instance);
 
diff --git a/src/gallium/drivers/radeon/AMDGPU.h b/src/gallium/drivers/radeon/AMDGPU.h
index eff002a5eae..0f42cb744d3 100644
--- a/src/gallium/drivers/radeon/AMDGPU.h
+++ b/src/gallium/drivers/radeon/AMDGPU.h
@@ -1,4 +1,4 @@
-//===-- AMDGPU.h - TODO: Add brief description -------===//
+//===-- AMDGPU.h - MachineFunction passes hw codegen --------------*- C++ -*-=//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -6,10 +6,6 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
-//
-// TODO: Add full description
-//
-//===----------------------------------------------------------------------===//
 
 #ifndef AMDGPU_H
 #define AMDGPU_H
@@ -19,29 +15,24 @@
 #include "llvm/Target/TargetMachine.h"
 
 namespace llvm {
-    class FunctionPass;
-    class AMDGPUTargetMachine;
-
-    FunctionPass *createR600CodeEmitterPass(formatted_raw_ostream &OS);
-    FunctionPass *createR600LowerShaderInstructionsPass(TargetMachine &tm);
-    FunctionPass *createR600LowerInstructionsPass(TargetMachine &tm);
-
-    FunctionPass *createSIAssignInterpRegsPass(TargetMachine &tm);
-    FunctionPass *createSIInitMachineFunctionInfoPass(TargetMachine &tm);
-    FunctionPass *createSILowerShaderInstructionsPass(TargetMachine &tm);
-    FunctionPass *createSIPropagateImmReadsPass(TargetMachine &tm);
-    FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
 
-    FunctionPass *createAMDGPUReorderPreloadInstructionsPass(TargetMachine &tm);
+class FunctionPass;
+class AMDGPUTargetMachine;
 
-    FunctionPass *createAMDGPULowerInstructionsPass(TargetMachine &tm);
-    FunctionPass *createAMDGPULowerShaderInstructionsPass(TargetMachine &tm);
+// R600 Passes
+FunctionPass* createR600KernelParametersPass(const TargetData* TD);
+FunctionPass *createR600CodeEmitterPass(formatted_raw_ostream &OS);
+FunctionPass *createR600LowerInstructionsPass(TargetMachine &tm);
 
-    FunctionPass *createAMDGPUDelimitInstGroupsPass(TargetMachine &tm);
+// SI Passes
+FunctionPass *createSIAssignInterpRegsPass(TargetMachine &tm);
+FunctionPass *createSIPropagateImmReadsPass(TargetMachine &tm);
+FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
 
-    FunctionPass *createAMDGPUConvertToISAPass(TargetMachine &tm);
+// Passes common to R600 and SI
+FunctionPass *createAMDGPULowerInstructionsPass(TargetMachine &tm);
+FunctionPass *createAMDGPUConvertToISAPass(TargetMachine &tm);
 
-    FunctionPass *createAMDGPUFixRegClassesPass(TargetMachine &tm);
+} // End namespace llvm
 
-} /* End namespace llvm */
-#endif /* AMDGPU_H */
+#endif // AMDGPU_H
diff --git a/src/gallium/drivers/radeon/AMDGPUConstants.pm b/src/gallium/drivers/radeon/AMDGPUConstants.pm
deleted file mode 100644
index b64ff49c187..00000000000
--- a/src/gallium/drivers/radeon/AMDGPUConstants.pm
+++ /dev/null
@@ -1,44 +0,0 @@
-#===-- AMDGPUConstants.pm - TODO: Add brief description -------===#
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-#===----------------------------------------------------------------------===#
-#
-# TODO: Add full description
-#
-#===----------------------------------------------------------------------===#
-
-package AMDGPUConstants;
-
-use base 'Exporter';
-
-use constant CONST_REG_COUNT => 256;
-use constant TEMP_REG_COUNT => 128;
-
-our @EXPORT = ('TEMP_REG_COUNT', 'CONST_REG_COUNT', 'get_hw_index', 'get_chan_str');
-
-sub get_hw_index {
-  my ($index) = @_;
-  return int($index / 4);
-}
-
-sub get_chan_str {
-  my ($index) = @_;
-  my $chan = $index % 4;
-  if ($chan == 0 )  {
-    return 'X';
-  } elsif ($chan == 1) {
-    return 'Y';
-  } elsif ($chan == 2) {
-    return 'Z';
-  } elsif ($chan == 3) {
-    return 'W';
-  } else {
-    die("Unknown chan value: $chan");
-  }
-}
-
-1;
diff --git a/src/gallium/drivers/radeon/AMDGPUConvertToISA.cpp b/src/gallium/drivers/radeon/AMDGPUConvertToISA.cpp
index ce947f8ff78..8e82b8438bb 100644
--- a/src/gallium/drivers/radeon/AMDGPUConvertToISA.cpp
+++ b/src/gallium/drivers/radeon/AMDGPUConvertToISA.cpp
@@ -34,7 +34,7 @@ namespace {
     virtual bool runOnMachineFunction(MachineFunction &MF);
 
   };
-} /* End anonymous namespace */
+} // End anonymous namespace
 
 char AMDGPUConvertToISAPass::ID = 0;
 
diff --git a/src/gallium/drivers/radeon/AMDGPUGenInstrEnums.pl b/src/gallium/drivers/radeon/AMDGPUGenInstrEnums.pl
index 1fd4fb04b3e..130eaac72bc 100644
--- a/src/gallium/drivers/radeon/AMDGPUGenInstrEnums.pl
+++ b/src/gallium/drivers/radeon/AMDGPUGenInstrEnums.pl
@@ -1,15 +1,32 @@
-#===-- AMDGPUGenInstrEnums.pl - TODO: Add brief description -------===#
+#===-- AMDGPUGenInstrEnums.pl - Script for generating instruction enums ----===#
 #
 #                     The LLVM Compiler Infrastructure
 #
 # This file is distributed under the University of Illinois Open Source
 # License. See LICENSE.TXT for details.
 #
-#===----------------------------------------------------------------------===#
+#===-----------------------------------------------------------------------===#
 #
-# TODO: Add full description
+# This perl script is used to generate the following files:
 #
-#===----------------------------------------------------------------------===#
+# 1. perl AMDGPUGenInstrEnums.pl td  > AMDGPUInstrEnums.td
+#
+#    This file contains Tablegen constants used for matching hw instructions
+#    from R600 and SI with functionally similar AMDIL instruction.  It aslo
+#    contains definitions of floating point constants like pi (in hex notation)
+#    that are used in some of the shader patterns.
+#
+# 2. perl AMDGPUGenInstrEnums.pl h   > AMDGPUInstrEnums.h
+#
+#    This file contains cpp enums that match the constant values in
+#    AMDGPUInstrEnums.td
+#
+# 3. perl AMDGPUGenInstrEnums.pl inc > AMDGPUInstrEnums.include
+#
+#    This file contains a function called GetRealAMDILOpcode which maps the
+#    constant values defined in AMDGPUInstrEnums.h to the corresponding AMDIL
+#    instructions.
+#===-----------------------------------------------------------------------===#
 
 use warnings;
 use strict;
@@ -41,7 +58,7 @@ my $FILE_TYPE = $ARGV[0];
 
 open AMDIL, '<', 'AMDILInstructions.td';
 
-my @INST_ENUMS = ('NONE', 'FEQ', 'FGE', 'FLT', 'FNE', 'MOVE_f32', 'MOVE_i32', 'FTOI', 'ITOF', 'CMOVLOG_f32', 'UGT', 'IGE', 'INE', 'UGE', 'IEQ');
+my @INST_ENUMS = ('NONE', 'FEQ', 'FGE', 'FLT', 'FNE', 'MOVE_f32', 'MOVE_i32', 'FTOI', 'ITOF', 'CMOVLOG_f32', 'UGT', 'IGE', 'INE', 'UGE', 'IEQ', 'BINARY_OR_i32', 'BINARY_NOT_i32');
 
 while (<AMDIL>) {
   if ($_ =~ /defm\s+([A-Z_]+)\s+:\s+([A-Za-z0-9]+)</) {
diff --git a/src/gallium/drivers/radeon/AMDGPUGenShaderPatterns.pl b/src/gallium/drivers/radeon/AMDGPUGenShaderPatterns.pl
deleted file mode 100644
index 60523a7b48f..00000000000
--- a/src/gallium/drivers/radeon/AMDGPUGenShaderPatterns.pl
+++ /dev/null
@@ -1,30 +0,0 @@
-#===-- AMDGPUGenShaderPatterns.pl - TODO: Add brief description -------===#
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-#===----------------------------------------------------------------------===#
-#
-# TODO: Add full description
-#
-#===----------------------------------------------------------------------===#
-
-use strict;
-use warnings;
-
-use AMDGPUConstants;
-
-my $reg_prefix = $ARGV[0];
-
-for (my $i = 0; $i < CONST_REG_COUNT * 4; $i++) {
-  my $index = get_hw_index($i);
-  my $chan = get_chan_str($i);
-print <<STRING;
-def : Pat <
-  (int_AMDGPU_load_const $i),
-  (f32 (MOV (f32 $reg_prefix$index\_$chan)))
->;
-STRING
-}
diff --git a/src/gallium/drivers/radeon/AMDGPUISelLowering.cpp b/src/gallium/drivers/radeon/AMDGPUISelLowering.cpp
index 2c1052fd8ea..2bdc8a759f2 100644
--- a/src/gallium/drivers/radeon/AMDGPUISelLowering.cpp
+++ b/src/gallium/drivers/radeon/AMDGPUISelLowering.cpp
@@ -1,4 +1,4 @@
-//===-- AMDGPUISelLowering.cpp - TODO: Add brief description -------===//
+//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// This is the parent TargetLowering class for hardware code gen targets.
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/src/gallium/drivers/radeon/AMDGPUISelLowering.h b/src/gallium/drivers/radeon/AMDGPUISelLowering.h
index 3c5beb1cdae..1b3f71006e2 100644
--- a/src/gallium/drivers/radeon/AMDGPUISelLowering.h
+++ b/src/gallium/drivers/radeon/AMDGPUISelLowering.h
@@ -1,4 +1,4 @@
-//===-- AMDGPUISelLowering.h - TODO: Add brief description -------===//
+//===-- AMDGPUISelLowering.h - AMDGPU Lowering Interface --------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// This file contains the interface definition of the TargetLowering class
+// that is common to all AMD GPUs.
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/src/gallium/drivers/radeon/AMDGPUInstrInfo.cpp b/src/gallium/drivers/radeon/AMDGPUInstrInfo.cpp
index 4742283f688..ecd8ac90526 100644
--- a/src/gallium/drivers/radeon/AMDGPUInstrInfo.cpp
+++ b/src/gallium/drivers/radeon/AMDGPUInstrInfo.cpp
@@ -108,9 +108,4 @@ unsigned AMDGPUInstrInfo::getISAOpcode(unsigned opcode) const
   }
 }
 
-bool AMDGPUInstrInfo::isRegPreload(const MachineInstr &MI) const
-{
-  return (get(MI.getOpcode()).TSFlags >> AMDGPU_TFLAG_SHIFTS::PRELOAD_REG) & 0x1;
-}
-
 #include "AMDGPUInstrEnums.include"
diff --git a/src/gallium/drivers/radeon/AMDGPUInstrInfo.h b/src/gallium/drivers/radeon/AMDGPUInstrInfo.h
index fa009bc6302..930b41e7191 100644
--- a/src/gallium/drivers/radeon/AMDGPUInstrInfo.h
+++ b/src/gallium/drivers/radeon/AMDGPUInstrInfo.h
@@ -1,4 +1,4 @@
-//===-- AMDGPUInstrInfo.h - TODO: Add brief description -------===//
+//===-- AMDGPUInstrInfo.h - AMDGPU Instruction Information ------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// This file contains the definition of a TargetInstrInfo class that is common
+// to all AMD GPUs.
 //
 //===----------------------------------------------------------------------===//
 
@@ -21,17 +22,17 @@
 
 namespace llvm {
 
-  class AMDGPUTargetMachine;
-  class MachineFunction;
-  class MachineInstr;
-  class MachineInstrBuilder;
+class AMDGPUTargetMachine;
+class MachineFunction;
+class MachineInstr;
+class MachineInstrBuilder;
 
-  class AMDGPUInstrInfo : public AMDILInstrInfo {
-  private:
+class AMDGPUInstrInfo : public AMDILInstrInfo {
+private:
   AMDGPUTargetMachine & TM;
   std::map<unsigned, unsigned> amdilToISA;
 
-  public:
+public:
   explicit AMDGPUInstrInfo(AMDGPUTargetMachine &tm);
 
   virtual const AMDGPURegisterInfo &getRegisterInfo() const = 0;
@@ -41,19 +42,9 @@ namespace llvm {
   virtual MachineInstr * convertToISA(MachineInstr & MI, MachineFunction &MF,
     DebugLoc DL) const;
 
-  bool isRegPreload(const MachineInstr &MI) const;
-
   #include "AMDGPUInstrEnums.h.include"
-  };
+};
 
 } // End llvm namespace
 
-/* AMDGPU target flags are stored in bits 32-39 */
-namespace AMDGPU_TFLAG_SHIFTS {
-  enum TFLAGS {
-    PRELOAD_REG = 32
-  };
-}
-
-
 #endif // AMDGPUINSTRINFO_H_
diff --git a/src/gallium/drivers/radeon/AMDGPUInstructions.td b/src/gallium/drivers/radeon/AMDGPUInstructions.td
index 0433c8dcd95..f689356e488 100644
--- a/src/gallium/drivers/radeon/AMDGPUInstructions.td
+++ b/src/gallium/drivers/radeon/AMDGPUInstructions.td
@@ -1,4 +1,4 @@
-//===-- AMDGPUInstructions.td - TODO: Add brief description -------===//
+//===-- AMDGPUInstructions.td - Common instruction defs ---*- tablegen -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// This file contains instruction defs that are common to all hw codegen
+// targets.
 //
 //===----------------------------------------------------------------------===//
 
@@ -16,14 +17,12 @@ include "AMDGPUInstrEnums.td"
 class AMDGPUInst <dag outs, dag ins, string asm, list<dag> pattern> : Instruction {
   field bits<16> AMDILOp = 0;
   field bits<3> Gen = 0;
-  field bit PreloadReg = 0;
 
   let Namespace = "AMDIL";
   let OutOperandList = outs;
   let InOperandList = ins;
   let AsmString = asm;
   let Pattern = pattern;
-  let TSFlags{32} = PreloadReg;
   let TSFlags{42-40} = Gen;
   let TSFlags{63-48} = AMDILOp;
 }
@@ -37,42 +36,12 @@ class AMDGPUShaderInst <dag outs, dag ins, string asm, list<dag> pattern>
 
 let isCodeGenOnly = 1 in {
 
-  def EXPORT_REG : AMDGPUShaderInst <
-    (outs),
-    (ins GPRF32:$src),
-    "EXPORT_REG $src",
-    [(int_AMDGPU_export_reg GPRF32:$src)]
-  >;
-
-  def LOAD_INPUT : AMDGPUShaderInst <
-    (outs GPRF32:$dst),
-    (ins i32imm:$src),
-    "LOAD_INPUT $dst, $src",
-    [] >{
-    let PreloadReg = 1;
-  }
-
   def MASK_WRITE : AMDGPUShaderInst <
     (outs),
     (ins GPRF32:$src),
     "MASK_WRITE $src",
     []
   >;
-
-  def RESERVE_REG : AMDGPUShaderInst <
-    (outs GPRF32:$dst),
-    (ins i32imm:$src),
-    "RESERVE_REG $dst, $src",
-    [(set GPRF32:$dst, (int_AMDGPU_reserve_reg imm:$src))]> {
-    let PreloadReg = 1;
-  }
-
-  def STORE_OUTPUT: AMDGPUShaderInst <
-    (outs GPRF32:$dst),
-    (ins GPRF32:$src0, i32imm:$src1),
-    "STORE_OUTPUT $dst, $src0, $src1",
-    [(set GPRF32:$dst, (int_AMDGPU_store_output GPRF32:$src0, imm:$src1))]
-  >;
 }
 
 /* Generic helper patterns for intrinsics */
diff --git a/src/gallium/drivers/radeon/AMDGPUIntrinsics.td b/src/gallium/drivers/radeon/AMDGPUIntrinsics.td
index d2cda0db936..398fd11431f 100644
--- a/src/gallium/drivers/radeon/AMDGPUIntrinsics.td
+++ b/src/gallium/drivers/radeon/AMDGPUIntrinsics.td
@@ -1,4 +1,4 @@
-//===-- AMDGPUIntrinsics.td - TODO: Add brief description -------===//
+//===-- AMDGPUIntrinsics.td - Common intrinsics  -*- tablegen -*-----------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,17 +7,16 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// This file defines intrinsics that are used by all hw codegen targets.
 //
 //===----------------------------------------------------------------------===//
 
 let TargetPrefix = "AMDGPU", isTarget = 1 in {
 
-  def int_AMDGPU_export_reg : Intrinsic<[], [llvm_float_ty], []>;
   def int_AMDGPU_load_const : Intrinsic<[llvm_float_ty], [llvm_i32_ty], []>;
   def int_AMDGPU_load_imm : Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty], []>;
-  def int_AMDGPU_reserve_reg : Intrinsic<[llvm_float_ty], [llvm_i32_ty], []>;
-  def int_AMDGPU_store_output : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_i32_ty], []>;
+  def int_AMDGPU_reserve_reg : Intrinsic<[], [llvm_i32_ty], []>;
+  def int_AMDGPU_store_output : Intrinsic<[], [llvm_float_ty, llvm_i32_ty], []>;
   def int_AMDGPU_swizzle : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty], []>;
 
   def int_AMDGPU_arl : Intrinsic<[llvm_i32_ty], [llvm_float_ty], []>;
@@ -26,7 +25,7 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in {
   def int_AMDGPU_div : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], []>;
   def int_AMDGPU_dp4 : Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, llvm_v4f32_ty], []>;
   def int_AMDGPU_floor : Intrinsic<[llvm_float_ty], [llvm_float_ty], []>;
-  def int_AMDGPU_kill : Intrinsic<[llvm_float_ty], [llvm_float_ty], []>;
+  def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>;
   def int_AMDGPU_kilp : Intrinsic<[], [], []>;
   def int_AMDGPU_lrp : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], []>;
   def int_AMDGPU_mul : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], []>;
@@ -35,7 +34,7 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in {
   def int_AMDGPU_rsq : Intrinsic<[llvm_float_ty], [llvm_float_ty], []>;
   def int_AMDGPU_seq : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], []>;
   def int_AMDGPU_sgt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], []>;
-  def int_AMDGPU_sge : BinaryIntFloat;
+  def int_AMDGPU_sge : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], []>;
   def int_AMDGPU_sin : Intrinsic<[llvm_float_ty], [llvm_float_ty], []>;
   def int_AMDGPU_sle : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], []>;
   def int_AMDGPU_sne : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], []>;
@@ -43,9 +42,18 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in {
   def int_AMDGPU_mullit : Intrinsic<[llvm_v4f32_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], []>;
   def int_AMDGPU_tex : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []>;
   def int_AMDGPU_txb : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []>;
+  def int_AMDGPU_txf : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []>;
+  def int_AMDGPU_txq : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []>;
   def int_AMDGPU_txd : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []>;
   def int_AMDGPU_txl : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []>;
   def int_AMDGPU_trunc : Intrinsic<[llvm_float_ty], [llvm_float_ty], []>;
+  def int_AMDGPU_ddx : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []>;
+  def int_AMDGPU_ddy : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []>;
+  def int_AMDGPU_imax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], []>;
+  def int_AMDGPU_imin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], []>;
+  def int_AMDGPU_umax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], []>;
+  def int_AMDGPU_umin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], []>;
+  def int_AMDGPU_cube : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], []>;
 }
 
 let TargetPrefix = "TGSI", isTarget = 1 in {
diff --git a/src/gallium/drivers/radeon/AMDGPULowerInstructions.cpp b/src/gallium/drivers/radeon/AMDGPULowerInstructions.cpp
index b49d0dddf65..2e455fea8ab 100644
--- a/src/gallium/drivers/radeon/AMDGPULowerInstructions.cpp
+++ b/src/gallium/drivers/radeon/AMDGPULowerInstructions.cpp
@@ -1,4 +1,4 @@
-//===-- AMDGPULowerInstructions.cpp - TODO: Add brief description -------===//
+//===-- AMDGPULowerInstructions.cpp - AMDGPU lowering pass ----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// This pass lowers unsupported AMDIL MachineInstrs to LLVM pseudo 
+// MachineInstrs for hw codegen targets.
 //
 //===----------------------------------------------------------------------===//
 
@@ -27,7 +28,7 @@ namespace {
   private:
     static char ID;
     TargetMachine &TM;
-    void lowerVCREATE_v4f32(MachineInstr &MI, MachineBasicBlock::iterator I,
+    void lowerVCREATE_v4(MachineInstr &MI, MachineBasicBlock::iterator I,
                               MachineBasicBlock &MBB, MachineFunction &MF);
 
   public:
@@ -56,8 +57,9 @@ bool AMDGPULowerInstructionsPass::runOnMachineFunction(MachineFunction &MF)
 
       switch (MI.getOpcode()) {
       default: continue;
-      case AMDIL::VCREATE_v4f32: lowerVCREATE_v4f32(MI, I, MBB, MF); break;
-
+      case AMDIL::VCREATE_v4f32:
+      case AMDIL::VCREATE_v4i32:
+        lowerVCREATE_v4(MI, I, MBB, MF); break;
       }
       MI.eraseFromParent();
     }
@@ -65,7 +67,7 @@ bool AMDGPULowerInstructionsPass::runOnMachineFunction(MachineFunction &MF)
   return false;
 }
 
-void AMDGPULowerInstructionsPass::lowerVCREATE_v4f32(MachineInstr &MI,
+void AMDGPULowerInstructionsPass::lowerVCREATE_v4(MachineInstr &MI,
     MachineBasicBlock::iterator I, MachineBasicBlock &MBB, MachineFunction &MF)
 {
   MachineRegisterInfo & MRI = MF.getRegInfo();
diff --git a/src/gallium/drivers/radeon/AMDGPULowerShaderInstructions.cpp b/src/gallium/drivers/radeon/AMDGPULowerShaderInstructions.cpp
deleted file mode 100644
index d33055ccb87..00000000000
--- a/src/gallium/drivers/radeon/AMDGPULowerShaderInstructions.cpp
+++ /dev/null
@@ -1,38 +0,0 @@
-//===-- AMDGPULowerShaderInstructions.cpp - TODO: Add brief description -------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// TODO: Add full description
-//
-//===----------------------------------------------------------------------===//
-
-
-#include "AMDGPULowerShaderInstructions.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Target/TargetInstrInfo.h"
-
-using namespace llvm;
-
-void AMDGPULowerShaderInstructionsPass::preloadRegister(MachineFunction * MF,
-    const TargetInstrInfo * TII, unsigned physReg, unsigned virtReg) const
-{
-  if (!MRI->isLiveIn(physReg)) {
-    MRI->addLiveIn(physReg, virtReg);
-    MachineBasicBlock &EntryMBB = MF->front();
-    BuildMI(MF->front(), EntryMBB.begin(), DebugLoc(), TII->get(TargetOpcode::COPY),
-            virtReg)
-            .addReg(physReg);
-  } else {
-    /* We can't mark the same register as preloaded twice, but we still must
-     * associate virtReg with the correct preloaded register. */
-    unsigned newReg = MRI->getLiveInVirtReg(physReg);
-    MRI->replaceRegWith(virtReg, newReg);
-  }
-}
diff --git a/src/gallium/drivers/radeon/AMDGPULowerShaderInstructions.h b/src/gallium/drivers/radeon/AMDGPULowerShaderInstructions.h
deleted file mode 100644
index 5ee77fafe2b..00000000000
--- a/src/gallium/drivers/radeon/AMDGPULowerShaderInstructions.h
+++ /dev/null
@@ -1,40 +0,0 @@
-//===-- AMDGPULowerShaderInstructions.h - TODO: Add brief description -------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// TODO: Add full description
-//
-//===----------------------------------------------------------------------===//
-
-
-#ifndef AMDGPU_LOWER_SHADER_INSTRUCTIONS
-#define AMDGPU_LOWER_SHADER_INSTRUCTIONS
-
-namespace llvm {
-
-class MachineFunction;
-class MachineRegisterInfo;
-class TargetInstrInfo;
-
-class AMDGPULowerShaderInstructionsPass {
-
-  protected:
-    MachineRegisterInfo * MRI;
-    /**
-     * @param physReg The physical register that will be preloaded.
-     * @param virtReg The virtual register that currently holds the
-     *                preloaded value.
-     */
-    void preloadRegister(MachineFunction * MF, const TargetInstrInfo * TII,
-                         unsigned physReg, unsigned virtReg) const;
-};
-
-} // end namespace llvm
-
-
-#endif // AMDGPU_LOWER_SHADER_INSTRUCTIONS
diff --git a/src/gallium/drivers/radeon/AMDGPURegisterInfo.cpp b/src/gallium/drivers/radeon/AMDGPURegisterInfo.cpp
index 162a49116a0..ad48335fd33 100644
--- a/src/gallium/drivers/radeon/AMDGPURegisterInfo.cpp
+++ b/src/gallium/drivers/radeon/AMDGPURegisterInfo.cpp
@@ -1,4 +1,4 @@
-//===-- AMDGPURegisterInfo.cpp - TODO: Add brief description -------===//
+//===-- AMDGPURegisterInfo.cpp - AMDGPU Register Information -------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// Parent TargetRegisterInfo class common to all hw codegen targets.
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/src/gallium/drivers/radeon/AMDGPURegisterInfo.h b/src/gallium/drivers/radeon/AMDGPURegisterInfo.h
index f4492e9795d..d545c06f69e 100644
--- a/src/gallium/drivers/radeon/AMDGPURegisterInfo.h
+++ b/src/gallium/drivers/radeon/AMDGPURegisterInfo.h
@@ -1,4 +1,4 @@
-//===-- AMDGPURegisterInfo.h - TODO: Add brief description -------===//
+//===-- AMDGPURegisterInfo.h - AMDGPURegisterInfo Interface -*- C++ -*-----===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// This file contains the TargetRegisterInfo interface that is implemented
+// by all hw codegen targets.
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/src/gallium/drivers/radeon/AMDGPURegisterInfo.td b/src/gallium/drivers/radeon/AMDGPURegisterInfo.td
index 173d6622569..1707903ae7e 100644
--- a/src/gallium/drivers/radeon/AMDGPURegisterInfo.td
+++ b/src/gallium/drivers/radeon/AMDGPURegisterInfo.td
@@ -1,4 +1,4 @@
-//===-- AMDGPURegisterInfo.td - TODO: Add brief description -------===//
+//===-- AMDGPURegisterInfo.td - AMDGPU register info -------*- tablegen -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// Tablegen register definitions common to all hw codegen targets.
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/src/gallium/drivers/radeon/AMDGPUReorderPreloadInstructions.cpp b/src/gallium/drivers/radeon/AMDGPUReorderPreloadInstructions.cpp
deleted file mode 100644
index c923f19c39f..00000000000
--- a/src/gallium/drivers/radeon/AMDGPUReorderPreloadInstructions.cpp
+++ /dev/null
@@ -1,66 +0,0 @@
-//===-- AMDGPUReorderPreloadInstructions.cpp - TODO: Add brief description -------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// TODO: Add full description
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "AMDIL.h"
-#include "AMDILInstrInfo.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Function.h"
-
-using namespace llvm;
-
-namespace {
-  class AMDGPUReorderPreloadInstructionsPass : public MachineFunctionPass {
-
-  private:
-    static char ID;
-    TargetMachine &TM;
-
-  public:
-    AMDGPUReorderPreloadInstructionsPass(TargetMachine &tm) :
-      MachineFunctionPass(ID), TM(tm) { }
-
-      bool runOnMachineFunction(MachineFunction &MF);
-
-      const char *getPassName() const { return "AMDGPU Reorder Preload Instructions"; }
-    };
-} /* End anonymous namespace */
-
-char AMDGPUReorderPreloadInstructionsPass::ID = 0;
-
-FunctionPass *llvm::createAMDGPUReorderPreloadInstructionsPass(TargetMachine &tm) {
-    return new AMDGPUReorderPreloadInstructionsPass(tm);
-}
-
-/* This pass moves instructions that represent preloaded registers to the
- * start of the program. */
-bool AMDGPUReorderPreloadInstructionsPass::runOnMachineFunction(MachineFunction &MF)
-{
-  const AMDGPUInstrInfo * TII =
-                        static_cast<const AMDGPUInstrInfo*>(TM.getInstrInfo());
-
-  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
-                                                  BB != BB_E; ++BB) {
-    MachineBasicBlock &MBB = *BB;
-    for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I);
-         I != MBB.end(); I = Next, Next = llvm::next(I) ) {
-      MachineInstr &MI = *I;
-      if (TII->isRegPreload(MI)) {
-         MF.front().insert(MF.front().begin(), MI.removeFromParent());
-      }
-    }
-  }
-  return false;
-}
diff --git a/src/gallium/drivers/radeon/AMDGPUTargetMachine.cpp b/src/gallium/drivers/radeon/AMDGPUTargetMachine.cpp
index 313349ce01b..c1c21abc9c1 100644
--- a/src/gallium/drivers/radeon/AMDGPUTargetMachine.cpp
+++ b/src/gallium/drivers/radeon/AMDGPUTargetMachine.cpp
@@ -1,4 +1,4 @@
-//===-- AMDGPUTargetMachine.cpp - TODO: Add brief description -------===//
+//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// The AMDGPU target machine contains all of the hardware specific information
+// needed to emit code for R600 and SI GPUs.
 //
 //===----------------------------------------------------------------------===//
 
@@ -16,7 +17,6 @@
 #include "AMDILTargetMachine.h"
 #include "R600ISelLowering.h"
 #include "R600InstrInfo.h"
-#include "R600KernelParameters.h"
 #include "SIISelLowering.h"
 #include "SIInstrInfo.h"
 #include "llvm/Analysis/Passes.h"
@@ -112,31 +112,28 @@ AMDGPUPassConfig::addPreISel()
 {
   const AMDILSubtarget &ST = TM->getSubtarget<AMDILSubtarget>();
   if (ST.device()->getGeneration() <= AMDILDeviceInfo::HD6XXX) {
-    PM.add(createR600KernelParametersPass(
+    PM->add(createR600KernelParametersPass(
                      getAMDGPUTargetMachine().getTargetData()));
   }
   return false;
 }
 
 bool AMDGPUPassConfig::addInstSelector() {
-  PM.add(createAMDILPeepholeOpt(*TM));
-  PM.add(createAMDILISelDag(getAMDGPUTargetMachine()));
+  PM->add(createAMDILPeepholeOpt(*TM));
+  PM->add(createAMDILISelDag(getAMDGPUTargetMachine()));
   return false;
 }
 
 bool AMDGPUPassConfig::addPreRegAlloc() {
   const AMDILSubtarget &ST = TM->getSubtarget<AMDILSubtarget>();
 
-  PM.add(createAMDGPUReorderPreloadInstructionsPass(*TM));
   if (ST.device()->getGeneration() <= AMDILDeviceInfo::HD6XXX) {
-    PM.add(createR600LowerShaderInstructionsPass(*TM));
-    PM.add(createR600LowerInstructionsPass(*TM));
+    PM->add(createR600LowerInstructionsPass(*TM));
   } else {
-    PM.add(createSILowerShaderInstructionsPass(*TM));
-    PM.add(createSIAssignInterpRegsPass(*TM));
+    PM->add(createSIAssignInterpRegsPass(*TM));
   }
-  PM.add(createAMDGPULowerInstructionsPass(*TM));
-  PM.add(createAMDGPUConvertToISAPass(*TM));
+  PM->add(createAMDGPULowerInstructionsPass(*TM));
+  PM->add(createAMDGPUConvertToISAPass(*TM));
   return false;
 }
 
@@ -150,10 +147,10 @@ bool AMDGPUPassConfig::addPreSched2() {
 
 bool AMDGPUPassConfig::addPreEmitPass() {
   const AMDILSubtarget &ST = TM->getSubtarget<AMDILSubtarget>();
-  PM.add(createAMDILCFGPreparationPass(*TM));
-  PM.add(createAMDILCFGStructurizerPass(*TM));
+  PM->add(createAMDILCFGPreparationPass(*TM));
+  PM->add(createAMDILCFGStructurizerPass(*TM));
   if (ST.device()->getGeneration() == AMDILDeviceInfo::HD7XXX) {
-    PM.add(createSIPropagateImmReadsPass(*TM));
+    PM->add(createSIPropagateImmReadsPass(*TM));
   }
 
   return false;
diff --git a/src/gallium/drivers/radeon/AMDGPUTargetMachine.h b/src/gallium/drivers/radeon/AMDGPUTargetMachine.h
index d4165b09e84..2428fe638a7 100644
--- a/src/gallium/drivers/radeon/AMDGPUTargetMachine.h
+++ b/src/gallium/drivers/radeon/AMDGPUTargetMachine.h
@@ -1,4 +1,4 @@
-//===-- AMDGPUTargetMachine.h - TODO: Add brief description -------===//
+//===-- AMDGPUTargetMachine.h - AMDGPU TargetMachine Interface --*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+//  The AMDGPU TargetMachine interface definition for hw codegen targets.
 //
 //===----------------------------------------------------------------------===//
 
@@ -52,9 +52,6 @@ public:
                                               formatted_raw_ostream &Out,
                                               CodeGenFileType FileType,
                                               bool DisableVerify);
-public:
-   void dumpCode() { mDump = true; }
-   bool shouldDumpCode() const { return mDump; }
 };
 
 } /* End namespace llvm */
diff --git a/src/gallium/drivers/radeon/AMDGPUUtil.cpp b/src/gallium/drivers/radeon/AMDGPUUtil.cpp
index a5045436ab4..bd8f5eef697 100644
--- a/src/gallium/drivers/radeon/AMDGPUUtil.cpp
+++ b/src/gallium/drivers/radeon/AMDGPUUtil.cpp
@@ -1,4 +1,4 @@
-//===-- AMDGPUUtil.cpp - TODO: Add brief description -------===//
+//===-- AMDGPUUtil.cpp - AMDGPU Utility functions -------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,39 +7,39 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// Common utility functions used by hw codegen targets
 //
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPUUtil.h"
 #include "AMDGPURegisterInfo.h"
 #include "AMDIL.h"
+#include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 
 using namespace llvm;
 
-/* Some instructions act as place holders to emulate operations that the GPU
- * hardware does automatically. This function can be used to check if
- * an opcode falls into this category. */
-bool llvm::isPlaceHolderOpcode(unsigned opcode)
+// Some instructions act as place holders to emulate operations that the GPU
+// hardware does automatically. This function can be used to check if
+// an opcode falls into this category.
+bool AMDGPU::isPlaceHolderOpcode(unsigned opcode)
 {
   switch (opcode) {
   default: return false;
-  case AMDIL::EXPORT_REG:
   case AMDIL::RETURN:
   case AMDIL::LOAD_INPUT:
   case AMDIL::LAST:
+  case AMDIL::MASK_WRITE:
   case AMDIL::RESERVE_REG:
     return true;
   }
 }
 
-bool llvm::isTransOp(unsigned opcode)
+bool AMDGPU::isTransOp(unsigned opcode)
 {
   switch(opcode) {
     default: return false;
@@ -67,10 +67,12 @@ bool llvm::isTransOp(unsigned opcode)
   }
 }
 
-bool llvm::isTexOp(unsigned opcode)
+bool AMDGPU::isTexOp(unsigned opcode)
 {
   switch(opcode) {
   default: return false;
+  case AMDIL::TEX_LD:
+  case AMDIL::TEX_GET_TEXTURE_RESINFO:
   case AMDIL::TEX_SAMPLE:
   case AMDIL::TEX_SAMPLE_C:
   case AMDIL::TEX_SAMPLE_L:
@@ -79,11 +81,13 @@ bool llvm::isTexOp(unsigned opcode)
   case AMDIL::TEX_SAMPLE_C_LB:
   case AMDIL::TEX_SAMPLE_G:
   case AMDIL::TEX_SAMPLE_C_G:
+  case AMDIL::TEX_GET_GRADIENTS_H:
+  case AMDIL::TEX_GET_GRADIENTS_V:
     return true;
   }
 }
 
-bool llvm::isReductionOp(unsigned opcode)
+bool AMDGPU::isReductionOp(unsigned opcode)
 {
   switch(opcode) {
     default: return false;
@@ -93,13 +97,25 @@ bool llvm::isReductionOp(unsigned opcode)
   }
 }
 
-bool llvm::isFCOp(unsigned opcode)
+bool AMDGPU::isCubeOp(unsigned opcode)
+{
+  switch(opcode) {
+    default: return false;
+    case AMDIL::CUBE_r600:
+    case AMDIL::CUBE_eg:
+      return true;
+  }
+}
+
+
+bool AMDGPU::isFCOp(unsigned opcode)
 {
   switch(opcode) {
   default: return false;
   case AMDIL::BREAK_LOGICALZ_f32:
   case AMDIL::BREAK_LOGICALNZ_i32:
   case AMDIL::BREAK_LOGICALZ_i32:
+  case AMDIL::BREAK_LOGICALNZ_f32:
   case AMDIL::CONTINUE_LOGICALNZ_f32:
   case AMDIL::IF_LOGICALNZ_i32:
   case AMDIL::IF_LOGICALZ_f32:
@@ -112,11 +128,14 @@ bool llvm::isFCOp(unsigned opcode)
   }
 }
 
-void AMDGPU::utilAddLiveIn(MachineFunction * MF, MachineRegisterInfo & MRI,
-    const struct TargetInstrInfo * TII, unsigned physReg, unsigned virtReg)
+void AMDGPU::utilAddLiveIn(llvm::MachineFunction * MF,
+													 llvm::MachineRegisterInfo & MRI,
+													 const struct llvm::TargetInstrInfo * TII,
+													 unsigned physReg, unsigned virtReg)
 {
     if (!MRI.isLiveIn(physReg)) {
       MRI.addLiveIn(physReg, virtReg);
+      MF->front().addLiveIn(physReg);
       BuildMI(MF->front(), MF->front().begin(), DebugLoc(),
                            TII->get(TargetOpcode::COPY), virtReg)
             .addReg(physReg);
diff --git a/src/gallium/drivers/radeon/AMDGPUUtil.h b/src/gallium/drivers/radeon/AMDGPUUtil.h
index 299146e1ba7..15f2ce57af9 100644
--- a/src/gallium/drivers/radeon/AMDGPUUtil.h
+++ b/src/gallium/drivers/radeon/AMDGPUUtil.h
@@ -1,4 +1,4 @@
-//===-- AMDGPUUtil.h - TODO: Add brief description -------===//
+//===-- AMDGPUUtil.h - AMDGPU Utility function declarations -----*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,43 +7,40 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// Declarations for utility functions common to all hw codegen targets.
 //
 //===----------------------------------------------------------------------===//
 
 #ifndef AMDGPU_UTIL_H
 #define AMDGPU_UTIL_H
 
-#include "AMDGPURegisterInfo.h"
-#include "llvm/Support/DataTypes.h"
-
 namespace llvm {
 
-class AMDILMachineFunctionInfo;
+class MachineFunction;
+class MachineRegisterInfo;
+class TargetInstrInfo;
+
+}
 
-class TargetMachine;
-class TargetRegisterInfo;
+namespace AMDGPU {
 
 bool isPlaceHolderOpcode(unsigned opcode);
 
 bool isTransOp(unsigned opcode);
 bool isTexOp(unsigned opcode);
 bool isReductionOp(unsigned opcode);
+bool isCubeOp(unsigned opcode);
 bool isFCOp(unsigned opcode);
 
-/* XXX: Move these to AMDGPUInstrInfo.h */
+// XXX: Move these to AMDGPUInstrInfo.h
 #define MO_FLAG_CLAMP (1 << 0)
 #define MO_FLAG_NEG   (1 << 1)
 #define MO_FLAG_ABS   (1 << 2)
 #define MO_FLAG_MASK  (1 << 3)
 
-} /* End namespace llvm */
-
-namespace AMDGPU {
-
 void utilAddLiveIn(llvm::MachineFunction * MF, llvm::MachineRegisterInfo & MRI,
     const struct llvm::TargetInstrInfo * TII, unsigned physReg, unsigned virtReg);
 
 } // End namespace AMDGPU
 
-#endif /* AMDGPU_UTIL_H */
+#endif // AMDGPU_UTIL_H
diff --git a/src/gallium/drivers/radeon/AMDIL.h b/src/gallium/drivers/radeon/AMDIL.h
index 317ea124f66..6759ccd9527 100644
--- a/src/gallium/drivers/radeon/AMDIL.h
+++ b/src/gallium/drivers/radeon/AMDIL.h
@@ -137,11 +137,6 @@ enum AddressSpaces {
   LAST_ADDRESS     = 8
 };
 
-// We are piggybacking on the CommentFlag enum in MachineInstr.h to
-// set bits in AsmPrinterFlags of the MachineInstruction. We will
-// start at bit 16 and allocate down while LLVM will start at bit
-// 1 and allocate up.
-
 // This union/struct combination is an easy way to read out the
 // exact bits that are needed.
 typedef union ResourceRec {
@@ -181,26 +176,6 @@ typedef union ResourceRec {
 
 } // namespace AMDILAS
 
-// The OpSwizzle encodes a subset of all possible
-// swizzle combinations into a number of bits using
-// only the combinations utilized by the backend.
-// The lower 128 are for source swizzles and the
-// upper 128 or for destination swizzles.
-// The valid mappings can be found in the
-// getSrcSwizzle and getDstSwizzle functions of
-// AMDILUtilityFunctions.cpp.
-typedef union SwizzleRec {
-  struct {
-#ifdef __BIG_ENDIAN__
-    unsigned char dst : 1;
-    unsigned char swizzle : 7;
-#else
-    unsigned char swizzle : 7;
-    unsigned char dst : 1;
-#endif
-  } bits;
-  unsigned char u8all;
-} OpSwizzle;
 // Enums corresponding to AMDIL condition codes for IL.  These
 // values must be kept in sync with the ones in the .td file.
 namespace AMDILCC {
diff --git a/src/gallium/drivers/radeon/AMDIL.td b/src/gallium/drivers/radeon/AMDIL.td
index 9bcccac2411..deee290fad5 100644
--- a/src/gallium/drivers/radeon/AMDIL.td
+++ b/src/gallium/drivers/radeon/AMDIL.td
@@ -1,4 +1,4 @@
-//===-- AMDIL.td - TODO: Add brief description -------===//
+//===-- AMDIL.td - AMDIL Tablegen files --*- tablegen -*-------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
diff --git a/src/gallium/drivers/radeon/AMDIL7XXDevice.cpp b/src/gallium/drivers/radeon/AMDIL7XXDevice.cpp
index 6625dd77d5f..d7c96573a15 100644
--- a/src/gallium/drivers/radeon/AMDIL7XXDevice.cpp
+++ b/src/gallium/drivers/radeon/AMDIL7XXDevice.cpp
@@ -1,4 +1,4 @@
-//===-- AMDIL7XXDevice.cpp - TODO: Add brief description -------===//
+//===-- AMDIL7XXDevice.cpp - Device Info for 7XX GPUs ---------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
diff --git a/src/gallium/drivers/radeon/AMDILBase.td b/src/gallium/drivers/radeon/AMDILBase.td
index 2706b211f2d..31ebed31d72 100644
--- a/src/gallium/drivers/radeon/AMDILBase.td
+++ b/src/gallium/drivers/radeon/AMDILBase.td
@@ -60,6 +60,11 @@ def FeatureDebug : SubtargetFeature<"debug",
         "CapsOverride[AMDILDeviceInfo::Debug]",
         "true",
         "Debug mode is enabled, so disable hardware accelerated address spaces.">;
+def FeatureDumpCode : SubtargetFeature <"DumpCode", // feature name
+        "mDumpCode", // attribute; presumably an AMDILSubtarget member -- confirm
+        "true", // value given to the attribute when the feature is enabled
+        "Dump MachineInstrs in the CodeEmitter">; // description text
+
 
 //===----------------------------------------------------------------------===//
 // Register File, Calling Conv, Instruction Descriptions
diff --git a/src/gallium/drivers/radeon/AMDILCFGStructurizer.cpp b/src/gallium/drivers/radeon/AMDILCFGStructurizer.cpp
index 289af6f210e..cdcd5e89880 100644
--- a/src/gallium/drivers/radeon/AMDILCFGStructurizer.cpp
+++ b/src/gallium/drivers/radeon/AMDILCFGStructurizer.cpp
@@ -7,22 +7,22 @@
 //
 //==-----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "structcfg"
-#ifdef DEBUG
-#define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE))
-#else
 #define DEBUGME 0
-#endif
+#define DEBUG_TYPE "structcfg"
 
 #include "AMDILTargetMachine.h"
 #include "AMDILUtilityFunctions.h"
 #include "llvm/ADT/SCCIterator.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/DominatorInternals.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionAnalysis.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineJumpTableInfo.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -30,8 +30,6 @@
 #define FirstNonDebugInstr(A) A->begin()
 using namespace llvm;
 
-// bixia TODO: move this out to analysis lib. Make this work for both target
-// AMDIL and CBackend.
 // TODO: move-begin.
 
 //===----------------------------------------------------------------------===//
@@ -109,23 +107,6 @@ void ReverseVector(SmallVector<NodeT *, DEFAULT_VEC_SLOTS> &Src) {
 //
 //===----------------------------------------------------------------------===//
 
-#include "AMDILTargetMachine.h"
-#include "AMDILUtilityFunctions.h"
-#include "llvm/ADT/SCCIterator.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/DominatorInternals.h"
-#include "llvm/Analysis/Dominators.h"
-#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFunctionAnalysis.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineJumpTableInfo.h"
-#include "llvm/CodeGen/MachineLoopInfo.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-
 namespace llvm {
 
 /// PostDominatorTree Class - Concrete subclass of DominatorTree that is used
@@ -3156,10 +3137,6 @@ struct CFGStructTraits<AMDILCFGStructurizer>
          iterEnd = srcBlk->end();
          iter != iterEnd; ++iter) {
       MachineInstr *instr = func->CloneMachineInstr(iter);
-      // This is a workaround for LLVM bugzilla 8420 because CloneMachineInstr
-      // does not clone the AsmPrinterFlags.
-      instr->setAsmPrinterFlag(
-         (llvm::MachineInstr::CommentFlag)iter->getAsmPrinterFlags());
       newBlk->push_back(instr);
     }
     return newBlk;
diff --git a/src/gallium/drivers/radeon/AMDILCodeEmitter.h b/src/gallium/drivers/radeon/AMDILCodeEmitter.h
index b0ea1455cf9..fa46cbd203d 100644
--- a/src/gallium/drivers/radeon/AMDILCodeEmitter.h
+++ b/src/gallium/drivers/radeon/AMDILCodeEmitter.h
@@ -1,23 +1,21 @@
-//                     The LLVM Compiler Infrastructure
+//===-- AMDILCodeEmitter.h - AMDIL Code Emitter interface -----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
-//===-- AMDILCodeEmitter.h - TODO: Add brief description -------===//
-//===-- AMDILCodeEmitter.h - TODO: Add brief description -------===//
-//===-- AMDILCodeEmitter.h - TODO: Add brief description -------===//
+//===----------------------------------------------------------------------===//
 //
-//                     The LLVM Compiler Infrastructure
+// CodeEmitter interface for R600 and SI codegen.
 //
+//===----------------------------------------------------------------------===//
 
 #ifndef AMDILCODEEMITTER_H
 #define AMDILCODEEMITTER_H
 
 namespace llvm {
 
-  /* XXX: Temp HACK to work around tablegen name generation */
   class AMDILCodeEmitter {
   public:
     uint64_t getBinaryCodeForInstr(const MachineInstr &MI) const;
diff --git a/src/gallium/drivers/radeon/AMDILConversions.td b/src/gallium/drivers/radeon/AMDILConversions.td
index 0db66ae8475..1bc5e4ddf37 100644
--- a/src/gallium/drivers/radeon/AMDILConversions.td
+++ b/src/gallium/drivers/radeon/AMDILConversions.td
@@ -1,4 +1,4 @@
-//===-- AMDILConversions.td - TODO: Add brief description -------===//
+//==- AMDILConversions.td - Type conversion tablegen patterns -*-tablegen -*-=//
 //
 //                     The LLVM Compiler Infrastructure
 //
diff --git a/src/gallium/drivers/radeon/AMDILDevice.cpp b/src/gallium/drivers/radeon/AMDILDevice.cpp
index aa6d8af7012..4294a8bef0c 100644
--- a/src/gallium/drivers/radeon/AMDILDevice.cpp
+++ b/src/gallium/drivers/radeon/AMDILDevice.cpp
@@ -1,4 +1,4 @@
-//===-- AMDILDevice.cpp - TODO: Add brief description -------===//
+//===-- AMDILDevice.cpp - Base class for AMDIL Devices --------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
diff --git a/src/gallium/drivers/radeon/AMDILDeviceInfo.cpp b/src/gallium/drivers/radeon/AMDILDeviceInfo.cpp
index 89b8312c294..cbf5b512471 100644
--- a/src/gallium/drivers/radeon/AMDILDeviceInfo.cpp
+++ b/src/gallium/drivers/radeon/AMDILDeviceInfo.cpp
@@ -1,4 +1,4 @@
-//===-- AMDILDeviceInfo.cpp - TODO: Add brief description -------===//
+//===-- AMDILDeviceInfo.cpp - AMDILDeviceInfo class -----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -6,11 +6,16 @@
 // License. See LICENSE.TXT for details.
 //
 //==-----------------------------------------------------------------------===//
+//
+// Function that creates DeviceInfo from a device name and other information.
+//
+//==-----------------------------------------------------------------------===//
 #include "AMDILDevices.h"
 #include "AMDILSubtarget.h"
 
 using namespace llvm;
 namespace llvm {
+namespace AMDILDeviceInfo {
     AMDILDevice*
 getDeviceFromName(const std::string &deviceName, AMDILSubtarget *ptr, bool is64bit, bool is64on32bit)
 {
@@ -84,4 +89,5 @@ getDeviceFromName(const std::string &deviceName, AMDILSubtarget *ptr, bool is64b
         return new AMDIL7XXDevice(ptr);
     }
 }
-}
+} // End namespace AMDILDeviceInfo
+} // End namespace llvm
diff --git a/src/gallium/drivers/radeon/AMDILDeviceInfo.h b/src/gallium/drivers/radeon/AMDILDeviceInfo.h
index c4acf9145ae..06ac4322d0f 100644
--- a/src/gallium/drivers/radeon/AMDILDeviceInfo.h
+++ b/src/gallium/drivers/radeon/AMDILDeviceInfo.h
@@ -1,4 +1,4 @@
-//===-- AMDILDeviceInfo.h - TODO: Add brief description -------===//
+//===-- AMDILDeviceInfo.h - Constants for describing devices --------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -82,8 +82,8 @@ namespace llvm
     };
 
 
+  AMDILDevice*
+    getDeviceFromName(const std::string &name, AMDILSubtarget *ptr, bool is64bit = false, bool is64on32bit = false);
   } // namespace AMDILDeviceInfo
-  llvm::AMDILDevice*
-    getDeviceFromName(const std::string &name, llvm::AMDILSubtarget *ptr, bool is64bit = false, bool is64on32bit = false);
 } // namespace llvm
 #endif // _AMDILDEVICEINFO_H_
diff --git a/src/gallium/drivers/radeon/AMDILDevices.h b/src/gallium/drivers/radeon/AMDILDevices.h
index 3fc5fa05669..cfcc3304b4b 100644
--- a/src/gallium/drivers/radeon/AMDILDevices.h
+++ b/src/gallium/drivers/radeon/AMDILDevices.h
@@ -1,4 +1,4 @@
-//===-- AMDILDevices.h - TODO: Add brief description -------===//
+//===-- AMDILDevices.h - Consolidate AMDIL Device headers -----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
diff --git a/src/gallium/drivers/radeon/AMDILEnumeratedTypes.td b/src/gallium/drivers/radeon/AMDILEnumeratedTypes.td
index 445fd608bbb..f10936b8c6c 100644
--- a/src/gallium/drivers/radeon/AMDILEnumeratedTypes.td
+++ b/src/gallium/drivers/radeon/AMDILEnumeratedTypes.td
@@ -1,4 +1,4 @@
-//===-- AMDILEnumeratedTypes.td - TODO: Add brief description -------===//
+//===-- AMDILEnumeratedTypes.td - IL Type definitions --*- tablegen -*-----===//
 //
 //                     The LLVM Compiler Infrastructure
 //
diff --git a/src/gallium/drivers/radeon/AMDILEvergreenDevice.cpp b/src/gallium/drivers/radeon/AMDILEvergreenDevice.cpp
index 7b5c52345d2..779b2d3df2f 100644
--- a/src/gallium/drivers/radeon/AMDILEvergreenDevice.cpp
+++ b/src/gallium/drivers/radeon/AMDILEvergreenDevice.cpp
@@ -1,4 +1,4 @@
-//===-- AMDILEvergreenDevice.cpp - TODO: Add brief description -------===//
+//===-- AMDILEvergreenDevice.cpp - Device Info for Evergreen --------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
diff --git a/src/gallium/drivers/radeon/AMDILISelDAGToDAG.cpp b/src/gallium/drivers/radeon/AMDILISelDAGToDAG.cpp
index ff04d9d55bf..b8898828dd6 100644
--- a/src/gallium/drivers/radeon/AMDILISelDAGToDAG.cpp
+++ b/src/gallium/drivers/radeon/AMDILISelDAGToDAG.cpp
@@ -13,9 +13,12 @@
 #include "AMDILDevices.h"
 #include "AMDILTargetMachine.h"
 #include "AMDILUtilityFunctions.h"
+#include "llvm/ADT/ValueMap.h"
 #include "llvm/CodeGen/PseudoSourceValue.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
 #include "llvm/Support/Compiler.h"
+#include <list>
+#include <queue>
 
 using namespace llvm;
 
@@ -35,13 +38,21 @@ class AMDILDAGToDAGISel : public SelectionDAGISel {
 public:
   AMDILDAGToDAGISel(AMDILTargetMachine &TM AMDIL_OPT_LEVEL_DECL);
   virtual ~AMDILDAGToDAGISel();
-  inline SDValue getSmallIPtrImm(unsigned Imm);
 
   SDNode *Select(SDNode *N);
+  virtual const char *getPassName() const;
+
+private:
+  inline SDValue getSmallIPtrImm(unsigned Imm);
+
   // Complex pattern selectors
   bool SelectADDRParam(SDValue Addr, SDValue& R1, SDValue& R2);
   bool SelectADDR(SDValue N, SDValue &R1, SDValue &R2);
   bool SelectADDR64(SDValue N, SDValue &R1, SDValue &R2);
+
+  static bool checkType(const Value *ptr, unsigned int addrspace);
+  static const Value *getBasePointerValue(const Value *V);
+
   static bool isGlobalStore(const StoreSDNode *N);
   static bool isPrivateStore(const StoreSDNode *N);
   static bool isLocalStore(const StoreSDNode *N);
@@ -54,8 +65,6 @@ public:
   static bool isLocalLoad(const LoadSDNode *N);
   static bool isRegionLoad(const LoadSDNode *N);
 
-  virtual const char *getPassName() const;
-private:
   SDNode *xformAtomicInst(SDNode *N);
 
   // Include the pieces autogenerated from the target description.
@@ -165,26 +174,75 @@ SDNode *AMDILDAGToDAGISel::Select(SDNode *N) {
   return SelectCode(N);
 }
 
+/// Return true iff \p ptr is non-null and is a pointer into \p addrspace.
+bool AMDILDAGToDAGISel::checkType(const Value *ptr, unsigned int addrspace) {
+  // dyn_cast<> yields NULL for a non-pointer type, so test before using it.
+  const PointerType *ptrType =
+      ptr ? dyn_cast<PointerType>(ptr->getType()) : NULL;
+  return ptrType && ptrType->getAddressSpace() == addrspace;
+}
+
+/// Breadth-first search starting at \p V, following every operand of an
+/// Instruction and operand 0 of a ConstantExpr. The search returns the first
+/// value it visits that is an Argument of pointer type, a GlobalVariable or
+/// an AllocaInst. Returns NULL when \p V is NULL or when the search runs out
+/// of values without finding one.
+const Value * AMDILDAGToDAGISel::getBasePointerValue(const Value *V)
+{
+  if (!V) {
+    return NULL;
+  }
+  ValueMap<const Value *, bool> Visited;
+  std::queue<const Value *, std::list<const Value *> > WorkList;
+  WorkList.push(V);
+  while (!WorkList.empty()) {
+    const Value *Cur = WorkList.front();
+    WorkList.pop();
+    if (Visited.find(Cur) != Visited.end()) {
+      continue;
+    }
+    Visited[Cur] = true;
+    // Terminal values; only the Argument case also requires a pointer type.
+    if ((isa<Argument>(Cur) && isa<PointerType>(Cur->getType()))
+        || isa<GlobalVariable>(Cur) || isa<AllocaInst>(Cur)) {
+      return Cur;
+    }
+    if (isa<Constant>(Cur)) {
+      // Of the remaining constants only ConstantExprs are looked through.
+      if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(Cur)) {
+        WorkList.push(CE->getOperand(0));
+      }
+    } else if (const Instruction *I = dyn_cast<Instruction>(Cur)) {
+      const uint32_t numOps = I->getNumOperands();
+      for (uint32_t x = 0; x < numOps; ++x) {
+        WorkList.push(I->getOperand(x));
+      }
+    }
+    // Any other kind of Value contributes nothing to the search.
+  }
+  return NULL;
+}
+
 bool AMDILDAGToDAGISel::isGlobalStore(const StoreSDNode *N) {
-  return check_type(N->getSrcValue(), AMDILAS::GLOBAL_ADDRESS);
+  return checkType(N->getSrcValue(), AMDILAS::GLOBAL_ADDRESS);
 }
 
 bool AMDILDAGToDAGISel::isPrivateStore(const StoreSDNode *N) {
-  return (!check_type(N->getSrcValue(), AMDILAS::LOCAL_ADDRESS)
-          && !check_type(N->getSrcValue(), AMDILAS::GLOBAL_ADDRESS)
-          && !check_type(N->getSrcValue(), AMDILAS::REGION_ADDRESS));
+  return (!checkType(N->getSrcValue(), AMDILAS::LOCAL_ADDRESS)
+          && !checkType(N->getSrcValue(), AMDILAS::GLOBAL_ADDRESS)
+          && !checkType(N->getSrcValue(), AMDILAS::REGION_ADDRESS));
 }
 
 bool AMDILDAGToDAGISel::isLocalStore(const StoreSDNode *N) {
-  return check_type(N->getSrcValue(), AMDILAS::LOCAL_ADDRESS);
+  return checkType(N->getSrcValue(), AMDILAS::LOCAL_ADDRESS);
 }
 
 bool AMDILDAGToDAGISel::isRegionStore(const StoreSDNode *N) {
-  return check_type(N->getSrcValue(), AMDILAS::REGION_ADDRESS);
+  return checkType(N->getSrcValue(), AMDILAS::REGION_ADDRESS);
 }
 
 bool AMDILDAGToDAGISel::isConstantLoad(const LoadSDNode *N, int cbID) {
-  if (check_type(N->getSrcValue(), AMDILAS::CONSTANT_ADDRESS)) {
+  if (checkType(N->getSrcValue(), AMDILAS::CONSTANT_ADDRESS)) {
     return true;
   }
   MachineMemOperand *MMO = N->getMemOperand();
@@ -195,27 +253,27 @@ bool AMDILDAGToDAGISel::isConstantLoad(const LoadSDNode *N, int cbID) {
       && ((V && dyn_cast<GlobalValue>(V))
           || (BV && dyn_cast<GlobalValue>(
                         getBasePointerValue(MMO->getValue()))))) {
-    return check_type(N->getSrcValue(), AMDILAS::PRIVATE_ADDRESS);
+    return checkType(N->getSrcValue(), AMDILAS::PRIVATE_ADDRESS);
   } else {
     return false;
   }
 }
 
 bool AMDILDAGToDAGISel::isGlobalLoad(const LoadSDNode *N) {
-  return check_type(N->getSrcValue(), AMDILAS::GLOBAL_ADDRESS);
+  return checkType(N->getSrcValue(), AMDILAS::GLOBAL_ADDRESS);
 }
 
 bool AMDILDAGToDAGISel::isLocalLoad(const  LoadSDNode *N) {
-  return check_type(N->getSrcValue(), AMDILAS::LOCAL_ADDRESS);
+  return checkType(N->getSrcValue(), AMDILAS::LOCAL_ADDRESS);
 }
 
 bool AMDILDAGToDAGISel::isRegionLoad(const  LoadSDNode *N) {
-  return check_type(N->getSrcValue(), AMDILAS::REGION_ADDRESS);
+  return checkType(N->getSrcValue(), AMDILAS::REGION_ADDRESS);
 }
 
 bool AMDILDAGToDAGISel::isCPLoad(const LoadSDNode *N) {
   MachineMemOperand *MMO = N->getMemOperand();
-  if (check_type(N->getSrcValue(), AMDILAS::PRIVATE_ADDRESS)) {
+  if (checkType(N->getSrcValue(), AMDILAS::PRIVATE_ADDRESS)) {
     if (MMO) {
       const Value *V = MMO->getValue();
       const PseudoSourceValue *PSV = dyn_cast<PseudoSourceValue>(V);
@@ -228,19 +286,19 @@ bool AMDILDAGToDAGISel::isCPLoad(const LoadSDNode *N) {
 }
 
 bool AMDILDAGToDAGISel::isPrivateLoad(const LoadSDNode *N) {
-  if (check_type(N->getSrcValue(), AMDILAS::PRIVATE_ADDRESS)) {
+  if (checkType(N->getSrcValue(), AMDILAS::PRIVATE_ADDRESS)) {
     // Check to make sure we are not a constant pool load or a constant load
     // that is marked as a private load
     if (isCPLoad(N) || isConstantLoad(N, -1)) {
       return false;
     }
   }
-  if (!check_type(N->getSrcValue(), AMDILAS::LOCAL_ADDRESS)
-      && !check_type(N->getSrcValue(), AMDILAS::GLOBAL_ADDRESS)
-      && !check_type(N->getSrcValue(), AMDILAS::REGION_ADDRESS)
-      && !check_type(N->getSrcValue(), AMDILAS::CONSTANT_ADDRESS)
-      && !check_type(N->getSrcValue(), AMDILAS::PARAM_D_ADDRESS)
-      && !check_type(N->getSrcValue(), AMDILAS::PARAM_I_ADDRESS))
+  if (!checkType(N->getSrcValue(), AMDILAS::LOCAL_ADDRESS)
+      && !checkType(N->getSrcValue(), AMDILAS::GLOBAL_ADDRESS)
+      && !checkType(N->getSrcValue(), AMDILAS::REGION_ADDRESS)
+      && !checkType(N->getSrcValue(), AMDILAS::CONSTANT_ADDRESS)
+      && !checkType(N->getSrcValue(), AMDILAS::PARAM_D_ADDRESS)
+      && !checkType(N->getSrcValue(), AMDILAS::PARAM_I_ADDRESS))
   {
     return true;
   }
diff --git a/src/gallium/drivers/radeon/AMDILISelLowering.cpp b/src/gallium/drivers/radeon/AMDILISelLowering.cpp
index 54c6ea65065..19b12fcf72b 100644
--- a/src/gallium/drivers/radeon/AMDILISelLowering.cpp
+++ b/src/gallium/drivers/radeon/AMDILISelLowering.cpp
@@ -623,6 +623,48 @@ translateToOpcode(uint64_t CCCode, unsigned int regClass)
   assert(0 && "Unknown opcode retrieved");
   return 0;
 }
+
+/// Helper function used by LowerFormalArguments; maps an MVT type to its GPR class.
+static const TargetRegisterClass*
+getRegClassFromType(unsigned int type) {
+  switch (type) {
+  default:
+    assert(0 && "Passed in type does not match any register classes."); // NOTE: with asserts compiled out (NDEBUG) this falls through to the i8 case.
+  case MVT::i8:
+    return &AMDIL::GPRI8RegClass;
+  case MVT::i16:
+    return &AMDIL::GPRI16RegClass;
+  case MVT::i32:
+    return &AMDIL::GPRI32RegClass;
+  case MVT::f32:
+    return &AMDIL::GPRF32RegClass;
+  case MVT::i64:
+    return &AMDIL::GPRI64RegClass;
+  case MVT::f64:
+    return &AMDIL::GPRF64RegClass;
+  case MVT::v4f32:
+    return &AMDIL::GPRV4F32RegClass;
+  case MVT::v4i8:
+    return &AMDIL::GPRV4I8RegClass;
+  case MVT::v4i16:
+    return &AMDIL::GPRV4I16RegClass;
+  case MVT::v4i32:
+    return &AMDIL::GPRV4I32RegClass;
+  case MVT::v2f32:
+    return &AMDIL::GPRV2F32RegClass;
+  case MVT::v2i8:
+    return &AMDIL::GPRV2I8RegClass;
+  case MVT::v2i16:
+    return &AMDIL::GPRV2I16RegClass;
+  case MVT::v2i32:
+    return &AMDIL::GPRV2I32RegClass;
+  case MVT::v2f64:
+    return &AMDIL::GPRV2F64RegClass;
+  case MVT::v2i64:
+    return &AMDIL::GPRV2I64RegClass;
+  }
+}
+
 SDValue
 AMDILTargetLowering::LowerMemArgument(
     SDValue Chain,
@@ -2189,6 +2231,7 @@ AMDILTargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const
   SDValue Result = DAG.getTargetExternalSymbol(Sym, MVT::i32);
   return Result;
 }
+
 /// LowerFORMAL_ARGUMENTS - transform physical registers into
 /// virtual registers and generate load operations for
 /// arguments places on the stack.
@@ -3191,7 +3234,7 @@ AMDILTargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const
     amdtm = reinterpret_cast<const AMDILTargetMachine*>
     (&this->getTargetMachine());
   const AMDILSubtarget*
-    stm = dynamic_cast<const AMDILSubtarget*>(
+    stm = static_cast<const AMDILSubtarget*>(
         amdtm->getSubtargetImpl());
   if (RST == MVT::f64 && RHSVT.isVector()
       && stm->device()->getGeneration() > AMDILDeviceInfo::HD6XXX)  {
@@ -3248,7 +3291,7 @@ AMDILTargetLowering::LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const
     amdtm = reinterpret_cast<const AMDILTargetMachine*>
     (&this->getTargetMachine());
   const AMDILSubtarget*
-    stm = dynamic_cast<const AMDILSubtarget*>(
+    stm = static_cast<const AMDILSubtarget*>(
         amdtm->getSubtargetImpl());
   if (RST == MVT::f64 && RHSVT.isVector()
       && stm->device()->getGeneration() > AMDILDeviceInfo::HD6XXX)  {
@@ -3314,7 +3357,7 @@ AMDILTargetLowering::genu32tof64(SDValue RHS, EVT LHSVT,
     amdtm = reinterpret_cast<const AMDILTargetMachine*>
     (&this->getTargetMachine());
   const AMDILSubtarget*
-    stm = dynamic_cast<const AMDILSubtarget*>(
+    stm = static_cast<const AMDILSubtarget*>(
         amdtm->getSubtargetImpl());
   if (stm->calVersion() >= CAL_VERSION_SC_135) {
     // unsigned x = RHS;
@@ -3489,7 +3532,7 @@ AMDILTargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
     amdtm = reinterpret_cast<const AMDILTargetMachine*>
     (&this->getTargetMachine());
   const AMDILSubtarget*
-    stm = dynamic_cast<const AMDILSubtarget*>(
+    stm = static_cast<const AMDILSubtarget*>(
         amdtm->getSubtargetImpl());
   if (LST == MVT::f64 && LHSVT.isVector()
       && stm->device()->getGeneration() > AMDILDeviceInfo::HD6XXX)  {
@@ -3543,7 +3586,7 @@ AMDILTargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
     amdtm = reinterpret_cast<const AMDILTargetMachine*>
     (&this->getTargetMachine());
   const AMDILSubtarget*
-    stm = dynamic_cast<const AMDILSubtarget*>(
+    stm = static_cast<const AMDILSubtarget*>(
         amdtm->getSubtargetImpl());
   if (LST == MVT::f64 && LHSVT.isVector()
       && stm->device()->getGeneration() > AMDILDeviceInfo::HD6XXX)  {
@@ -3843,7 +3886,6 @@ SDValue
 AMDILTargetLowering::LowerBUILD_VECTOR( SDValue Op, SelectionDAG &DAG ) const
 {
   EVT VT = Op.getValueType();
-  //printSDValue(Op, 1);
   SDValue Nodes1;
   SDValue second;
   SDValue third;
@@ -3965,7 +4007,6 @@ AMDILTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
     SelectionDAG &DAG) const
 {
   EVT VT = Op.getValueType();
-  //printSDValue(Op, 1);
   const ConstantSDNode *CSDN = dyn_cast<ConstantSDNode>(Op.getOperand(1));
   uint64_t swizzleNum = 0;
   DebugLoc DL = Op.getDebugLoc();
@@ -4782,7 +4823,7 @@ uint32_t
 AMDILTargetLowering::genVReg(uint32_t regType) const
 {
   return mBB->getParent()->getRegInfo().createVirtualRegister(
-      getRegClassFromID(regType));
+      getTargetMachine().getRegisterInfo()->getRegClass(regType));
 }
 
 MachineInstrBuilder
diff --git a/src/gallium/drivers/radeon/AMDILInstrInfo.cpp b/src/gallium/drivers/radeon/AMDILInstrInfo.cpp
index fbc3e45b357..cd2fb48209c 100644
--- a/src/gallium/drivers/radeon/AMDILInstrInfo.cpp
+++ b/src/gallium/drivers/radeon/AMDILInstrInfo.cpp
@@ -10,13 +10,10 @@
 // This file contains the AMDIL implementation of the TargetInstrInfo class.
 //
 //===----------------------------------------------------------------------===//
-#include "AMDILInstrInfo.h"
-#include "AMDILUtilityFunctions.h"
-
-#define GET_INSTRINFO_CTOR
-#include "AMDILGenInstrInfo.inc"
 
 #include "AMDILInstrInfo.h"
+#include "AMDIL.h"
+#include "AMDILISelLowering.h"
 #include "AMDILUtilityFunctions.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -24,6 +21,9 @@
 #include "llvm/CodeGen/PseudoSourceValue.h"
 #include "llvm/Instructions.h"
 
+#define GET_INSTRINFO_CTOR
+#include "AMDILGenInstrInfo.inc"
+
 using namespace llvm;
 
 AMDILInstrInfo::AMDILInstrInfo(AMDILTargetMachine &tm)
@@ -36,28 +36,6 @@ const AMDILRegisterInfo &AMDILInstrInfo::getRegisterInfo() const {
   return RI;
 }
 
-/// Return true if the instruction is a register to register move and leave the
-/// source and dest operands in the passed parameters.
-bool AMDILInstrInfo::isMoveInstr(const MachineInstr &MI, unsigned int &SrcReg,
-                                 unsigned int &DstReg, unsigned int &SrcSubIdx,
-                                 unsigned int &DstSubIdx) const {
-  // FIXME: we should look for:
-  //    add with 0
-  //assert(0 && "is Move Instruction has not been implemented yet!");
-  //return true;
-  if (!isMove(MI.getOpcode())) {
-    return false;
-  }
-  if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg()) {
-    return false;
-  }
-  SrcReg = MI.getOperand(1).getReg();
-  DstReg = MI.getOperand(0).getReg();
-  DstSubIdx = 0;
-  SrcSubIdx = 0;
-  return true;
-}
-
 bool AMDILInstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
                                            unsigned &SrcReg, unsigned &DstReg,
                                            unsigned &SubIdx) const {
@@ -99,22 +77,7 @@ bool AMDILInstrInfo::hasStoreFromStackSlot(const MachineInstr *MI,
 // TODO: Implement this function
   return false;
 }
-#if 0
-void
-AMDILInstrInfo::reMaterialize(MachineBasicBlock &MBB,
-                              MachineBasicBlock::iterator MI,
-                              unsigned DestReg, unsigned SubIdx,
-                             const MachineInstr *Orig,
-                             const TargetRegisterInfo *TRI) const {
-// TODO: Implement this function
-}
 
-MachineInst AMDILInstrInfo::duplicate(MachineInstr *Orig,
-                                      MachineFunction &MF) const {
-// TODO: Implement this function
-  return NULL;
-}
-#endif
 MachineInstr *
 AMDILInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
                                       MachineBasicBlock::iterator &MBBI,
@@ -122,25 +85,6 @@ AMDILInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
 // TODO: Implement this function
   return NULL;
 }
-#if 0
-MachineInst AMDILInstrInfo::commuteInstruction(MachineInstr *MI,
-                                               bool NewMI = false) const {
-// TODO: Implement this function
-  return NULL;
-}
-bool
-AMDILInstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,
-                                     unsigned &SrcOpIdx2) const
-{
-// TODO: Implement this function
-}
-bool
-AMDILInstrInfo::produceSameValue(const MachineInstr *MI0,
-                                const MachineInstr *MI1) const
-{
-// TODO: Implement this function
-}
-#endif
 bool AMDILInstrInfo::getNextBranchInstr(MachineBasicBlock::iterator &iter,
                                         MachineBasicBlock &MBB) const {
   while (iter != MBB.end()) {
@@ -299,43 +243,6 @@ MachineBasicBlock::iterator skipFlowControl(MachineBasicBlock *MBB) {
   return MBB->end();
 }
 
-bool
-AMDILInstrInfo::copyRegToReg(MachineBasicBlock &MBB,
-                             MachineBasicBlock::iterator I,
-                             unsigned DestReg, unsigned SrcReg,
-                             const TargetRegisterClass *DestRC,
-                             const TargetRegisterClass *SrcRC,
-                             DebugLoc DL) const {
-  // If we are adding to the end of a basic block we can safely assume that the
-  // move is caused by a PHI node since all move instructions that are non-PHI
-  // have already been inserted into the basic blocks Therefor we call the skip
-  // flow control instruction to move the iterator before the flow control
-  // instructions and put the move instruction there.
-  bool phi = (DestReg < 1025) || (SrcReg < 1025);
-  int movInst = phi ? getMoveInstFromID(DestRC->getID())
-                    : getPHIMoveInstFromID(DestRC->getID());
-  
-  MachineBasicBlock::iterator iTemp = (I == MBB.end()) ? skipFlowControl(&MBB)
-                                                       : I;
-  if (DestRC != SrcRC) {
-    //int convInst;
-    size_t dSize = DestRC->getSize();
-    size_t sSize = SrcRC->getSize();
-    if (dSize > sSize) {
-      // Elements are going to get duplicated.
-      BuildMI(MBB, iTemp, DL, get(movInst), DestReg).addReg(SrcReg);
-    } else if (dSize == sSize) {
-      // Direct copy, conversions are not handled.
-      BuildMI(MBB, iTemp, DL, get(movInst), DestReg).addReg(SrcReg);
-    } else if (dSize < sSize) {
-      // Elements are going to get dropped.
-      BuildMI(MBB, iTemp, DL, get(movInst), DestReg).addReg(SrcReg);
-    }
-  } else {
-    BuildMI( MBB, iTemp, DL, get(movInst), DestReg).addReg(SrcReg);
-  }
-  return true;
-}
 void
 AMDILInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator MI, DebugLoc DL,
@@ -427,15 +334,11 @@ AMDILInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
   if (MI != MBB.end()) {
     DL = MI->getDebugLoc();
   }
-  MachineInstr *nMI = BuildMI(MBB, MI, DL, get(Opc))
+  BuildMI(MBB, MI, DL, get(Opc))
     .addReg(SrcReg, getKillRegState(isKill))
     .addFrameIndex(FrameIndex)
     .addMemOperand(MMO)
     .addImm(0);
-  AMDILAS::InstrResEnc curRes;
-  curRes.bits.ResourceID 
-    = TM.getSubtargetImpl()->device()->getResourceID(AMDILDevice::SCRATCH_ID);
-  setAsmPrinterFlags(nMI, curRes);
 }
 
 void
@@ -511,16 +414,11 @@ AMDILInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
   if (MI != MBB.end()) {
     DL = MI->getDebugLoc();
   }
-  MachineInstr* nMI = BuildMI(MBB, MI, DL, get(Opc))
+  BuildMI(MBB, MI, DL, get(Opc))
     .addReg(DestReg, RegState::Define)
     .addFrameIndex(FrameIndex)
     .addMemOperand(MMO)
     .addImm(0);
-  AMDILAS::InstrResEnc curRes;
-  curRes.bits.ResourceID 
-    = TM.getSubtargetImpl()->device()->getResourceID(AMDILDevice::SCRATCH_ID);
-  setAsmPrinterFlags(nMI, curRes);
-
 }
 MachineInstr *
 AMDILInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
@@ -569,65 +467,6 @@ AMDILInstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc,
   return 0;
 }
 
-bool
-AMDILInstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
-                                        int64_t &Offset1,
-                                        int64_t &Offset2) const {
-  return false;
-  if (!Load1->isMachineOpcode() || !Load2->isMachineOpcode()) {
-    return false;
-  }
-  const MachineSDNode *mload1 = dyn_cast<MachineSDNode>(Load1);
-  const MachineSDNode *mload2 = dyn_cast<MachineSDNode>(Load2);
-  if (!mload1 || !mload2) {
-    return false;
-  }
-  if (mload1->memoperands_empty() ||
-      mload2->memoperands_empty()) {
-    return false;
-  }
-  MachineMemOperand *memOp1 = (*mload1->memoperands_begin());
-  MachineMemOperand *memOp2 = (*mload2->memoperands_begin());
-  const Value *mv1 = memOp1->getValue();
-  const Value *mv2 = memOp2->getValue();
-  if (!memOp1->isLoad() || !memOp2->isLoad()) {
-    return false;
-  }
-  if (getBasePointerValue(mv1) == getBasePointerValue(mv2)) {
-    if (isa<GetElementPtrInst>(mv1) && isa<GetElementPtrInst>(mv2)) {
-      const GetElementPtrInst *gep1 = dyn_cast<GetElementPtrInst>(mv1);
-      const GetElementPtrInst *gep2 = dyn_cast<GetElementPtrInst>(mv2);
-      if (!gep1 || !gep2) {
-        return false;
-      }
-      if (gep1->getNumOperands() != gep2->getNumOperands()) {
-        return false;
-      }
-      for (unsigned i = 0, e = gep1->getNumOperands() - 1; i < e; ++i) {
-        const Value *op1 = gep1->getOperand(i);
-        const Value *op2 = gep2->getOperand(i);
-        if (op1 != op2) {
-          // If any value except the last one is different, return false.
-          return false;
-        }
-      }
-      unsigned size = gep1->getNumOperands()-1;
-      if (!isa<ConstantInt>(gep1->getOperand(size))
-          || !isa<ConstantInt>(gep2->getOperand(size))) {
-        return false;
-      }
-      Offset1 = dyn_cast<ConstantInt>(gep1->getOperand(size))->getSExtValue();
-      Offset2 = dyn_cast<ConstantInt>(gep2->getOperand(size))->getSExtValue();
-      return true;
-    } else if (isa<Argument>(mv1) && isa<Argument>(mv2)) {
-      return false;
-    } else if (isa<GlobalValue>(mv1) && isa<GlobalValue>(mv2)) {
-      return false;
-    }
-  }
-  return false;
-}
-
 bool AMDILInstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
                                              int64_t Offset1, int64_t Offset2,
                                              unsigned NumLoads) const {
@@ -654,16 +493,6 @@ bool AMDILInstrInfo::isPredicated(const MachineInstr *MI) const {
   // TODO: Implement this function
   return false;
 }
-#if 0
-bool AMDILInstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const {
-  // TODO: Implement this function
-}
-
-bool AMDILInstrInfo::PredicateInstruction(MachineInstr *MI,
-        const SmallVectorImpl<MachineOperand> &Pred) const {
-    // TODO: Implement this function
-}
-#endif
 bool
 AMDILInstrInfo::SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
                                   const SmallVectorImpl<MachineOperand> &Pred2)
@@ -689,21 +518,112 @@ AMDILInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
   return true;
 }
 
-unsigned AMDILInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
-  // TODO: Implement this function
-  return 0;
+bool AMDILInstrInfo::isLoadInst(MachineInstr *MI) const {
+  if (strstr(getName(MI->getOpcode()), "LOADCONST")) {
+    return false;
+  }
+  return strstr(getName(MI->getOpcode()), "LOAD");
 }
 
-#if 0
-unsigned
-AMDILInstrInfo::GetFunctionSizeInBytes(const MachineFunction &MF) const {
-  // TODO: Implement this function
-  return 0;
+bool AMDILInstrInfo::isSWSExtLoadInst(MachineInstr *MI) const
+{
+switch (MI->getOpcode()) {
+    default:
+      break;
+      ExpandCaseToByteShortTypes(AMDIL::LOCALLOAD);
+      ExpandCaseToByteShortTypes(AMDIL::GLOBALLOAD);
+      ExpandCaseToByteShortTypes(AMDIL::REGIONLOAD);
+      ExpandCaseToByteShortTypes(AMDIL::PRIVATELOAD);
+      ExpandCaseToByteShortTypes(AMDIL::CPOOLLOAD);
+      ExpandCaseToByteShortTypes(AMDIL::CONSTANTLOAD);
+      return true;
+  };
+  return false;
 }
 
-unsigned AMDILInstrInfo::getInlineAsmLength(const char *Str,
-                                            const MCAsmInfo &MAI) const {
-  // TODO: Implement this function
-  return 0;
+bool AMDILInstrInfo::isExtLoadInst(MachineInstr *MI) const {
+  return strstr(getName(MI->getOpcode()), "EXTLOAD");
+}
+
+bool AMDILInstrInfo::isSExtLoadInst(MachineInstr *MI) const {
+  return strstr(getName(MI->getOpcode()), "SEXTLOAD");
+}
+
+bool AMDILInstrInfo::isAExtLoadInst(MachineInstr *MI) const {
+  return strstr(getName(MI->getOpcode()), "AEXTLOAD");
+}
+
+bool AMDILInstrInfo::isZExtLoadInst(MachineInstr *MI) const {
+  return strstr(getName(MI->getOpcode()), "ZEXTLOAD");
+}
+
+bool AMDILInstrInfo::isStoreInst(MachineInstr *MI) const {
+  return strstr(getName(MI->getOpcode()), "STORE");
+}
+
+bool AMDILInstrInfo::isTruncStoreInst(MachineInstr *MI) const {
+  return strstr(getName(MI->getOpcode()), "TRUNCSTORE");
+}
+
+bool AMDILInstrInfo::isAtomicInst(MachineInstr *MI) const {
+  return strstr(getName(MI->getOpcode()), "ATOM");
+}
+
+bool AMDILInstrInfo::isVolatileInst(MachineInstr *MI) const {
+  if (!MI->memoperands_empty()) {
+    for (MachineInstr::mmo_iterator mob = MI->memoperands_begin(),
+        moe = MI->memoperands_end(); mob != moe; ++mob) {
+      // If there is a volatile mem operand, this is a volatile instruction.
+      if ((*mob)->isVolatile()) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+bool AMDILInstrInfo::isGlobalInst(llvm::MachineInstr *MI) const
+{
+  return strstr(getName(MI->getOpcode()), "GLOBAL");
+}
+bool AMDILInstrInfo::isPrivateInst(llvm::MachineInstr *MI) const
+{
+  return strstr(getName(MI->getOpcode()), "PRIVATE");
+}
+bool AMDILInstrInfo::isConstantInst(llvm::MachineInstr *MI) const
+{
+  return strstr(getName(MI->getOpcode()), "CONSTANT")
+    || strstr(getName(MI->getOpcode()), "CPOOL");
+}
+bool AMDILInstrInfo::isRegionInst(llvm::MachineInstr *MI) const
+{
+  return strstr(getName(MI->getOpcode()), "REGION");
+}
+bool AMDILInstrInfo::isLocalInst(llvm::MachineInstr *MI) const
+{
+  return strstr(getName(MI->getOpcode()), "LOCAL");
+}
+bool AMDILInstrInfo::isImageInst(llvm::MachineInstr *MI) const
+{
+  return strstr(getName(MI->getOpcode()), "IMAGE");
+}
+bool AMDILInstrInfo::isAppendInst(llvm::MachineInstr *MI) const
+{
+  return strstr(getName(MI->getOpcode()), "APPEND");
+}
+bool AMDILInstrInfo::isRegionAtomic(llvm::MachineInstr *MI) const
+{
+  return strstr(getName(MI->getOpcode()), "ATOM_R");
+}
+bool AMDILInstrInfo::isLocalAtomic(llvm::MachineInstr *MI) const
+{
+  return strstr(getName(MI->getOpcode()), "ATOM_L");
+}
+bool AMDILInstrInfo::isGlobalAtomic(llvm::MachineInstr *MI) const
+{
+  return strstr(getName(MI->getOpcode()), "ATOM_G")
+    || isArenaAtomic(MI);
+}
+bool AMDILInstrInfo::isArenaAtomic(llvm::MachineInstr *MI) const
+{
+  return strstr(getName(MI->getOpcode()), "ATOM_A");
 }
-#endif
diff --git a/src/gallium/drivers/radeon/AMDILInstrInfo.h b/src/gallium/drivers/radeon/AMDILInstrInfo.h
index 88dd4e9441a..4121246e6f9 100644
--- a/src/gallium/drivers/radeon/AMDILInstrInfo.h
+++ b/src/gallium/drivers/radeon/AMDILInstrInfo.h
@@ -40,12 +40,6 @@ public:
   // always be able to get register info as well (through this method).
   const AMDILRegisterInfo &getRegisterInfo() const;
 
-  // Return true if the instruction is a register to register move and leave the
-  // source and dest operands in the passed parameters.
-  bool isMoveInstr(const MachineInstr &MI, unsigned int &SrcReg,
-                   unsigned int &DstReg, unsigned int &SrcSubIdx,
-                   unsigned int &DstSubIdx) const;
-
   bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg,
                              unsigned &DstReg, unsigned &SubIdx) const;
 
@@ -62,29 +56,10 @@ public:
                              const MachineMemOperand *&MMO,
                              int &FrameIndex) const;
 
-
-#if 0
-  void reMaterialize(MachineBasicBlock &MBB,
-                     MachineBasicBlock::iterator MI,
-                     unsigned DestReg, unsigned SubIdx,
-                     const MachineInstr *Orig,
-                     const TargetRegisterInfo *TRI) const;
-  MachineInstr *duplicate(MachineInstr *Orig,
-                          MachineFunction &MF) const;
-#endif
   MachineInstr *
   convertToThreeAddress(MachineFunction::iterator &MFI,
                         MachineBasicBlock::iterator &MBBI,
                         LiveVariables *LV) const;
-#if 0
-  MachineInstr *commuteInstruction(MachineInstr *MI,
-                                   bool NewMI = false) const;
-  bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,
-                             unsigned &SrcOpIdx2) const;
-  bool produceSameValue(const MachineInstr *MI0,
-                        const MachineInstr *MI1) const;
-
-#endif
 
   bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
                      MachineBasicBlock *&FBB,
@@ -99,12 +74,6 @@ public:
                const SmallVectorImpl<MachineOperand> &Cond,
                DebugLoc DL) const;
 
-  bool copyRegToReg(MachineBasicBlock &MBB,
-                    MachineBasicBlock::iterator I,
-                    unsigned DestReg, unsigned SrcReg,
-                    const TargetRegisterClass *DestRC,
-                    const TargetRegisterClass *SrcRC,
-                    DebugLoc DL) const;
   virtual void copyPhysReg(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator MI, DebugLoc DL,
                            unsigned DestReg, unsigned SrcReg,
@@ -141,8 +110,6 @@ public:
   unsigned getOpcodeAfterMemoryUnfold(unsigned Opc,
                                       bool UnfoldLoad, bool UnfoldStore,
                                       unsigned *LoadRegIndex = 0) const;
-  bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
-                               int64_t &Offset1, int64_t &Offset2) const;
   bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
                                int64_t Offset1, int64_t Offset2,
                                unsigned NumLoads) const;
@@ -151,24 +118,36 @@ public:
   void insertNoop(MachineBasicBlock &MBB,
                   MachineBasicBlock::iterator MI) const;
   bool isPredicated(const MachineInstr *MI) const;
-#if 0
-  bool isUnpredicatedTerminator(const MachineInstr *MI) const;
-  bool PredicateInstruction(MachineInstr *MI,
-                            const SmallVectorImpl<MachineOperand> &Pred) const;
-#endif
   bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
                          const SmallVectorImpl<MachineOperand> &Pred2) const;
   bool DefinesPredicate(MachineInstr *MI,
                         std::vector<MachineOperand> &Pred) const;
   bool isPredicable(MachineInstr *MI) const;
   bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const;
-  unsigned GetInstSizeInBytes(const MachineInstr *MI) const;
-#if 0
-  unsigned GetFunctionSizeInBytes(const MachineFunction &MF) const;
-  unsigned getInlineAsmLength(const char *Str,
-                              const MCAsmInfo &MAI) const;
-#endif
-  };
+
+  // Helper functions that check the opcode for status information
+  bool isLoadInst(llvm::MachineInstr *MI) const;
+  bool isExtLoadInst(llvm::MachineInstr *MI) const;
+  bool isSWSExtLoadInst(llvm::MachineInstr *MI) const;
+  bool isSExtLoadInst(llvm::MachineInstr *MI) const;
+  bool isZExtLoadInst(llvm::MachineInstr *MI) const;
+  bool isAExtLoadInst(llvm::MachineInstr *MI) const;
+  bool isStoreInst(llvm::MachineInstr *MI) const;
+  bool isTruncStoreInst(llvm::MachineInstr *MI) const;
+  bool isAtomicInst(llvm::MachineInstr *MI) const;
+  bool isVolatileInst(llvm::MachineInstr *MI) const;
+  bool isGlobalInst(llvm::MachineInstr *MI) const;
+  bool isPrivateInst(llvm::MachineInstr *MI) const;
+  bool isConstantInst(llvm::MachineInstr *MI) const;
+  bool isRegionInst(llvm::MachineInstr *MI) const;
+  bool isLocalInst(llvm::MachineInstr *MI) const;
+  bool isImageInst(llvm::MachineInstr *MI) const;
+  bool isAppendInst(llvm::MachineInstr *MI) const;
+  bool isRegionAtomic(llvm::MachineInstr *MI) const;
+  bool isLocalAtomic(llvm::MachineInstr *MI) const;
+  bool isGlobalAtomic(llvm::MachineInstr *MI) const;
+  bool isArenaAtomic(llvm::MachineInstr *MI) const;
+};
 
 }
 
diff --git a/src/gallium/drivers/radeon/AMDILInstructions.td b/src/gallium/drivers/radeon/AMDILInstructions.td
index f824a67d7ad..db56e2121b3 100644
--- a/src/gallium/drivers/radeon/AMDILInstructions.td
+++ b/src/gallium/drivers/radeon/AMDILInstructions.td
@@ -1,4 +1,4 @@
-//===-- AMDILInstructions.td - TODO: Add brief description -------===//
+//===-- AMDILInstructions.td - AMDIL Instruction definitions --------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
diff --git a/src/gallium/drivers/radeon/AMDILMCCodeEmitter.cpp b/src/gallium/drivers/radeon/AMDILMCCodeEmitter.cpp
deleted file mode 100644
index 9366f2e7bcb..00000000000
--- a/src/gallium/drivers/radeon/AMDILMCCodeEmitter.cpp
+++ /dev/null
@@ -1,158 +0,0 @@
-//===---- AMDILMCCodeEmitter.cpp - Convert AMDIL text to AMDIL binary ----===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//==-----------------------------------------------------------------------===//
-//
-//===---------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "amdil-emitter"
-#include "AMDIL.h"
-#include "AMDILInstrInfo.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/MC/MCCodeEmitter.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/Support/raw_ostream.h"
-
-using namespace llvm;
-#if 0
-namespace {
-  class AMDILMCCodeEmitter : public MCCodeEmitter {
-    AMDILMCCodeEmitter(const AMDILMCCodeEmitter &);// DO NOT IMPLEMENT
-    void operator=(const AMDILMCCodeEmitter &); // DO NOT IMPLEMENT
-    const TargetMachine &TM;
-    const TargetInstrInfo &TII;
-    MCContext &Ctx;
-    bool Is64BitMode;
-    public:
-    AMDILMCCodeEmitter(TargetMachine &tm, MCContext &ctx, bool is64Bit);
-    ~AMDILMCCodeEmitter();
-    unsigned getNumFixupKinds() const;
-    const MCFixupKindInfo& getFixupKindInfo(MCFixupKind Kind) const;
-    static unsigned GetAMDILRegNum(const MCOperand &MO);
-    void EmitByte(unsigned char C, unsigned &CurByte, raw_ostream &OS) const;
-    void EmitConstant(uint64_t Val, unsigned Size, unsigned &CurByte,
-        raw_ostream &OS) const;
-    void EmitImmediate(const MCOperand &Disp, unsigned ImmSize,
-        MCFixupKind FixupKind, unsigned &CurByte, raw_ostream &os,
-        SmallVectorImpl<MCFixup> &Fixups, int ImmOffset = 0) const;
-
-    void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
-        SmallVectorImpl<MCFixup> &Fixups) const;
-
-  }; // class AMDILMCCodeEmitter
-}; // anonymous namespace
-
-namespace llvm {
-  MCCodeEmitter *createAMDILMCCodeEmitter(const Target &,
-      TargetMachine &TM, MCContext &Ctx)
-  {
-    return new AMDILMCCodeEmitter(TM, Ctx, false);
-  }
-}
-
-AMDILMCCodeEmitter::AMDILMCCodeEmitter(TargetMachine &tm, MCContext &ctx
-    , bool is64Bit)
-: TM(tm), TII(*TM.getInstrInfo()), Ctx(ctx)
-{
-  Is64BitMode = is64Bit;
-}
-
-AMDILMCCodeEmitter::~AMDILMCCodeEmitter()
-{
-}
-
-unsigned
-AMDILMCCodeEmitter::getNumFixupKinds() const
-{
-  return 0;
-}
-
-const MCFixupKindInfo &
-AMDILMCCodeEmitter::getFixupKindInfo(MCFixupKind Kind) const
-{
-//  const static MCFixupKindInfo Infos[] = {};
-  if (Kind < FirstTargetFixupKind) {
-    return MCCodeEmitter::getFixupKindInfo(Kind);
-  }
-  assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
-      "Invalid kind!");
-  return MCCodeEmitter::getFixupKindInfo(Kind);
- // return Infos[Kind - FirstTargetFixupKind];
-
-}
-
-void
-AMDILMCCodeEmitter::EmitByte(unsigned char C, unsigned &CurByte,
-    raw_ostream &OS) const
-{
-  OS << (char) C;
-  ++CurByte;
-}
-void
-AMDILMCCodeEmitter::EmitConstant(uint64_t Val, unsigned Size, unsigned &CurByte,
-    raw_ostream &OS) const
-{
-  // Output the constant in little endian byte order
-  for (unsigned i = 0; i != Size; ++i) {
-    EmitByte(Val & 255, CurByte, OS);
-    Val >>= 8;
-  }
-}
-void
-AMDILMCCodeEmitter::EmitImmediate(const MCOperand &DispOp, unsigned ImmSize,
-    MCFixupKind FixupKind, unsigned &CurByte, raw_ostream &OS,
-    SmallVectorImpl<MCFixup> &Fixups, int ImmOffset) const
-{
-  // If this is a simple integer displacement that doesn't require a relocation
-  // emit it now.
-  if (DispOp.isImm()) {
-    EmitConstant(DispOp.getImm() + ImmOffset, ImmSize, CurByte, OS);
-  }
-
-  // If we have an immoffset, add it to the expression
-  const MCExpr *Expr = DispOp.getExpr();
-
-  if (ImmOffset) {
-    Expr = MCBinaryExpr::CreateAdd(Expr,
-        MCConstantExpr::Create(ImmOffset, Ctx), Ctx);
-  }
-  // Emit a symbolic constant as a fixup and 4 zeros.
-  Fixups.push_back(MCFixup::Create(CurByte, Expr, FixupKind));
-  // TODO: Why the 4 zeros?
-  EmitConstant(0, ImmSize, CurByte, OS);
-}
-
-void
-AMDILMCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
-    SmallVectorImpl<MCFixup> &Fixups) const
-{
-#if 0
-  unsigned Opcode = MI.getOpcode();
-  const TargetInstrDesc &Desc = TII.get(Opcode);
-  unsigned TSFlags = Desc.TSFlags;
-
-  // Keep track of the current byte being emitted.
-  unsigned CurByte = 0;
-
-  unsigned NumOps = Desc.getNumOperands();
-  unsigned CurOp = 0;
-
-  unsigned char BaseOpcode = 0;
-#ifndef NDEBUG
-  // FIXME: Verify.
-  if (// !Desc.isVariadic() &&
-      CurOp != NumOps) {
-    errs() << "Cannot encode all operands of: ";
-    MI.dump();
-    errs() << '\n';
-    abort();
-  }
-#endif
-#endif
-}
-#endif
diff --git a/src/gallium/drivers/radeon/AMDILMachinePeephole.cpp b/src/gallium/drivers/radeon/AMDILMachinePeephole.cpp
index b8e536361f0..5cb988785e2 100644
--- a/src/gallium/drivers/radeon/AMDILMachinePeephole.cpp
+++ b/src/gallium/drivers/radeon/AMDILMachinePeephole.cpp
@@ -8,17 +8,11 @@
 //==-----------------------------------------------------------------------===//
 
 
-#define DEBUG_TYPE "machine_peephole"
-#if !defined(NDEBUG)
-#define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE))
-#else
-#define DEBUGME (false)
-#endif
-
 #include "AMDIL.h"
+#include "AMDILInstrInfo.h"
 #include "AMDILSubtarget.h"
-#include "AMDILUtilityFunctions.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Target/TargetMachine.h"
@@ -56,7 +50,7 @@ namespace llvm
 AMDILMachinePeephole::AMDILMachinePeephole(TargetMachine &tm AMDIL_OPT_LEVEL_DECL)
   : MachineFunctionPass(ID), TM(tm)
 {
-  mDebug = DEBUGME;
+  mDebug = false;
 }
 
 bool
@@ -64,6 +58,8 @@ AMDILMachinePeephole::runOnMachineFunction(MachineFunction &MF)
 {
   bool Changed = false;
   const AMDILSubtarget *STM = &TM.getSubtarget<AMDILSubtarget>();
+  const AMDILInstrInfo * AMDILII =
+                         static_cast<const AMDILInstrInfo *>(TM.getInstrInfo());
   for (MachineFunction::iterator MBB = MF.begin(), MBE = MF.end();
       MBB != MBE; ++MBB) {
     MachineBasicBlock *mb = MBB;
@@ -74,7 +70,7 @@ AMDILMachinePeephole::runOnMachineFunction(MachineFunction &MF)
       name = TM.getInstrInfo()->getName(mi->getOpcode());
       switch (mi->getOpcode()) {
         default:
-          if (isAtomicInst(TM.getInstrInfo(), mi)) {
+          if (AMDILII->isAtomicInst(mi)) {
             // If we don't support the hardware accellerated address spaces,
             // then the atomic needs to be transformed to the global atomic.
             if (strstr(name, "_L_")
@@ -94,7 +90,8 @@ AMDILMachinePeephole::runOnMachineFunction(MachineFunction &MF)
                   TM.getInstrInfo()->get(
                     (mi->getOpcode() - AMDIL::ATOM_R_ADD) + AMDIL::ATOM_G_ADD));
             }
-          } else if ((isLoadInst(TM.getInstrInfo(), mi) || isStoreInst(TM.getInstrInfo(), mi)) && isVolatileInst(TM.getInstrInfo(), mi)) {
+          } else if ((AMDILII->isLoadInst(mi) || AMDILII->isStoreInst(mi))
+                     && AMDILII->isVolatileInst(mi)) {
             insertFence(MIB);
           }
           continue;
diff --git a/src/gallium/drivers/radeon/AMDILMultiClass.td b/src/gallium/drivers/radeon/AMDILMultiClass.td
index 92691db52fd..d6828178ba7 100644
--- a/src/gallium/drivers/radeon/AMDILMultiClass.td
+++ b/src/gallium/drivers/radeon/AMDILMultiClass.td
@@ -1,4 +1,4 @@
-//===-- AMDILMultiClass.td - TODO: Add brief description -------===//
+//===-- AMDILMultiClass.td - AMDIL Multiclass defs ---*- tablegen -*-------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
diff --git a/src/gallium/drivers/radeon/AMDILNIDevice.cpp b/src/gallium/drivers/radeon/AMDILNIDevice.cpp
index 8fda1c18ae5..d4112cda0b5 100644
--- a/src/gallium/drivers/radeon/AMDILNIDevice.cpp
+++ b/src/gallium/drivers/radeon/AMDILNIDevice.cpp
@@ -1,4 +1,4 @@
-//===-- AMDILNIDevice.cpp - TODO: Add brief description -------===//
+//===-- AMDILNIDevice.cpp - Device Info for Northern Islands devices ------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
diff --git a/src/gallium/drivers/radeon/AMDILPeepholeOptimizer.cpp b/src/gallium/drivers/radeon/AMDILPeepholeOptimizer.cpp
index 5fe9f53c8c8..b62c7ab048b 100644
--- a/src/gallium/drivers/radeon/AMDILPeepholeOptimizer.cpp
+++ b/src/gallium/drivers/radeon/AMDILPeepholeOptimizer.cpp
@@ -1,4 +1,4 @@
-//===-- AMDILPeepholeOptimizer.cpp - TODO: Add brief description -------===//
+//===-- AMDILPeepholeOptimizer.cpp - AMDIL Peephole optimizations ---------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,20 +7,14 @@
 //
 //==-----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "PeepholeOpt"
-#ifdef DEBUG
-#define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE))
-#else
-#define DEBUGME 0
-#endif
-
 #include "AMDILAlgorithms.tpp"
 #include "AMDILDevices.h"
-#include "AMDILUtilityFunctions.h"
+#include "AMDILInstrInfo.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
+#include "llvm/Constants.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionAnalysis.h"
 #include "llvm/Function.h"
@@ -41,6 +35,9 @@ using namespace llvm;
 // The Peephole optimization pass is used to do simple last minute optimizations
 // that are required for correct code or to remove redundant functions
 namespace {
+
+class OpaqueType;
+
 class LLVM_LIBRARY_VISIBILITY AMDILPeepholeOpt : public FunctionPass {
 public:
   TargetMachine &TM;
@@ -114,6 +111,19 @@ private:
   // samplers at compile time.
   bool propagateSamplerInst(CallInst *CI);
 
+  // Helper functions
+
+  // Group of functions that recursively calculate the size of a structure based
+  // on its sub-types.
+  size_t getTypeSize(Type * const T, bool dereferencePtr = false);
+  size_t getTypeSize(StructType * const ST, bool dereferencePtr = false);
+  size_t getTypeSize(IntegerType * const IT, bool dereferencePtr = false);
+  size_t getTypeSize(FunctionType * const FT,bool dereferencePtr = false);
+  size_t getTypeSize(ArrayType * const AT, bool dereferencePtr = false);
+  size_t getTypeSize(VectorType * const VT, bool dereferencePtr = false);
+  size_t getTypeSize(PointerType * const PT, bool dereferencePtr = false);
+  size_t getTypeSize(OpaqueType * const OT, bool dereferencePtr = false);
+
   LLVMContext *mCTX;
   Function *mF;
   const AMDILSubtarget *mSTM;
@@ -134,7 +144,7 @@ namespace llvm {
 AMDILPeepholeOpt::AMDILPeepholeOpt(TargetMachine &tm AMDIL_OPT_LEVEL_DECL)
   : FunctionPass(ID), TM(tm) 
 {
-  mDebug = DEBUGME;
+  mDebug = false;
   optLevel = TM.getOptLevel();
 
 }
@@ -1136,3 +1146,106 @@ AMDILPeepholeOpt::getAnalysisUsage(AnalysisUsage &AU) const
   FunctionPass::getAnalysisUsage(AU);
   AU.setPreservesAll();
 }
+
+size_t AMDILPeepholeOpt::getTypeSize(Type * const T, bool dereferencePtr) {
+  size_t size = 0;
+  if (!T) {
+    return size;
+  }
+  switch (T->getTypeID()) {
+  case Type::X86_FP80TyID:
+  case Type::FP128TyID:
+  case Type::PPC_FP128TyID:
+  case Type::LabelTyID:
+    assert(0 && "These types are not supported by this backend");
+  default:
+  case Type::FloatTyID:
+  case Type::DoubleTyID:
+    size = T->getPrimitiveSizeInBits() >> 3;
+    break;
+  case Type::PointerTyID:
+    size = getTypeSize(dyn_cast<PointerType>(T), dereferencePtr);
+    break;
+  case Type::IntegerTyID:
+    size = getTypeSize(dyn_cast<IntegerType>(T), dereferencePtr);
+    break;
+  case Type::StructTyID:
+    size = getTypeSize(dyn_cast<StructType>(T), dereferencePtr);
+    break;
+  case Type::ArrayTyID:
+    size = getTypeSize(dyn_cast<ArrayType>(T), dereferencePtr);
+    break;
+  case Type::FunctionTyID:
+    size = getTypeSize(dyn_cast<FunctionType>(T), dereferencePtr);
+    break;
+  case Type::VectorTyID:
+    size = getTypeSize(dyn_cast<VectorType>(T), dereferencePtr);
+    break;
+  };
+  return size;
+}
+
+size_t AMDILPeepholeOpt::getTypeSize(StructType * const ST,
+    bool dereferencePtr) {
+  size_t size = 0;
+  if (!ST) {
+    return size;
+  }
+  Type *curType;
+  StructType::element_iterator eib;
+  StructType::element_iterator eie;
+  for (eib = ST->element_begin(), eie = ST->element_end(); eib != eie; ++eib) {
+    curType = *eib;
+    size += getTypeSize(curType, dereferencePtr);
+  }
+  return size;
+}
+
+size_t AMDILPeepholeOpt::getTypeSize(IntegerType * const IT,
+    bool dereferencePtr) {
+  return IT ? (IT->getBitWidth() >> 3) : 0;
+}
+
+size_t AMDILPeepholeOpt::getTypeSize(FunctionType * const FT,
+    bool dereferencePtr) {
+    assert(0 && "Should not be able to calculate the size of an function type");
+    return 0;
+}
+
+size_t AMDILPeepholeOpt::getTypeSize(ArrayType * const AT,
+    bool dereferencePtr) {
+  return (size_t)(AT ? (getTypeSize(AT->getElementType(),
+                                    dereferencePtr) * AT->getNumElements())
+                     : 0);
+}
+
+size_t AMDILPeepholeOpt::getTypeSize(VectorType * const VT,
+    bool dereferencePtr) {
+  return VT ? (VT->getBitWidth() >> 3) : 0;
+}
+
+size_t AMDILPeepholeOpt::getTypeSize(PointerType * const PT,
+    bool dereferencePtr) {
+  if (!PT) {
+    return 0;
+  }
+  Type *CT = PT->getElementType();
+  if (CT->getTypeID() == Type::StructTyID &&
+      PT->getAddressSpace() == AMDILAS::PRIVATE_ADDRESS) {
+    return getTypeSize(dyn_cast<StructType>(CT));
+  } else if (dereferencePtr) {
+    size_t size = 0;
+    for (size_t x = 0, y = PT->getNumContainedTypes(); x < y; ++x) {
+      size += getTypeSize(PT->getContainedType(x), dereferencePtr);
+    }
+    return size;
+  } else {
+    return 4;
+  }
+}
+
+size_t AMDILPeepholeOpt::getTypeSize(OpaqueType * const OT,
+    bool dereferencePtr) {
+  //assert(0 && "Should not be able to calculate the size of an opaque type");
+  return 4;
+}
diff --git a/src/gallium/drivers/radeon/AMDILRegisterInfo.cpp b/src/gallium/drivers/radeon/AMDILRegisterInfo.cpp
index 5588233378c..d7c1dc74b8b 100644
--- a/src/gallium/drivers/radeon/AMDILRegisterInfo.cpp
+++ b/src/gallium/drivers/radeon/AMDILRegisterInfo.cpp
@@ -20,7 +20,8 @@
 
 #include "AMDILRegisterInfo.h"
 #include "AMDIL.h"
-#include "AMDILUtilityFunctions.h"
+#include "AMDILInstrInfo.h"
+#include "AMDILTargetMachine.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -109,7 +110,9 @@ AMDILRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
     if (!MI.getOperand(x).isFI()) {
       continue;
     }
-    bool def = isStoreInst(TM.getInstrInfo(), &MI);
+    const AMDILInstrInfo * AMDILII =
+                         static_cast<const AMDILInstrInfo *>(TM.getInstrInfo());
+    bool def = AMDILII->isStoreInst(&MI);
     int FrameIndex = MI.getOperand(x).getIndex();
     int64_t Offset = MFI->getObjectOffset(FrameIndex);
     //int64_t Size = MF.getFrameInfo()->getObjectSize(FrameIndex);
diff --git a/src/gallium/drivers/radeon/AMDILSIDevice.cpp b/src/gallium/drivers/radeon/AMDILSIDevice.cpp
index ce560984ef9..ae402a5d1f7 100644
--- a/src/gallium/drivers/radeon/AMDILSIDevice.cpp
+++ b/src/gallium/drivers/radeon/AMDILSIDevice.cpp
@@ -1,49 +1,49 @@
-//===-- AMDILSIDevice.cpp - TODO: Add brief description -------===//
+//===-- AMDILSIDevice.cpp - Device Info for Southern Islands GPUs ---------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
-//==-----------------------------------------------------------------------===//
-#include "AMDILSIDevice.h"
-#include "AMDILEvergreenDevice.h"
-#include "AMDILNIDevice.h"
-#include "AMDILSubtarget.h"
+//==-----------------------------------------------------------------------===//
+#include "AMDILSIDevice.h"
+#include "AMDILEvergreenDevice.h"
+#include "AMDILNIDevice.h"
+#include "AMDILSubtarget.h"
 
-using namespace llvm;
-
-AMDILSIDevice::AMDILSIDevice(AMDILSubtarget *ST)
-  : AMDILEvergreenDevice(ST)
-{
-}
-AMDILSIDevice::~AMDILSIDevice()
-{
-}
-
-size_t
-AMDILSIDevice::getMaxLDSSize() const
-{
-  if (usesHardware(AMDILDeviceInfo::LocalMem)) {
-    return MAX_LDS_SIZE_900;
-  } else {
-    return 0;
-  }
-}
-
-uint32_t
-AMDILSIDevice::getGeneration() const
-{
-  return AMDILDeviceInfo::HD7XXX;
-}
-
-std::string
-AMDILSIDevice::getDataLayout() const
-{
-    return std::string("e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16"
-      "-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:32:32"
-      "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64"
-      "-v96:128:128-v128:128:128-v192:256:256-v256:256:256"
-      "-v512:512:512-v1024:1024:1024-v2048:2048:2048"
-      "-n8:16:32:64");
-}
+using namespace llvm;
+
+AMDILSIDevice::AMDILSIDevice(AMDILSubtarget *ST)
+  : AMDILEvergreenDevice(ST)
+{
+}
+AMDILSIDevice::~AMDILSIDevice()
+{
+}
+
+size_t
+AMDILSIDevice::getMaxLDSSize() const
+{
+  if (usesHardware(AMDILDeviceInfo::LocalMem)) {
+    return MAX_LDS_SIZE_900;
+  } else {
+    return 0;
+  }
+}
+
+uint32_t
+AMDILSIDevice::getGeneration() const
+{
+  return AMDILDeviceInfo::HD7XXX;
+}
+
+std::string
+AMDILSIDevice::getDataLayout() const
+{
+    return std::string("e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16"
+      "-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:32:32"
+      "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64"
+      "-v96:128:128-v128:128:128-v192:256:256-v256:256:256"
+      "-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+      "-n8:16:32:64");
+}
diff --git a/src/gallium/drivers/radeon/AMDILSIDevice.h b/src/gallium/drivers/radeon/AMDILSIDevice.h
index 69f35a0588d..b272af7cfcf 100644
--- a/src/gallium/drivers/radeon/AMDILSIDevice.h
+++ b/src/gallium/drivers/radeon/AMDILSIDevice.h
@@ -1,45 +1,45 @@
-//===------- AMDILSIDevice.h - Define SI Device for AMDIL -*- C++ -*------===//
+//===------- AMDILSIDevice.h - Define SI Device for AMDIL -*- C++ -*------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
-//==-----------------------------------------------------------------------===//
-//
-// Interface for the subtarget data classes.
-//
-//===---------------------------------------------------------------------===//
-// This file will define the interface that each generation needs to
-// implement in order to correctly answer queries on the capabilities of the
+//==-----------------------------------------------------------------------===//
+//
+// Interface for the subtarget data classes.
+//
+//===---------------------------------------------------------------------===//
+// This file will define the interface that each generation needs to
+// implement in order to correctly answer queries on the capabilities of the
 // specific hardware.
-//===---------------------------------------------------------------------===//
-#ifndef _AMDILSIDEVICE_H_
-#define _AMDILSIDEVICE_H_
-#include "AMDILEvergreenDevice.h"
-#include "AMDILSubtarget.h"
+//===---------------------------------------------------------------------===//
+#ifndef _AMDILSIDEVICE_H_
+#define _AMDILSIDEVICE_H_
+#include "AMDILEvergreenDevice.h"
+#include "AMDILSubtarget.h"
+
+namespace llvm {
+  class AMDILSubtarget;
+//===---------------------------------------------------------------------===//
+// SI generation of devices and their respective sub classes
+//===---------------------------------------------------------------------===//
+
+// The AMDILSIDevice is the base class for all Southern Islands series of
+// cards. It is very similar to the AMDILEvergreenDevice, with the major
+// exception being differences in wavefront size and hardware capabilities.  The
+// SI devices are all 64 wide wavefronts and also add support for signed 24 bit
+// integer operations
+
+  class AMDILSIDevice : public AMDILEvergreenDevice {
+    public:
+      AMDILSIDevice(AMDILSubtarget*);
+      virtual ~AMDILSIDevice();
+      virtual size_t getMaxLDSSize() const;
+      virtual uint32_t getGeneration() const;
+      virtual std::string getDataLayout() const;
+    protected:
+  }; // AMDILSIDevice
 
-namespace llvm {
-  class AMDILSubtarget;
-//===---------------------------------------------------------------------===//
-// SI generation of devices and their respective sub classes
-//===---------------------------------------------------------------------===//
-
-// The AMDILSIDevice is the base class for all Northern Island series of
-// cards. It is very similiar to the AMDILEvergreenDevice, with the major
-// exception being differences in wavefront size and hardware capabilities.  The
-// SI devices are all 64 wide wavefronts and also add support for signed 24 bit
-// integer operations
-
-  class AMDILSIDevice : public AMDILEvergreenDevice {
-    public:
-      AMDILSIDevice(AMDILSubtarget*);
-      virtual ~AMDILSIDevice();
-      virtual size_t getMaxLDSSize() const;
-      virtual uint32_t getGeneration() const;
-      virtual std::string getDataLayout() const;
-    protected:
-  }; // AMDILSIDevice
-
-} // namespace llvm
-#endif // _AMDILSIDEVICE_H_
+} // namespace llvm
+#endif // _AMDILSIDEVICE_H_
diff --git a/src/gallium/drivers/radeon/AMDILSubtarget.cpp b/src/gallium/drivers/radeon/AMDILSubtarget.cpp
index 11b6bbe0c01..249cb03f4a3 100644
--- a/src/gallium/drivers/radeon/AMDILSubtarget.cpp
+++ b/src/gallium/drivers/radeon/AMDILSubtarget.cpp
@@ -27,7 +27,8 @@ using namespace llvm;
 #define GET_SUBTARGETINFO_TARGET_DESC
 #include "AMDILGenSubtargetInfo.inc"
 
-AMDILSubtarget::AMDILSubtarget(llvm::StringRef TT, llvm::StringRef CPU, llvm::StringRef FS) : AMDILGenSubtargetInfo( TT, CPU, FS )
+AMDILSubtarget::AMDILSubtarget(llvm::StringRef TT, llvm::StringRef CPU, llvm::StringRef FS) : AMDILGenSubtargetInfo( TT, CPU, FS ),
+  mDumpCode(false)
 {
   memset(CapsOverride, 0, sizeof(*CapsOverride)
       * AMDILDeviceInfo::MaxNumberCapabilities);
@@ -93,7 +94,7 @@ AMDILSubtarget::AMDILSubtarget(llvm::StringRef TT, llvm::StringRef CPU, llvm::St
   }
 #endif
   mDevName = GPU;
-  mDevice = getDeviceFromName(mDevName, this, mIs64bit);
+  mDevice = AMDILDeviceInfo::getDeviceFromName(mDevName, this, mIs64bit);
 }
 AMDILSubtarget::~AMDILSubtarget()
 {
diff --git a/src/gallium/drivers/radeon/AMDILSubtarget.h b/src/gallium/drivers/radeon/AMDILSubtarget.h
index a4b0e34ada7..38fcb859ac6 100644
--- a/src/gallium/drivers/radeon/AMDILSubtarget.h
+++ b/src/gallium/drivers/radeon/AMDILSubtarget.h
@@ -42,6 +42,7 @@ namespace llvm {
       uint32_t mVersion;
       bool mIs64bit;
       bool mIs32on64bit;
+      bool mDumpCode;
     public:
       AMDILSubtarget(llvm::StringRef TT, llvm::StringRef CPU, llvm::StringRef FS);
       virtual ~AMDILSubtarget();
@@ -67,6 +68,7 @@ namespace llvm {
         ParseSubtargetFeatures(
             llvm::StringRef CPU,
             llvm::StringRef FS);
+      bool dumpCode() const { return mDumpCode; }
 
   };
 
diff --git a/src/gallium/drivers/radeon/AMDILTargetMachine.cpp b/src/gallium/drivers/radeon/AMDILTargetMachine.cpp
index 77fac1d97bd..0879d43ad72 100644
--- a/src/gallium/drivers/radeon/AMDILTargetMachine.cpp
+++ b/src/gallium/drivers/radeon/AMDILTargetMachine.cpp
@@ -150,8 +150,8 @@ bool AMDILPassConfig::addPreISel()
 
 bool AMDILPassConfig::addInstSelector()
 {
-  PM.add(createAMDILPeepholeOpt(*TM));
-  PM.add(createAMDILISelDag(getAMDILTargetMachine()));
+  PM->add(createAMDILPeepholeOpt(*TM));
+  PM->add(createAMDILISelDag(getAMDILTargetMachine()));
   return false;
 }
 
@@ -162,7 +162,7 @@ bool AMDILPassConfig::addPreRegAlloc()
     llvm::RegisterScheduler::setDefault(&llvm::createSourceListDAGScheduler);
   }
 
-  PM.add(createAMDILMachinePeephole(*TM));
+  PM->add(createAMDILMachinePeephole(*TM));
   return false;
 }
 
@@ -175,8 +175,8 @@ bool AMDILPassConfig::addPostRegAlloc() {
 /// true if -print-machineinstrs should print out the code after the passes.
 bool AMDILPassConfig::addPreEmitPass()
 {
-  PM.add(createAMDILCFGPreparationPass(*TM));
-  PM.add(createAMDILCFGStructurizerPass(*TM));
+  PM->add(createAMDILCFGPreparationPass(*TM));
+  PM->add(createAMDILCFGStructurizerPass(*TM));
   return true;
 }
 
diff --git a/src/gallium/drivers/radeon/AMDILTokenDesc.td b/src/gallium/drivers/radeon/AMDILTokenDesc.td
index b81f593506f..2dafb2cd559 100644
--- a/src/gallium/drivers/radeon/AMDILTokenDesc.td
+++ b/src/gallium/drivers/radeon/AMDILTokenDesc.td
@@ -1,4 +1,4 @@
-//===-- AMDILTokenDesc.td - TODO: Add brief description -------===//
+//===-- AMDILTokenDesc.td - AMDIL Token Definitions --*- tablegen -*-----===//
 //
 //                     The LLVM Compiler Infrastructure
 //
diff --git a/src/gallium/drivers/radeon/AMDILUtilityFunctions.cpp b/src/gallium/drivers/radeon/AMDILUtilityFunctions.cpp
deleted file mode 100644
index f2ef4eb7771..00000000000
--- a/src/gallium/drivers/radeon/AMDILUtilityFunctions.cpp
+++ /dev/null
@@ -1,683 +0,0 @@
-//===-- AMDILUtilityFunctions.cpp - AMDIL Utility Functions       ---------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//==-----------------------------------------------------------------------===//
-//
-// This file provides the implementations of functions that are declared in the
-// AMDILUtilityFUnctions.h file.
-//
-//===----------------------------------------------------------------------===//
-#include "AMDILUtilityFunctions.h"
-#include "AMDILISelLowering.h"
-#include "llvm/ADT/ValueMap.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/Constants.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Instruction.h"
-#include "llvm/Instructions.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/FormattedStream.h"
-#include "llvm/Type.h"
-
-#include <cstdio>
-#include <list>
-#include <queue>
-
-#define GET_OPCODE_NAME(TII, MI) \
-  TII->getName(MI->getOpcode())
-
-
-using namespace llvm;
-int64_t GET_SCALAR_SIZE(llvm::Type *A) {
-  return A->getScalarSizeInBits();
-}
-
-const TargetRegisterClass * getRegClassFromID(unsigned int ID) {
-  switch (ID) {
-  default:
-    assert(0 && "Passed in ID does not match any register classes.");
-    return NULL;
-  case AMDIL::GPRI8RegClassID:
-    return &AMDIL::GPRI8RegClass;
-  case AMDIL::GPRI16RegClassID:
-    return &AMDIL::GPRI16RegClass;
-  case AMDIL::GPRI32RegClassID:
-    return &AMDIL::GPRI32RegClass;
-  case AMDIL::GPRF32RegClassID:
-    return &AMDIL::GPRF32RegClass;
-  case AMDIL::GPRI64RegClassID:
-    return &AMDIL::GPRI64RegClass;
-  case AMDIL::GPRF64RegClassID:
-    return &AMDIL::GPRF64RegClass;
-  case AMDIL::GPRV4F32RegClassID:
-    return &AMDIL::GPRV4F32RegClass;
-  case AMDIL::GPRV4I8RegClassID:
-    return &AMDIL::GPRV4I8RegClass;
-  case AMDIL::GPRV4I16RegClassID:
-    return &AMDIL::GPRV4I16RegClass;
-  case AMDIL::GPRV4I32RegClassID:
-    return &AMDIL::GPRV4I32RegClass;
-  case AMDIL::GPRV2F32RegClassID:
-    return &AMDIL::GPRV2F32RegClass;
-  case AMDIL::GPRV2I8RegClassID:
-    return &AMDIL::GPRV2I8RegClass;
-  case AMDIL::GPRV2I16RegClassID:
-    return &AMDIL::GPRV2I16RegClass;
-  case AMDIL::GPRV2I32RegClassID:
-    return &AMDIL::GPRV2I32RegClass;
-  case AMDIL::GPRV2F64RegClassID:
-    return &AMDIL::GPRV2F64RegClass;
-  case AMDIL::GPRV2I64RegClassID:
-    return &AMDIL::GPRV2I64RegClass;
-  };
-}
-
-unsigned int getMoveInstFromID(unsigned int ID) {
-  switch (ID) {
-  default:
-    assert(0 && "Passed in ID does not match any move instructions.");
-  case AMDIL::GPRI8RegClassID:
-    return AMDIL::MOVE_i8;
-  case AMDIL::GPRI16RegClassID:
-    return AMDIL::MOVE_i16;
-  case AMDIL::GPRI32RegClassID:
-    return AMDIL::MOVE_i32;
-  case AMDIL::GPRF32RegClassID:
-    return AMDIL::MOVE_f32;
-  case AMDIL::GPRI64RegClassID:
-    return AMDIL::MOVE_i64;
-  case AMDIL::GPRF64RegClassID:
-    return AMDIL::MOVE_f64;
-  case AMDIL::GPRV4F32RegClassID:
-    return AMDIL::MOVE_v4f32;
-  case AMDIL::GPRV4I8RegClassID:
-    return AMDIL::MOVE_v4i8;
-  case AMDIL::GPRV4I16RegClassID:
-    return AMDIL::MOVE_v4i16;
-  case AMDIL::GPRV4I32RegClassID:
-    return AMDIL::MOVE_v4i32;
-  case AMDIL::GPRV2F32RegClassID:
-    return AMDIL::MOVE_v2f32;
-  case AMDIL::GPRV2I8RegClassID:
-    return AMDIL::MOVE_v2i8;
-  case AMDIL::GPRV2I16RegClassID:
-    return AMDIL::MOVE_v2i16;
-  case AMDIL::GPRV2I32RegClassID:
-    return AMDIL::MOVE_v2i32;
-  case AMDIL::GPRV2F64RegClassID:
-    return AMDIL::MOVE_v2f64;
-  case AMDIL::GPRV2I64RegClassID:
-    return AMDIL::MOVE_v2i64;
-  };
-  return -1;
-}
-
-unsigned int getPHIMoveInstFromID(unsigned int ID) {
-  switch (ID) {
-  default:
-    assert(0 && "Passed in ID does not match any move instructions.");
-  case AMDIL::GPRI8RegClassID:
-    return AMDIL::PHIMOVE_i8;
-  case AMDIL::GPRI16RegClassID:
-    return AMDIL::PHIMOVE_i16;
-  case AMDIL::GPRI32RegClassID:
-    return AMDIL::PHIMOVE_i32;
-  case AMDIL::GPRF32RegClassID:
-    return AMDIL::PHIMOVE_f32;
-  case AMDIL::GPRI64RegClassID:
-    return AMDIL::PHIMOVE_i64;
-  case AMDIL::GPRF64RegClassID:
-    return AMDIL::PHIMOVE_f64;
-  case AMDIL::GPRV4F32RegClassID:
-    return AMDIL::PHIMOVE_v4f32;
-  case AMDIL::GPRV4I8RegClassID:
-    return AMDIL::PHIMOVE_v4i8;
-  case AMDIL::GPRV4I16RegClassID:
-    return AMDIL::PHIMOVE_v4i16;
-  case AMDIL::GPRV4I32RegClassID:
-    return AMDIL::PHIMOVE_v4i32;
-  case AMDIL::GPRV2F32RegClassID:
-    return AMDIL::PHIMOVE_v2f32;
-  case AMDIL::GPRV2I8RegClassID:
-    return AMDIL::PHIMOVE_v2i8;
-  case AMDIL::GPRV2I16RegClassID:
-    return AMDIL::PHIMOVE_v2i16;
-  case AMDIL::GPRV2I32RegClassID:
-    return AMDIL::PHIMOVE_v2i32;
-  case AMDIL::GPRV2F64RegClassID:
-    return AMDIL::PHIMOVE_v2f64;
-  case AMDIL::GPRV2I64RegClassID:
-    return AMDIL::PHIMOVE_v2i64;
-  };
-  return -1;
-}
-
-const TargetRegisterClass* getRegClassFromType(unsigned int type) {
-  switch (type) {
-  default:
-    assert(0 && "Passed in type does not match any register classes.");
-  case MVT::i8:
-    return &AMDIL::GPRI8RegClass;
-  case MVT::i16:
-    return &AMDIL::GPRI16RegClass;
-  case MVT::i32:
-    return &AMDIL::GPRI32RegClass;
-  case MVT::f32:
-    return &AMDIL::GPRF32RegClass;
-  case MVT::i64:
-    return &AMDIL::GPRI64RegClass;
-  case MVT::f64:
-    return &AMDIL::GPRF64RegClass;
-  case MVT::v4f32:
-    return &AMDIL::GPRV4F32RegClass;
-  case MVT::v4i8:
-    return &AMDIL::GPRV4I8RegClass;
-  case MVT::v4i16:
-    return &AMDIL::GPRV4I16RegClass;
-  case MVT::v4i32:
-    return &AMDIL::GPRV4I32RegClass;
-  case MVT::v2f32:
-    return &AMDIL::GPRV2F32RegClass;
-  case MVT::v2i8:
-    return &AMDIL::GPRV2I8RegClass;
-  case MVT::v2i16:
-    return &AMDIL::GPRV2I16RegClass;
-  case MVT::v2i32:
-    return &AMDIL::GPRV2I32RegClass;
-  case MVT::v2f64:
-    return &AMDIL::GPRV2F64RegClass;
-  case MVT::v2i64:
-    return &AMDIL::GPRV2I64RegClass;
-  }
-}
-
-void printSDNode(const SDNode *N) {
-  printf("Opcode: %d isTargetOpcode: %d isMachineOpcode: %d\n",
-         N->getOpcode(), N->isTargetOpcode(), N->isMachineOpcode());
-  printf("Empty: %d OneUse: %d Size: %d NodeID: %d\n",
-         N->use_empty(), N->hasOneUse(), (int)N->use_size(), N->getNodeId());
-  for (unsigned int i = 0; i < N->getNumOperands(); ++i) {
-    printf("OperandNum: %d ValueCount: %d ValueType: %d\n",
-           i, N->getNumValues(), N->getValueType(0) .getSimpleVT().SimpleTy);
-    printSDValue(N->getOperand(i), 0);
-  }
-}
-
-void printSDValue(const SDValue &Op, int level) {
-  printf("\nOp: %p OpCode: %d NumOperands: %d ", (void*)&Op, Op.getOpcode(),
-         Op.getNumOperands());
-  printf("IsTarget: %d IsMachine: %d ", Op.isTargetOpcode(),
-         Op.isMachineOpcode());
-  if (Op.isMachineOpcode()) {
-    printf("MachineOpcode: %d\n", Op.getMachineOpcode());
-  } else {
-    printf("\n");
-  }
-  EVT vt = Op.getValueType();
-  printf("ValueType: %d \n", vt.getSimpleVT().SimpleTy);
-  printf("UseEmpty: %d OneUse: %d\n", Op.use_empty(), Op.hasOneUse());
-  if (level) {
-    printf("Children for %d:\n", level);
-    for (unsigned int i = 0; i < Op.getNumOperands(); ++i) {
-      printf("Child %d->%d:", level, i);
-      printSDValue(Op.getOperand(i), level - 1);
-    }
-  }
-}
-
-bool isPHIMove(unsigned int opcode) {
-  switch (opcode) {
-  default:
-    return false;
-    ExpandCaseToAllTypes(AMDIL::PHIMOVE);
-    return true;
-  }
-  return false;
-}
-
-bool isMove(unsigned int opcode) {
-  switch (opcode) {
-  default:
-    return false;
-    ExpandCaseToAllTypes(AMDIL::MOVE);
-    return true;
-  }
-  return false;
-}
-
-bool isMoveOrEquivalent(unsigned int opcode) {
-  switch (opcode) {
-  default:
-    return isMove(opcode) || isPHIMove(opcode);
-    ExpandCaseToAllScalarTypes(AMDIL::IL_ASCHAR);
-    ExpandCaseToAllScalarTypes(AMDIL::IL_ASSHORT);
-    ExpandCaseToAllScalarTypes(AMDIL::IL_ASINT);
-    ExpandCaseToAllScalarTypes(AMDIL::IL_ASLONG);
-    ExpandCaseToAllScalarTypes(AMDIL::IL_ASDOUBLE);
-    ExpandCaseToAllScalarTypes(AMDIL::IL_ASFLOAT);
-    ExpandCaseToAllScalarTypes(AMDIL::IL_ASV2CHAR);
-    ExpandCaseToAllScalarTypes(AMDIL::IL_ASV2SHORT);
-    ExpandCaseToAllScalarTypes(AMDIL::IL_ASV2INT);
-    ExpandCaseToAllScalarTypes(AMDIL::IL_ASV2FLOAT);
-    ExpandCaseToAllScalarTypes(AMDIL::IL_ASV2LONG);
-    ExpandCaseToAllScalarTypes(AMDIL::IL_ASV2DOUBLE);
-    ExpandCaseToAllScalarTypes(AMDIL::IL_ASV4CHAR);
-    ExpandCaseToAllScalarTypes(AMDIL::IL_ASV4SHORT);
-    ExpandCaseToAllScalarTypes(AMDIL::IL_ASV4INT);
-    ExpandCaseToAllScalarTypes(AMDIL::IL_ASV4FLOAT);
-    case AMDIL::INTTOANY_i8:
-    case AMDIL::INTTOANY_i16:
-    case AMDIL::INTTOANY_i32:
-    case AMDIL::INTTOANY_f32:
-    case AMDIL::DLO:
-    case AMDIL::LLO:
-    case AMDIL::LLO_v2i64:
-      return true;
-  };
-  return false;
-}
-
-bool check_type(const Value *ptr, unsigned int addrspace) {
-  if (!ptr) {
-    return false;
-  }
-  Type *ptrType = ptr->getType();
-  return dyn_cast<PointerType>(ptrType)->getAddressSpace() == addrspace;
-}
-
-size_t getTypeSize(Type * const T, bool dereferencePtr) {
-  size_t size = 0;
-  if (!T) {
-    return size;
-  }
-  switch (T->getTypeID()) {
-  case Type::X86_FP80TyID:
-  case Type::FP128TyID:
-  case Type::PPC_FP128TyID:
-  case Type::LabelTyID:
-    assert(0 && "These types are not supported by this backend");
-  default:
-  case Type::FloatTyID:
-  case Type::DoubleTyID:
-    size = T->getPrimitiveSizeInBits() >> 3;
-    break;
-  case Type::PointerTyID:
-    size = getTypeSize(dyn_cast<PointerType>(T), dereferencePtr);
-    break;
-  case Type::IntegerTyID:
-    size = getTypeSize(dyn_cast<IntegerType>(T), dereferencePtr);
-    break;
-  case Type::StructTyID:
-    size = getTypeSize(dyn_cast<StructType>(T), dereferencePtr);
-    break;
-  case Type::ArrayTyID:
-    size = getTypeSize(dyn_cast<ArrayType>(T), dereferencePtr);
-    break;
-  case Type::FunctionTyID:
-    size = getTypeSize(dyn_cast<FunctionType>(T), dereferencePtr);
-    break;
-  case Type::VectorTyID:
-    size = getTypeSize(dyn_cast<VectorType>(T), dereferencePtr);
-    break;
-  };
-  return size;
-}
-
-size_t getTypeSize(StructType * const ST, bool dereferencePtr) {
-  size_t size = 0;
-  if (!ST) {
-    return size;
-  }
-  Type *curType;
-  StructType::element_iterator eib;
-  StructType::element_iterator eie;
-  for (eib = ST->element_begin(), eie = ST->element_end(); eib != eie; ++eib) {
-    curType = *eib;
-    size += getTypeSize(curType, dereferencePtr);
-  }
-  return size;
-}
-
-size_t getTypeSize(IntegerType * const IT, bool dereferencePtr) {
-  return IT ? (IT->getBitWidth() >> 3) : 0;
-}
-
-size_t getTypeSize(FunctionType * const FT, bool dereferencePtr) {
-    assert(0 && "Should not be able to calculate the size of an function type");
-    return 0;
-}
-
-size_t getTypeSize(ArrayType * const AT, bool dereferencePtr) {
-  return (size_t)(AT ? (getTypeSize(AT->getElementType(),
-                                    dereferencePtr) * AT->getNumElements())
-                     : 0);
-}
-
-size_t getTypeSize(VectorType * const VT, bool dereferencePtr) {
-  return VT ? (VT->getBitWidth() >> 3) : 0;
-}
-
-size_t getTypeSize(PointerType * const PT, bool dereferencePtr) {
-  if (!PT) {
-    return 0;
-  }
-  Type *CT = PT->getElementType();
-  if (CT->getTypeID() == Type::StructTyID &&
-      PT->getAddressSpace() == AMDILAS::PRIVATE_ADDRESS) {
-    return getTypeSize(dyn_cast<StructType>(CT));
-  } else if (dereferencePtr) {
-    size_t size = 0;
-    for (size_t x = 0, y = PT->getNumContainedTypes(); x < y; ++x) {
-      size += getTypeSize(PT->getContainedType(x), dereferencePtr);
-    }
-    return size;
-  } else {
-    return 4;
-  }
-}
-
-size_t getTypeSize(OpaqueType * const OT, bool dereferencePtr) {
-  //assert(0 && "Should not be able to calculate the size of an opaque type");
-  return 4;
-}
-
-size_t getNumElements(Type * const T) {
-  size_t size = 0;
-  if (!T) {
-    return size;
-  }
-  switch (T->getTypeID()) {
-  case Type::X86_FP80TyID:
-  case Type::FP128TyID:
-  case Type::PPC_FP128TyID:
-  case Type::LabelTyID:
-    assert(0 && "These types are not supported by this backend");
-  default:
-  case Type::FloatTyID:
-  case Type::DoubleTyID:
-    size = 1;
-    break;
-  case Type::PointerTyID:
-    size = getNumElements(dyn_cast<PointerType>(T));
-    break;
-  case Type::IntegerTyID:
-    size = getNumElements(dyn_cast<IntegerType>(T));
-    break;
-  case Type::StructTyID:
-    size = getNumElements(dyn_cast<StructType>(T));
-    break;
-  case Type::ArrayTyID:
-    size = getNumElements(dyn_cast<ArrayType>(T));
-    break;
-  case Type::FunctionTyID:
-    size = getNumElements(dyn_cast<FunctionType>(T));
-    break;
-  case Type::VectorTyID:
-    size = getNumElements(dyn_cast<VectorType>(T));
-    break;
-  };
-  return size;
-}
-
-size_t getNumElements(StructType * const ST) {
-  size_t size = 0;
-  if (!ST) {
-    return size;
-  }
-  Type *curType;
-  StructType::element_iterator eib;
-  StructType::element_iterator eie;
-  for (eib = ST->element_begin(), eie = ST->element_end();
-       eib != eie; ++eib) {
-    curType = *eib;
-    size += getNumElements(curType);
-  }
-  return size;
-}
-
-size_t getNumElements(IntegerType * const IT) {
-  return (!IT) ? 0 : 1;
-}
-
-size_t getNumElements(FunctionType * const FT) {
-  assert(0 && "Should not be able to calculate the number of "
-         "elements of a function type");
-  return 0;
-}
-
-size_t getNumElements(ArrayType * const AT) {
-  return (!AT) ? 0
-               :  (size_t)(getNumElements(AT->getElementType()) *
-                           AT->getNumElements());
-}
-
-size_t getNumElements(VectorType * const VT) {
-  return (!VT) ? 0
-               : VT->getNumElements() * getNumElements(VT->getElementType());
-}
-
-size_t getNumElements(PointerType * const PT) {
-  size_t size = 0;
-  if (!PT) {
-    return size;
-  }
-  for (size_t x = 0, y = PT->getNumContainedTypes(); x < y; ++x) {
-    size += getNumElements(PT->getContainedType(x));
-  }
-  return size;
-}
-
-const llvm::Value *getBasePointerValue(const llvm::Value *V)
-{
-  if (!V) {
-    return NULL;
-  }
-  const Value *ret = NULL;
-  ValueMap<const Value *, bool> ValueBitMap;
-  std::queue<const Value *, std::list<const Value *> > ValueQueue;
-  ValueQueue.push(V);
-  while (!ValueQueue.empty()) {
-    V = ValueQueue.front();
-    if (ValueBitMap.find(V) == ValueBitMap.end()) {
-      ValueBitMap[V] = true;
-      if (dyn_cast<Argument>(V) && dyn_cast<PointerType>(V->getType())) {
-        ret = V;
-        break;
-      } else if (dyn_cast<GlobalVariable>(V)) {
-        ret = V;
-        break;
-      } else if (dyn_cast<Constant>(V)) {
-        const ConstantExpr *CE = dyn_cast<ConstantExpr>(V);
-        if (CE) {
-          ValueQueue.push(CE->getOperand(0));
-        }
-      } else if (const AllocaInst *AI = dyn_cast<AllocaInst>(V)) {
-        ret = AI;
-        break;
-      } else if (const Instruction *I = dyn_cast<Instruction>(V)) {
-        uint32_t numOps = I->getNumOperands();
-        for (uint32_t x = 0; x < numOps; ++x) {
-          ValueQueue.push(I->getOperand(x));
-        }
-      } else {
-        // assert(0 && "Found a Value that we didn't know how to handle!");
-      }
-    }
-    ValueQueue.pop();
-  }
-  return ret;
-}
-
-const llvm::Value *getBasePointerValue(const llvm::MachineInstr *MI) {
-  const Value *moVal = NULL;
-  if (!MI->memoperands_empty()) {
-    const MachineMemOperand *memOp = (*MI->memoperands_begin());
-    moVal = memOp ? memOp->getValue() : NULL;
-    moVal = getBasePointerValue(moVal);
-  }
-  return moVal;
-}
-
-bool commaPrint(int i, llvm::raw_ostream &O) {
-  O << ":" << i;
-  return false;
-}
-
-bool isLoadInst(const llvm::TargetInstrInfo * TII, MachineInstr *MI) {
-  if (strstr(GET_OPCODE_NAME(TII, MI), "LOADCONST")) {
-    return false;
-  }
-  return strstr(GET_OPCODE_NAME(TII, MI), "LOAD");
-}
-
-bool isSWSExtLoadInst(MachineInstr *MI)
-{
-switch (MI->getOpcode()) {
-    default:
-      break;
-      ExpandCaseToByteShortTypes(AMDIL::LOCALLOAD);
-      ExpandCaseToByteShortTypes(AMDIL::GLOBALLOAD);
-      ExpandCaseToByteShortTypes(AMDIL::REGIONLOAD);
-      ExpandCaseToByteShortTypes(AMDIL::PRIVATELOAD);
-      ExpandCaseToByteShortTypes(AMDIL::CPOOLLOAD);
-      ExpandCaseToByteShortTypes(AMDIL::CONSTANTLOAD);
-      return true;
-  };
-  return false;
-}
-
-bool isExtLoadInst(const llvm::TargetInstrInfo * TII, MachineInstr *MI) {
-  return strstr(GET_OPCODE_NAME(TII, MI), "EXTLOAD");
-}
-
-bool isSExtLoadInst(const llvm::TargetInstrInfo * TII, MachineInstr *MI) {
-  return strstr(GET_OPCODE_NAME(TII, MI), "SEXTLOAD");
-}
-
-bool isAExtLoadInst(const llvm::TargetInstrInfo * TII, MachineInstr *MI) {
-  return strstr(GET_OPCODE_NAME(TII, MI), "AEXTLOAD");
-}
-
-bool isZExtLoadInst(const llvm::TargetInstrInfo * TII, MachineInstr *MI) {
-  return strstr(GET_OPCODE_NAME(TII, MI), "ZEXTLOAD");
-}
-
-bool isStoreInst(const llvm::TargetInstrInfo * TII, MachineInstr *MI) {
-  return strstr(GET_OPCODE_NAME(TII, MI), "STORE");
-}
-
-bool isTruncStoreInst(const llvm::TargetInstrInfo * TII, MachineInstr *MI) {
-  return strstr(GET_OPCODE_NAME(TII, MI), "TRUNCSTORE");
-}
-
-bool isAtomicInst(const llvm::TargetInstrInfo * TII, MachineInstr *MI) {
-  return strstr(GET_OPCODE_NAME(TII, MI), "ATOM");
-}
-
-bool isVolatileInst(const llvm::TargetInstrInfo * TII, MachineInstr *MI) {
-  if (!MI->memoperands_empty()) {
-    for (MachineInstr::mmo_iterator mob = MI->memoperands_begin(),
-        moe = MI->memoperands_end(); mob != moe; ++mob) {
-      // If there is a volatile mem operand, this is a volatile instruction.
-      if ((*mob)->isVolatile()) {
-        return true;
-      }
-    }
-  }
-  return false;
-}
-bool isGlobalInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI)
-{
-  return strstr(GET_OPCODE_NAME(TII, MI), "GLOBAL");
-}
-bool isPrivateInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI)
-{
-  return strstr(GET_OPCODE_NAME(TII, MI), "PRIVATE");
-}
-bool isConstantInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI)
-{
-  return strstr(GET_OPCODE_NAME(TII, MI), "CONSTANT")
-    || strstr(GET_OPCODE_NAME(TII, MI), "CPOOL");
-}
-bool isRegionInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI)
-{
-  return strstr(GET_OPCODE_NAME(TII, MI), "REGION");
-}
-bool isLocalInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI)
-{
-  return strstr(GET_OPCODE_NAME(TII, MI), "LOCAL");
-}
-bool isImageInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI)
-{
-  return strstr(GET_OPCODE_NAME(TII, MI), "IMAGE");
-}
-bool isAppendInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI)
-{
-  return strstr(GET_OPCODE_NAME(TII, MI), "APPEND");
-}
-bool isRegionAtomic(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI)
-{
-  return strstr(GET_OPCODE_NAME(TII, MI), "ATOM_R");
-}
-bool isLocalAtomic(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI)
-{
-  return strstr(GET_OPCODE_NAME(TII, MI), "ATOM_L");
-}
-bool isGlobalAtomic(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI)
-{
-  return strstr(GET_OPCODE_NAME(TII, MI), "ATOM_G")
-    || isArenaAtomic(TII, MI);
-}
-bool isArenaAtomic(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI)
-{
-  return strstr(GET_OPCODE_NAME(TII, MI), "ATOM_A");
-}
-
-const char* getSrcSwizzle(unsigned idx) {
-  const char *srcSwizzles[]  = {
-    "", ".x000", ".0x00", ".00x0", ".000x", ".y000", ".0y00", ".00y0", ".000y", 
-    ".z000", ".0z00", ".00z0", ".000z", ".w000", ".0w00", ".00w0", ".000w",
-    ".xy00", ".00xy", ".zw00", ".00zw", ".xyz0", ".0xyz", ".xyzw", ".0000",
-    ".xxxx", ".yyyy", ".zzzz", ".wwww", ".xyxy", ".zwzw", ".xzxz", ".ywyw",
-    ".x0y0", ".0x0y", ".xy_neg(y)", "_neg(yw)", "_neg(x)", ".xy_neg(xy)",
-    "_neg(xyzw)", ".0yzw", ".x0zw", ".xy0w", ".x", ".y", ".z", ".w", ".xy",
-    ".zw"
-  };
-  assert(idx < sizeof(srcSwizzles)/sizeof(srcSwizzles[0])
-      && "Idx passed in is invalid!");
-  return srcSwizzles[idx];
-}
-const char* getDstSwizzle(unsigned idx) {
-  const char *dstSwizzles[] = {
-    "", ".x___", ".xy__", ".xyz_", ".xyzw", "._y__", "._yz_", "._yzw", ".__z_",
-    ".__zw", ".___w", ".x_zw", ".xy_w", ".x_z_", ".x__w", "._y_w", 
-  };
-  assert(idx < sizeof(dstSwizzles)/sizeof(dstSwizzles[0])
-      && "Idx passed in is invalid!");
-  return dstSwizzles[idx];
-}
-/// Helper function to get the currently set flags
-void getAsmPrinterFlags(MachineInstr *MI, AMDILAS::InstrResEnc &curRes)
-{
-  // We need 16 bits of information, but LLVMr127097 cut the field in half.
-  // So we have to use two different fields to store all of our information.
-  uint16_t upper = MI->getFlags() << 8;
-  uint16_t lower = MI->getAsmPrinterFlags();
-  curRes.u16all = upper | lower;
-}
-/// Helper function to clear the currently set flags and add the new flags.
-void setAsmPrinterFlags(MachineInstr *MI, AMDILAS::InstrResEnc &curRes)
-{
-  // We need 16 bits of information, but LLVMr127097 cut the field in half.
-  // So we have to use two different fields to store all of our information.
-  MI->clearAsmPrinterFlags();
-  MI->setFlags(0);
-  uint8_t lower = curRes.u16all & 0xFF;
-  uint8_t upper = (curRes.u16all >> 8) & 0xFF;
-  MI->setFlags(upper);
-  MI->setAsmPrinterFlag((llvm::MachineInstr::CommentFlag)lower);
-}
diff --git a/src/gallium/drivers/radeon/AMDILUtilityFunctions.h b/src/gallium/drivers/radeon/AMDILUtilityFunctions.h
index 637c868b55c..66af706bbb3 100644
--- a/src/gallium/drivers/radeon/AMDILUtilityFunctions.h
+++ b/src/gallium/drivers/radeon/AMDILUtilityFunctions.h
@@ -7,191 +7,12 @@
 //
 //==-----------------------------------------------------------------------===//
 //
-// This file provides declarations for functions that are used across different
-// classes and provide various conversions or utility to shorten the code
+// This file provides helper macros for expanding case statements.
 //
 //===----------------------------------------------------------------------===//
 #ifndef AMDILUTILITYFUNCTIONS_H_
 #define AMDILUTILITYFUNCTIONS_H_
 
-#include "AMDIL.h"
-#include "AMDILTargetMachine.h"
-#include "llvm/ADT/SmallVector.h"
-
-// Utility functions from ID
-//
-namespace llvm {
-class TargetRegisterClass;
-class SDValue;
-class SDNode;
-class Value;
-class Type;
-class StructType;
-class IntegerType;
-class FunctionType;
-class VectorType;
-class ArrayType;
-class PointerType;
-class OpaqueType;
-class MachineInstr;
-
-}
-enum SrcSwizzles {
-  AMDIL_SRC_SWIZZLE_DEFAULT = 0,
-  AMDIL_SRC_SWIZZLE_X000,
-  AMDIL_SRC_SWIZZLE_0X00,
-  AMDIL_SRC_SWIZZLE_00X0,
-  AMDIL_SRC_SWIZZLE_000X,
-  AMDIL_SRC_SWIZZLE_Y000,
-  AMDIL_SRC_SWIZZLE_0Y00,
-  AMDIL_SRC_SWIZZLE_00Y0,
-  AMDIL_SRC_SWIZZLE_000Y,
-  AMDIL_SRC_SWIZZLE_Z000,
-  AMDIL_SRC_SWIZZLE_0Z00,
-  AMDIL_SRC_SWIZZLE_00Z0,
-  AMDIL_SRC_SWIZZLE_000Z,
-  AMDIL_SRC_SWIZZLE_W000,
-  AMDIL_SRC_SWIZZLE_0W00,
-  AMDIL_SRC_SWIZZLE_00W0,
-  AMDIL_SRC_SWIZZLE_000W,
-  AMDIL_SRC_SWIZZLE_XY00,
-  AMDIL_SRC_SWIZZLE_00XY,
-  AMDIL_SRC_SWIZZLE_ZW00,
-  AMDIL_SRC_SWIZZLE_00ZW,
-  AMDIL_SRC_SWIZZLE_XYZ0,
-  AMDIL_SRC_SWIZZLE_0XYZ,
-  AMDIL_SRC_SWIZZLE_XYZW,
-  AMDIL_SRC_SWIZZLE_0000,
-  AMDIL_SRC_SWIZZLE_XXXX,
-  AMDIL_SRC_SWIZZLE_YYYY,
-  AMDIL_SRC_SWIZZLE_ZZZZ,
-  AMDIL_SRC_SWIZZLE_WWWW,
-  AMDIL_SRC_SWIZZLE_XYXY,
-  AMDIL_SRC_SWIZZLE_ZWZW,
-  AMDIL_SRC_SWIZZLE_XZXZ,
-  AMDIL_SRC_SWIZZLE_YWYW,
-  AMDIL_SRC_SWIZZLE_X0Y0,
-  AMDIL_SRC_SWIZZLE_0X0Y,
-  AMDIL_SRC_SWIZZLE_XY_NEGY,
-  AMDIL_SRC_SWIZZLE_NEGYW,
-  AMDIL_SRC_SWIZZLE_NEGX,
-  AMDIL_SRC_SWIZZLE_XY_NEGXY,
-  AMDIL_SRC_SWIZZLE_NEG_XYZW,
-  AMDIL_SRC_SWIZZLE_0YZW,
-  AMDIL_SRC_SWIZZLE_X0ZW,
-  AMDIL_SRC_SWIZZLE_XY0W,
-  AMDIL_SRC_SWIZZLE_X,
-  AMDIL_SRC_SWIZZLE_Y,
-  AMDIL_SRC_SWIZZLE_Z,
-  AMDIL_SRC_SWIZZLE_W,
-  AMDIL_SRC_SWIZZLE_XY,
-  AMDIL_SRC_SWIZZLE_ZW,
-  AMDIL_SRC_SWIZZLE_LAST
-};
-enum DstSwizzles {
-  AMDIL_DST_SWIZZLE_DEFAULT = 0,
-  AMDIL_DST_SWIZZLE_X___,
-  AMDIL_DST_SWIZZLE_XY__,
-  AMDIL_DST_SWIZZLE_XYZ_,
-  AMDIL_DST_SWIZZLE_XYZW,
-  AMDIL_DST_SWIZZLE__Y__,
-  AMDIL_DST_SWIZZLE__YZ_,
-  AMDIL_DST_SWIZZLE__YZW,
-  AMDIL_DST_SWIZZLE___Z_,
-  AMDIL_DST_SWIZZLE___ZW,
-  AMDIL_DST_SWIZZLE____W,
-  AMDIL_DST_SWIZZLE_X_ZW,
-  AMDIL_DST_SWIZZLE_XY_W,
-  AMDIL_DST_SWIZZLE_X_Z_,
-  AMDIL_DST_SWIZZLE_X__W,
-  AMDIL_DST_SWIZZLE__Y_W,
-  AMDIL_DST_SWIZZLE_LAST
-};
-// Function to get the correct src swizzle string from ID
-const char *getSrcSwizzle(unsigned);
-
-// Function to get the correct dst swizzle string from ID
-const char *getDstSwizzle(unsigned);
-
-const llvm::TargetRegisterClass *getRegClassFromID(unsigned int ID);
-
-unsigned int getMoveInstFromID(unsigned int ID);
-unsigned int getPHIMoveInstFromID(unsigned int ID);
-
-// Utility functions from Type.
-const llvm::TargetRegisterClass *getRegClassFromType(unsigned int type);
-unsigned int getTargetIndependentMoveFromType(unsigned int type);
-
-// Debug functions for SDNode and SDValue.
-void printSDValue(const llvm::SDValue &Op, int level);
-void printSDNode(const llvm::SDNode *N);
-
-// Functions to check if an opcode is a specific type.
-bool isMove(unsigned int opcode);
-bool isPHIMove(unsigned int opcode);
-bool isMoveOrEquivalent(unsigned int opcode);
-
-// Function to check address space
-bool check_type(const llvm::Value *ptr, unsigned int addrspace);
-
-// Group of functions that recursively calculate the size of a structure based
-// on it's sub-types.
-size_t getTypeSize(llvm::Type * const T, bool dereferencePtr = false);
-size_t
-getTypeSize(llvm::StructType * const ST, bool dereferencePtr = false);
-size_t
-getTypeSize(llvm::IntegerType * const IT, bool dereferencePtr = false);
-size_t
-getTypeSize(llvm::FunctionType * const FT, bool dereferencePtr = false);
-size_t
-getTypeSize(llvm::ArrayType * const AT, bool dereferencePtr = false);
-size_t
-getTypeSize(llvm::VectorType * const VT, bool dereferencePtr = false);
-size_t
-getTypeSize(llvm::PointerType * const PT, bool dereferencePtr = false);
-size_t
-getTypeSize(llvm::OpaqueType * const OT, bool dereferencePtr = false);
-
-// Group of functions that recursively calculate the number of elements of a
-// structure based on it's sub-types.
-size_t getNumElements(llvm::Type * const T);
-size_t getNumElements(llvm::StructType * const ST);
-size_t getNumElements(llvm::IntegerType * const IT);
-size_t getNumElements(llvm::FunctionType * const FT);
-size_t getNumElements(llvm::ArrayType * const AT);
-size_t getNumElements(llvm::VectorType * const VT);
-size_t getNumElements(llvm::PointerType * const PT);
-size_t getNumElements(llvm::OpaqueType * const OT);
-const llvm::Value *getBasePointerValue(const llvm::Value *V);
-const llvm::Value *getBasePointerValue(const llvm::MachineInstr *MI);
-
-
-int64_t GET_SCALAR_SIZE(llvm::Type* A);
-
-// Helper functions that check the opcode for status information
-bool isLoadInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
-bool isExtLoadInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
-bool isSWSExtLoadInst(llvm::MachineInstr *MI);
-bool isSExtLoadInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
-bool isZExtLoadInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
-bool isAExtLoadInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
-bool isStoreInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
-bool isTruncStoreInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
-bool isAtomicInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
-bool isVolatileInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
-bool isGlobalInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
-bool isPrivateInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
-bool isConstantInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
-bool isRegionInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
-bool isLocalInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
-bool isImageInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
-bool isAppendInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
-bool isRegionAtomic(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
-bool isLocalAtomic(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
-bool isGlobalAtomic(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
-bool isArenaAtomic(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
-
-
 // Macros that are used to help with switch statements for various data types
 // However, these macro's do not return anything unlike the second set below.
 #define ExpandCaseTo32bitIntTypes(Instr)  \
@@ -354,9 +175,4 @@ case Instr##_v4f32: \
 case Instr##_v2i64: \
 case Instr##_v2f64:
 
-bool commaPrint(int i, llvm::raw_ostream &O);
-/// Helper function to get the currently get/set flags.
-void getAsmPrinterFlags(llvm::MachineInstr *MI, llvm::AMDILAS::InstrResEnc &curRes);
-void setAsmPrinterFlags(llvm::MachineInstr *MI, llvm::AMDILAS::InstrResEnc &curRes);
-
 #endif // AMDILUTILITYFUNCTIONS_H_
diff --git a/src/gallium/drivers/radeon/AMDILVersion.td b/src/gallium/drivers/radeon/AMDILVersion.td
index b8b02608d3b..d863b068131 100644
--- a/src/gallium/drivers/radeon/AMDILVersion.td
+++ b/src/gallium/drivers/radeon/AMDILVersion.td
@@ -1,4 +1,4 @@
-//===-- AMDILVersion.td - TODO: Add brief description -------===//
+//===-- AMDILVersion.td - Barrier Instruction/Intrinsic definitions------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
diff --git a/src/gallium/drivers/radeon/Makefile b/src/gallium/drivers/radeon/Makefile
index 807dc781c7c..cc409645a6e 100644
--- a/src/gallium/drivers/radeon/Makefile
+++ b/src/gallium/drivers/radeon/Makefile
@@ -18,6 +18,8 @@ CXXFLAGS := $(filter-out -DDEBUG, $(CXXFLAGS))
 
 tablegen = $(TBLGEN) -I $(LLVM_INCLUDEDIR) $1 $2 -o $3
 
+HAVE_LLVM_INTRINSICS = $(shell grep IntrinsicsR600.td $(LLVM_INCLUDEDIR)/llvm/Intrinsics.td)
+
 gen: $(GENERATED_SOURCES)
 
 SIRegisterInfo.td: SIGenRegisterInfo.pl
@@ -26,9 +28,13 @@ SIRegisterInfo.td: SIGenRegisterInfo.pl
 SIRegisterGetHWRegNum.inc: SIGenRegisterInfo.pl
 	$(PERL) $^ $@ > /dev/null
 
-R600ShaderPatterns.td: AMDGPUGenShaderPatterns.pl
-	$(PERL) $^ C > $@
-	
+R600Intrinsics.td: R600IntrinsicsNoOpenCL.td R600IntrinsicsOpenCL.td
+ifeq ($(HAVE_LLVM_INTRINSICS),)
+	cp R600IntrinsicsNoOpenCL.td R600Intrinsics.td
+else
+	cp R600IntrinsicsOpenCL.td R600Intrinsics.td
+endif
+
 R600RegisterInfo.td: R600GenRegisterInfo.pl
 	$(PERL) $^ > $@
 
diff --git a/src/gallium/drivers/radeon/Makefile.sources b/src/gallium/drivers/radeon/Makefile.sources
index 7d2932b4dbd..6dc62320f40 100644
--- a/src/gallium/drivers/radeon/Makefile.sources
+++ b/src/gallium/drivers/radeon/Makefile.sources
@@ -1,6 +1,6 @@
 
 GENERATED_SOURCES := \
-	R600ShaderPatterns.td		\
+	R600Intrinsics.td		\
 	R600RegisterInfo.td		\
 	AMDGPUInstrEnums.td		\
 	SIRegisterInfo.td		\
@@ -29,20 +29,16 @@ CPP_SOURCES := \
 	AMDILISelDAGToDAG.cpp		\
 	AMDILISelLowering.cpp		\
 	AMDILMachinePeephole.cpp	\
-	AMDILMCCodeEmitter.cpp		\
 	AMDILNIDevice.cpp		\
 	AMDILPeepholeOptimizer.cpp	\
 	AMDILRegisterInfo.cpp		\
 	AMDILSIDevice.cpp		\
 	AMDILSubtarget.cpp		\
 	AMDILTargetMachine.cpp		\
-	AMDILUtilityFunctions.cpp	\
 	AMDGPUTargetMachine.cpp		\
 	AMDGPUISelLowering.cpp		\
 	AMDGPUConvertToISA.cpp		\
 	AMDGPULowerInstructions.cpp		\
-	AMDGPULowerShaderInstructions.cpp	\
-	AMDGPUReorderPreloadInstructions.cpp	\
 	AMDGPUInstrInfo.cpp		\
 	AMDGPURegisterInfo.cpp		\
 	AMDGPUUtil.cpp			\
@@ -51,13 +47,12 @@ CPP_SOURCES := \
 	R600InstrInfo.cpp		\
 	R600KernelParameters.cpp	\
 	R600LowerInstructions.cpp	\
-	R600LowerShaderInstructions.cpp	\
+	R600MachineFunctionInfo.cpp	\
 	R600RegisterInfo.cpp		\
 	SIAssignInterpRegs.cpp		\
 	SICodeEmitter.cpp		\
 	SIInstrInfo.cpp			\
 	SIISelLowering.cpp		\
-	SILowerShaderInstructions.cpp	\
 	SIMachineFunctionInfo.cpp	\
 	SIPropagateImmReads.cpp		\
 	SIRegisterInfo.cpp		\
diff --git a/src/gallium/drivers/radeon/R600CodeEmitter.cpp b/src/gallium/drivers/radeon/R600CodeEmitter.cpp
index 8faf0deb8c5..421562255f6 100644
--- a/src/gallium/drivers/radeon/R600CodeEmitter.cpp
+++ b/src/gallium/drivers/radeon/R600CodeEmitter.cpp
@@ -1,4 +1,4 @@
-//===-- R600CodeEmitter.cpp - TODO: Add brief description -------===//
+//===-- R600CodeEmitter.cpp - Code Emitter for R600->Cayman GPU families --===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,12 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// This code emitter outputs bytecode that is understood by the r600g driver
+// in the Mesa [1] project.  The bytecode is very similar to the hardware's ISA,
+// except that the sizes of the instruction fields are rounded up to the
+// nearest byte.
+//
+// [1] http://www.mesa3d.org/
 //
 //===----------------------------------------------------------------------===//
 
@@ -44,8 +49,9 @@ namespace {
   const R600RegisterInfo * TRI;
   bool evergreenEncoding;
 
+  bool isCube;
   bool isReduction;
-  unsigned reductionElement;
+  unsigned currentElement;
   bool isLast;
 
   unsigned section_start;
@@ -53,7 +59,7 @@ namespace {
   public:
 
   R600CodeEmitter(formatted_raw_ostream &OS) : MachineFunctionPass(ID),
-      _OS(OS), TM(NULL), evergreenEncoding(false), isReduction(false),
+      _OS(OS), TM(NULL), evergreenEncoding(false), isCube(false), isReduction(false),
       isLast(true) { }
 
   const char *getPassName() const { return "AMDGPU Machine Code Emitter"; }
@@ -65,7 +71,7 @@ namespace {
   private:
 
   void emitALUInstr(MachineInstr  &MI);
-  void emitSrc(const MachineOperand & MO);
+  void emitSrc(const MachineOperand & MO, int chan_override  = -1);
   void emitDst(const MachineOperand & MO);
   void emitALU(MachineInstr &MI, unsigned numSrc);
   void emitTexInstr(MachineInstr &MI);
@@ -155,10 +161,8 @@ bool R600CodeEmitter::runOnMachineFunction(MachineFunction &MF) {
   } else {
     evergreenEncoding = true;
   }
-  const AMDGPUTargetMachine *amdtm =
-    static_cast<const AMDGPUTargetMachine *>(&MF.getTarget());
 
-  if (amdtm->shouldDumpCode()) {
+  if (STM.dumpCode()) {
     MF.dump();
   }
 
@@ -171,18 +175,26 @@ bool R600CodeEmitter::runOnMachineFunction(MachineFunction &MF) {
           if (MI.getNumOperands() > 1 && MI.getOperand(0).isReg() && MI.getOperand(0).isDead()) {
             continue;
           }
-          if (isTexOp(MI.getOpcode())) {
+          if (AMDGPU::isTexOp(MI.getOpcode())) {
             emitTexInstr(MI);
-          } else if (isFCOp(MI.getOpcode())){
+          } else if (AMDGPU::isFCOp(MI.getOpcode())){
             emitFCInstr(MI);
-          } else if (isReductionOp(MI.getOpcode())) {
+          } else if (AMDGPU::isReductionOp(MI.getOpcode())) {
             isReduction = true;
             isLast = false;
-            for (reductionElement = 0; reductionElement < 4; reductionElement++) {
-              isLast = (reductionElement == 3);
+            for (currentElement = 0; currentElement < 4; currentElement++) {
+              isLast = (currentElement == 3);
               emitALUInstr(MI);
             }
             isReduction = false;
+          } else if (AMDGPU::isCubeOp(MI.getOpcode())) {
+              isCube = true;
+              isLast = false;
+              for (currentElement = 0; currentElement < 4; currentElement++) {
+                isLast = (currentElement == 3);
+                emitALUInstr(MI);
+              }
+              isCube = false;
           } else if (MI.getOpcode() == AMDIL::RETURN ||
                      MI.getOpcode() == AMDIL::BUNDLE ||
                      MI.getOpcode() == AMDIL::KILL) {
@@ -191,12 +203,7 @@ bool R600CodeEmitter::runOnMachineFunction(MachineFunction &MF) {
             switch(MI.getOpcode()) {
             case AMDIL::RAT_WRITE_CACHELESS_eg:
               {
-                /* XXX: Support for autoencoding 64-bit instructions was added
-                 * in LLVM 3.1.  Until we drop support for 3.0, we will use Magic
-                 * numbers for the high bits. */
-                  uint64_t high = 0x95c0100000000000;
                   uint64_t inst = getBinaryCodeForInstr(MI);
-                  inst |= high;
                 /* Set End Of Program bit */
                 /* XXX: Need better check of end of program.  EOP should be
                  * encoded in one of the operands of the MI, and it should be
@@ -286,7 +293,7 @@ void R600CodeEmitter::emitALUInstr(MachineInstr &MI)
 
    /* Some instructions are just place holder instructions that represent
     * operations that the GPU does automatically.  They should be ignored. */
-  if (isPlaceHolderOpcode(MI.getOpcode())) {
+  if (AMDGPU::isPlaceHolderOpcode(MI.getOpcode())) {
     return;
   }
 
@@ -309,18 +316,25 @@ void R600CodeEmitter::emitALUInstr(MachineInstr &MI)
   /* Emit instruction type */
   emitByte(0);
 
-  unsigned int opIndex;
-  for (opIndex = 1; opIndex < numOperands; opIndex++) {
-    /* Literal constants are always stored as the last operand. */
-    if (MI.getOperand(opIndex).isImm() || MI.getOperand(opIndex).isFPImm()) {
-      break;
+  if (isCube) {
+    static const int cube_src_swz[] = {2, 2, 0, 1};
+    emitSrc(MI.getOperand(1), cube_src_swz[currentElement]);
+    emitSrc(MI.getOperand(1), cube_src_swz[3-currentElement]);
+    emitNullBytes(SRC_BYTE_COUNT);
+  } else {
+    unsigned int opIndex;
+    for (opIndex = 1; opIndex < numOperands; opIndex++) {
+      /* Literal constants are always stored as the last operand. */
+      if (MI.getOperand(opIndex).isImm() || MI.getOperand(opIndex).isFPImm()) {
+        break;
+      }
+      emitSrc(MI.getOperand(opIndex));
     }
-    emitSrc(MI.getOperand(opIndex));
-  }
 
     /* Emit zeros for unused sources */
-  for ( ; opIndex < 4; opIndex++) {
-    emitNullBytes(SRC_BYTE_COUNT);
+    for ( ; opIndex < 4; opIndex++) {
+      emitNullBytes(SRC_BYTE_COUNT);
+    }
   }
 
   emitDst(dstOp);
@@ -328,7 +342,7 @@ void R600CodeEmitter::emitALUInstr(MachineInstr &MI)
   emitALU(MI, numOperands - 1);
 }
 
-void R600CodeEmitter::emitSrc(const MachineOperand & MO)
+void R600CodeEmitter::emitSrc(const MachineOperand & MO, int chan_override /* = -1 */)
 {
   uint32_t value = 0;
   /* Emit the source select (2 bytes).  For GPRs, this is the register index.
@@ -354,8 +368,10 @@ void R600CodeEmitter::emitSrc(const MachineOperand & MO)
   }
 
   /* Emit the source channel (1 byte) */
-  if (isReduction) {
-    emitByte(reductionElement);
+  if (chan_override != -1) {
+    emitByte(chan_override);
+  } else if (isReduction) {
+    emitByte(currentElement);
   } else if (MO.isReg()) {
     emitByte(TRI->getHWRegChan(MO.getReg()));
   } else {
@@ -397,8 +413,8 @@ void R600CodeEmitter::emitDst(const MachineOperand & MO)
     emitByte(getHWReg(MO.getReg()));
 
     /* Emit the element of the destination register (1 byte)*/
-    if (isReduction) {
-      emitByte(reductionElement);
+    if (isReduction || isCube) {
+      emitByte(currentElement);
     } else {
       emitByte(TRI->getHWRegChan(MO.getReg()));
     }
@@ -411,7 +427,7 @@ void R600CodeEmitter::emitDst(const MachineOperand & MO)
     }
 
     /* Emit writemask (1 byte).  */
-    if ((isReduction && reductionElement != TRI->getHWRegChan(MO.getReg()))
+    if ((isReduction && currentElement != TRI->getHWRegChan(MO.getReg()))
          || MO.getTargetFlags() & MO_FLAG_MASK) {
       emitByte(0);
     } else {
@@ -570,6 +586,7 @@ void R600CodeEmitter::emitFCInstr(MachineInstr &MI)
   case AMDIL::BREAK_LOGICALZ_f32:
     instr = FC_BREAK;
     break;
+  case AMDIL::BREAK_LOGICALNZ_f32:
   case AMDIL::BREAK_LOGICALNZ_i32:
     instr = FC_BREAK_NZ_INT;
     break;
@@ -577,6 +594,7 @@ void R600CodeEmitter::emitFCInstr(MachineInstr &MI)
     instr = FC_BREAK_Z_INT;
     break;
   case AMDIL::CONTINUE_LOGICALNZ_f32:
+  case AMDIL::CONTINUE_LOGICALNZ_i32:
     instr = FC_CONTINUE;
     break;
   /* XXX: This assumes that all IFs will be if (x != 0).  If we add
@@ -706,44 +724,5 @@ RegElement maskBitToElement(unsigned int maskBit)
   }
 }
 
-unsigned int dstSwizzleToWriteMask(unsigned swizzle)
-{
-  switch(swizzle) {
-  default:
-  case AMDIL_DST_SWIZZLE_DEFAULT:
-    return WRITE_MASK_X | WRITE_MASK_Y | WRITE_MASK_Z | WRITE_MASK_W;
-  case AMDIL_DST_SWIZZLE_X___:
-    return WRITE_MASK_X;
-  case AMDIL_DST_SWIZZLE_XY__:
-    return WRITE_MASK_X | WRITE_MASK_Y;
-  case AMDIL_DST_SWIZZLE_XYZ_:
-    return WRITE_MASK_X | WRITE_MASK_Y | WRITE_MASK_Z;
-  case AMDIL_DST_SWIZZLE_XYZW:
-    return WRITE_MASK_X | WRITE_MASK_Y | WRITE_MASK_Z | WRITE_MASK_W;
-  case AMDIL_DST_SWIZZLE__Y__:
-    return WRITE_MASK_Y;
-  case AMDIL_DST_SWIZZLE__YZ_:
-    return WRITE_MASK_Y | WRITE_MASK_Z;
-  case AMDIL_DST_SWIZZLE__YZW:
-    return WRITE_MASK_Y | WRITE_MASK_Z | WRITE_MASK_W;
-  case AMDIL_DST_SWIZZLE___Z_:
-    return WRITE_MASK_Z;
-  case AMDIL_DST_SWIZZLE___ZW:
-    return WRITE_MASK_Z | WRITE_MASK_W;
-  case AMDIL_DST_SWIZZLE____W:
-    return WRITE_MASK_W;
-  case AMDIL_DST_SWIZZLE_X_ZW:
-    return WRITE_MASK_X | WRITE_MASK_Z | WRITE_MASK_W;
-  case AMDIL_DST_SWIZZLE_XY_W:
-    return WRITE_MASK_X | WRITE_MASK_Y | WRITE_MASK_W;
-  case AMDIL_DST_SWIZZLE_X_Z_:
-    return WRITE_MASK_X | WRITE_MASK_Z;
-  case AMDIL_DST_SWIZZLE_X__W:
-    return WRITE_MASK_X | WRITE_MASK_W;
-  case AMDIL_DST_SWIZZLE__Y_W:
-    return WRITE_MASK_Y | WRITE_MASK_W;
-  }
-}
-
 #include "AMDILGenCodeEmitter.inc"
 
diff --git a/src/gallium/drivers/radeon/R600GenRegisterInfo.pl b/src/gallium/drivers/radeon/R600GenRegisterInfo.pl
index cbded115766..406f3dfdd39 100644
--- a/src/gallium/drivers/radeon/R600GenRegisterInfo.pl
+++ b/src/gallium/drivers/radeon/R600GenRegisterInfo.pl
@@ -1,20 +1,23 @@
-#===-- R600GenRegisterInfo.pl - TODO: Add brief description -------===#
+#===-- R600GenRegisterInfo.pl - Script for generating register info files --===#
 #
 #                     The LLVM Compiler Infrastructure
 #
 # This file is distributed under the University of Illinois Open Source
 # License. See LICENSE.TXT for details.
 #
-#===----------------------------------------------------------------------===#
+#===------------------------------------------------------------------------===#
 #
-# TODO: Add full description
+# This Perl script prints to stdout the .td code used as R600RegisterInfo.td.
+# It also generates a file called R600HwRegInfo.include, which contains helper
+# functions for determining the hw encoding of registers.
 #
-#===----------------------------------------------------------------------===#
+#===------------------------------------------------------------------------===#
 
 use strict;
 use warnings;
 
-use AMDGPUConstants;
+use constant CONST_REG_COUNT => 256;
+use constant TEMP_REG_COUNT => 128;
 
 my $CREG_MAX = CONST_REG_COUNT - 1;
 my $TREG_MAX = TEMP_REG_COUNT - 1;
@@ -81,7 +84,7 @@ def R600_Reg32 : RegisterClass <"AMDIL", [f32, i32], 32, (add
     R600_CReg32,
     ZERO, HALF, ONE, ONE_INT, PV_X, ALU_LITERAL_X, NEG_ONE, NEG_HALF)>;
 
-def R600_Reg128 : RegisterClass<"AMDIL", [v4f32], 128, (add
+def R600_Reg128 : RegisterClass<"AMDIL", [v4f32, v4i32], 128, (add
     $t128_string)>
 {
   let SubRegClasses = [(R600_TReg32 sel_x, sel_y, sel_z, sel_w)];
@@ -170,3 +173,24 @@ sub print_reg_defs {
   return @reg_list;
 }
 
+#Helper functions
+sub get_hw_index {
+  # Integer quotient of the given index by 4; int() drops the fraction.
+  return int($_[0] / 4);
+}
+
+sub get_chan_str {
+  # Return the channel letter for $index: the index modulo 4 selects one
+  # of 'X', 'Y', 'Z', 'W', in that order.
+  my ($index) = @_;
+  my @chan_names = ('X', 'Y', 'Z', 'W');
+  my $chan = $index % 4;
+
+  # Defensive check: abort the generator if the computed channel has no
+  # entry in the table.
+  if (!defined($chan_names[$chan])) {
+    die("Unknown chan value: $chan");
+  }
+
+  return $chan_names[$chan];
+}
diff --git a/src/gallium/drivers/radeon/R600ISelLowering.cpp b/src/gallium/drivers/radeon/R600ISelLowering.cpp
index f92fe2641a5..e85ac31b34c 100644
--- a/src/gallium/drivers/radeon/R600ISelLowering.cpp
+++ b/src/gallium/drivers/radeon/R600ISelLowering.cpp
@@ -1,4 +1,4 @@
-//===-- R600ISelLowering.cpp - TODO: Add brief description -------===//
+//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,12 +7,14 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// Most of the DAG lowering is handled in AMDILISelLowering.cpp.  This file
+// is mostly EmitInstrWithCustomInserter().
 //
 //===----------------------------------------------------------------------===//
 
 #include "R600ISelLowering.h"
 #include "R600InstrInfo.h"
+#include "R600MachineFunctionInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 
 using namespace llvm;
@@ -25,9 +27,13 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
 //  setSchedulingPreference(Sched::VLIW);
   addRegisterClass(MVT::v4f32, &AMDIL::R600_Reg128RegClass);
   addRegisterClass(MVT::f32, &AMDIL::R600_Reg32RegClass);
+  addRegisterClass(MVT::v4i32, &AMDIL::R600_Reg128RegClass);
+  addRegisterClass(MVT::i32, &AMDIL::R600_Reg32RegClass);
 
   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Legal);
   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Legal);
+  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Legal);
+  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Legal);
 }
 
 MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
@@ -35,10 +41,10 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
 {
   MachineFunction * MF = BB->getParent();
   MachineRegisterInfo &MRI = MF->getRegInfo();
+  MachineBasicBlock::iterator I = *MI;
 
   switch (MI->getOpcode()) {
   default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
-  /* XXX: Use helper function from AMDGPULowerShaderInstructions here */
   case AMDIL::TGID_X:
     addLiveIn(MI, MF, MRI, TII, AMDIL::T1_X);
     break;
@@ -84,7 +90,49 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
   case AMDIL::LOCAL_SIZE_Z:
     lowerImplicitParameter(MI, *BB, MRI, 8);
     break;
+
+  case AMDIL::R600_LOAD_CONST:
+    {
+      int64_t RegIndex = MI->getOperand(1).getImm();
+      unsigned ConstantReg = AMDIL::R600_CReg32RegClass.getRegister(RegIndex);
+      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDIL::COPY))
+                  .addOperand(MI->getOperand(0))
+                  .addReg(ConstantReg);
+      break;
+    }
+
+  case AMDIL::LOAD_INPUT:
+    {
+      int64_t RegIndex = MI->getOperand(1).getImm();
+      addLiveIn(MI, MF, MRI, TII,
+                AMDIL::R600_TReg32RegClass.getRegister(RegIndex));
+      break;
+    }
+  case AMDIL::STORE_OUTPUT:
+    {
+      int64_t OutputIndex = MI->getOperand(1).getImm();
+      unsigned OutputReg = AMDIL::R600_TReg32RegClass.getRegister(OutputIndex);
+
+      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDIL::COPY), OutputReg)
+                  .addOperand(MI->getOperand(0));
+
+      if (!MRI.isLiveOut(OutputReg)) {
+        MRI.addLiveOut(OutputReg);
+      }
+      break;
+    }
+
+  case AMDIL::RESERVE_REG:
+    {
+      R600MachineFunctionInfo * MFI = MF->getInfo<R600MachineFunctionInfo>();
+      int64_t ReservedIndex = MI->getOperand(0).getImm();
+      unsigned ReservedReg =
+                          AMDIL::R600_TReg32RegClass.getRegister(ReservedIndex);
+      MFI->ReservedRegs.push_back(ReservedReg);
+      break;
+    }
   }
+
   MI->eraseFromParent();
   return BB;
 }
diff --git a/src/gallium/drivers/radeon/R600ISelLowering.h b/src/gallium/drivers/radeon/R600ISelLowering.h
index fd26bf538c4..fdd552a172d 100644
--- a/src/gallium/drivers/radeon/R600ISelLowering.h
+++ b/src/gallium/drivers/radeon/R600ISelLowering.h
@@ -1,4 +1,4 @@
-//===-- R600ISelLowering.h - TODO: Add brief description -------===//
+//===-- R600ISelLowering.h - R600 DAG Lowering Interface -*- C++ -*--------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// R600 DAG Lowering interface definition
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/src/gallium/drivers/radeon/R600InstrFormats.td b/src/gallium/drivers/radeon/R600InstrFormats.td
deleted file mode 100644
index 0890eb64509..00000000000
--- a/src/gallium/drivers/radeon/R600InstrFormats.td
+++ /dev/null
@@ -1,16 +0,0 @@
-//===-- R600InstrFormats.td - TODO: Add brief description -------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// TODO: Add full description
-//
-//===----------------------------------------------------------------------===//
-
-
-class ALUInst <bits<10> op, dag outs, dag ins, string asm, list<dag> pattern>
-  : InstR600 <, outs, ins , asm, pattern>
diff --git a/src/gallium/drivers/radeon/R600InstrInfo.cpp b/src/gallium/drivers/radeon/R600InstrInfo.cpp
index 0c7ffc4334d..2bd59fd5e1b 100644
--- a/src/gallium/drivers/radeon/R600InstrInfo.cpp
+++ b/src/gallium/drivers/radeon/R600InstrInfo.cpp
@@ -1,4 +1,4 @@
-//===-- R600InstrInfo.cpp - TODO: Add brief description -------===//
+//===-- R600InstrInfo.cpp - R600 Instruction Information ------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// R600 Implementation of TargetInstrInfo.
 //
 //===----------------------------------------------------------------------===//
 
@@ -73,10 +73,22 @@ unsigned R600InstrInfo::getISAOpcode(unsigned opcode) const
     case AMDIL::MOVE_i32:
       return AMDIL::MOV;
     case AMDIL::SHR_i32:
+      return getASHRop();
+    case AMDIL::USHR_i32:
       return getLSHRop();
   }
 }
 
+unsigned R600InstrInfo::getASHRop() const
+{
+  // Select the ASHR opcode for the current device: generations before
+  // HD5XXX get ASHR_r600, every other generation gets ASHR_eg.
+  const unsigned generation =
+      TM.getSubtarget<AMDILSubtarget>().device()->getGeneration();
+  const bool usesR600Opcode = generation < AMDILDeviceInfo::HD5XXX;
+  return usesR600Opcode ? AMDIL::ASHR_r600 : AMDIL::ASHR_eg;
+}
+
 unsigned R600InstrInfo::getLSHRop() const
 {
   unsigned gen = TM.getSubtarget<AMDILSubtarget>().device()->getGeneration();
diff --git a/src/gallium/drivers/radeon/R600InstrInfo.h b/src/gallium/drivers/radeon/R600InstrInfo.h
index aedaa9f47f3..014eeb0b9f7 100644
--- a/src/gallium/drivers/radeon/R600InstrInfo.h
+++ b/src/gallium/drivers/radeon/R600InstrInfo.h
@@ -1,4 +1,4 @@
-//===-- R600InstrInfo.h - TODO: Add brief description -------===//
+//===-- R600InstrInfo.h - R600 Instruction Info Interface -------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// Interface definition for R600InstrInfo
 //
 //===----------------------------------------------------------------------===//
 
@@ -52,6 +52,7 @@ namespace llvm {
   bool isTrig(const MachineInstr &MI) const;
 
   unsigned getLSHRop() const;
+  unsigned getASHRop() const;
   unsigned getMULHI_UINT() const;
   unsigned getMULLO_UINT() const;
   unsigned getRECIP_UINT() const;
diff --git a/src/gallium/drivers/radeon/R600Instructions.td b/src/gallium/drivers/radeon/R600Instructions.td
index 02043fdeea5..a18240f09bd 100644
--- a/src/gallium/drivers/radeon/R600Instructions.td
+++ b/src/gallium/drivers/radeon/R600Instructions.td
@@ -1,4 +1,4 @@
-//===-- R600Instructions.td - TODO: Add brief description -------===//
+//===-- R600Instructions.td - R600 Instruction defs  -------*- tablegen -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// R600 Tablegen instruction definitions
 //
 //===----------------------------------------------------------------------===//
 
@@ -84,7 +84,7 @@ class R600_3OP <bits<32> inst, string opName, list<dag> pattern,
   InstR600 <inst,
           (outs R600_Reg32:$dst),
           (ins R600_Reg32:$src0, R600_Reg32:$src1, R600_Reg32:$src2, variable_ops),
-          !strconcat(opName, "$dst $src0, $src1, $src2"),
+          !strconcat(opName, " $dst, $src0, $src1, $src2"),
           pattern,
           itin>{
 
@@ -92,7 +92,7 @@ class R600_3OP <bits<32> inst, string opName, list<dag> pattern,
   }
 
 class R600_REDUCTION <bits<32> inst, dag ins, string asm, list<dag> pattern,
-                      InstrItinClass itin = AnyALU> :
+                      InstrItinClass itin = VecALU> :
   InstR600 <inst,
           (outs R600_Reg32:$dst),
           ins,
@@ -152,8 +152,6 @@ class EG_CF_RAT <bits <8> cf_inst, bits <6> rat_inst, dag outs, dag ins,
   let Inst{31-30} = ELEM_SIZE;
 
   /* CF_ALLOC_EXPORT_WORD1_BUF */
-/* XXX: We can't have auto encoding of 64-bit instructions until LLVM 3.1 :( */
-/*
   let Inst{43-32} = ARRAY_SIZE;
   let Inst{47-44} = COMP_MASK;
   let Inst{51-48} = BURST_COUNT;
@@ -162,7 +160,6 @@ class EG_CF_RAT <bits <8> cf_inst, bits <6> rat_inst, dag outs, dag ins,
   let Inst{61-54} = cf_inst;
   let Inst{62}    = MARK;
   let Inst{63}    = BARRIER;
-*/
 }
 
 /*
@@ -311,6 +308,18 @@ def TRUNC : R600_1OP <
   [(set R600_Reg32:$dst, (int_AMDGPU_trunc R600_Reg32:$src))]
 >;
 
+def CEIL : R600_1OP < // Ceiling rounds toward +infinity, not -infinity.
+  0x12, "CEIL",
+  [(set R600_Reg32:$dst, (int_AMDIL_round_posinf R600_Reg32:$src))]> {
+  let AMDILOp = AMDILInst.ROUND_POSINF_f32;
+}
+
+def RNDNE : R600_1OP <
+  0x13, "RNDNE",
+  [(set R600_Reg32:$dst, (int_AMDIL_round_nearest R600_Reg32:$src))]> {
+  let AMDILOp = AMDILInst.ROUND_NEAREST_f32;
+}
+
 def FLOOR : R600_1OP <
   0x14, "FLOOR",
   [(set R600_Reg32:$dst, (int_AMDGPU_floor R600_Reg32:$src))]
@@ -329,64 +338,114 @@ def AND_INT : R600_2OP <
   let AMDILOp = AMDILInst.AND_i32;
 }
 
+def OR_INT : R600_2OP <
+  0x31, "OR_INT",
+  []>{
+  let AMDILOp = AMDILInst.BINARY_OR_i32;
+}
+
 def XOR_INT : R600_2OP <
   0x32, "XOR_INT",
   []
 >;
 
+def NOT_INT : R600_1OP <
+  0x33, "NOT_INT",
+  []>{
+  let AMDILOp = AMDILInst.BINARY_NOT_i32;
+}
+
 def ADD_INT : R600_2OP <
-  0x34, "ADD_INT $dst, $src0, $src1",
+  0x34, "ADD_INT",
   []>{
   let AMDILOp = AMDILInst.ADD_i32;
 }
 
 def SUB_INT : R600_2OP <
-	0x35, "SUB_INT $dst, $src0, $src1",
+	0x35, "SUB_INT",
 	[]
 >;
 
+def MAX_INT : R600_2OP <
+  0x36, "MAX_INT",
+  [(set R600_Reg32:$dst, (int_AMDGPU_imax R600_Reg32:$src0, R600_Reg32:$src1))]>;
+
+def MIN_INT : R600_2OP <
+  0x37, "MIN_INT",
+  [(set R600_Reg32:$dst, (int_AMDGPU_imin R600_Reg32:$src0, R600_Reg32:$src1))]>;
+
+def MAX_UINT : R600_2OP <
+  0x38, "MAX_UINT",
+  [(set R600_Reg32:$dst, (int_AMDGPU_umax R600_Reg32:$src0, R600_Reg32:$src1))]>;
+
+def MIN_UINT : R600_2OP <
+  0x39, "MIN_UINT",
+  [(set R600_Reg32:$dst, (int_AMDGPU_umin R600_Reg32:$src0, R600_Reg32:$src1))]>;
+
+
 def SETE_INT : R600_2OP <
-  0x3A, "SETE_INT $dst, $src0, $src1",
+  0x3A, "SETE_INT",
   []>{
   let AMDILOp = AMDILInst.IEQ;
 }
 
 def SETGT_INT : R600_2OP <
-  0x3B, "SGT_INT $dst, $src0, $src1",
+  0x3B, "SGT_INT",
   []
 >;
 
 def SETGE_INT : R600_2OP <
-	0x3C, "SETGE_INT $dst, $src0, $src1",
+	0x3C, "SETGE_INT",
 	[]>{
   let AMDILOp = AMDILInst.IGE;
 }
 
 def SETNE_INT : R600_2OP <
-  0x3D, "SETNE_INT $dst, $src0, $src1",
+  0x3D, "SETNE_INT",
   []>{
   let AMDILOp = AMDILInst.INE;
 }
 
 def SETGT_UINT : R600_2OP <
-  0x3E, "SETGT_UINT $dst, $src0, $src1",
+  0x3E, "SETGT_UINT",
   []>{
   let AMDILOp = AMDILInst.UGT;
 }
 
 def SETGE_UINT : R600_2OP <
-  0x3F, "SETGE_UINT $dst, $src0, $src1",
+  0x3F, "SETGE_UINT",
   []>{
   let AMDILOp = AMDILInst.UGE;
 }
 
 def CNDE_INT : R600_3OP <
-	0x1C, "CNDE_INT $dst, $src0, $src1, $src2",
+	0x1C, "CNDE_INT",
 	[]
 >;
 
 /* Texture instructions */
 
+
+def TEX_LD : R600_TEX <
+  0x03, "TEX_LD",
+  [(set R600_Reg128:$dst, (int_AMDGPU_txf R600_Reg128:$src0, imm:$src1, imm:$src2))]
+>;
+
+def TEX_GET_TEXTURE_RESINFO : R600_TEX <
+  0x04, "TEX_GET_TEXTURE_RESINFO",
+  [(set R600_Reg128:$dst, (int_AMDGPU_txq R600_Reg128:$src0, imm:$src1, imm:$src2))]
+>;
+
+def TEX_GET_GRADIENTS_H : R600_TEX <
+  0x07, "TEX_GET_GRADIENTS_H",
+  [(set R600_Reg128:$dst, (int_AMDGPU_ddx R600_Reg128:$src0, imm:$src1, imm:$src2))]
+>;
+
+def TEX_GET_GRADIENTS_V : R600_TEX <
+  0x08, "TEX_GET_GRADIENTS_V",
+  [(set R600_Reg128:$dst, (int_AMDGPU_ddy R600_Reg128:$src0, imm:$src1, imm:$src2))]
+>;
+
 def TEX_SAMPLE : R600_TEX <
   0x10, "TEX_SAMPLE",
   [(set R600_Reg128:$dst, (int_AMDGPU_tex R600_Reg128:$src0, imm:$src1, imm:$src2))]
@@ -434,6 +493,11 @@ def KILP : Pat <
   (MASK_WRITE (KILLGT (f32 ONE), (f32 ZERO)))
 >;
 
+def KIL : Pat <
+  (int_AMDGPU_kill R600_Reg32:$src0),
+  (MASK_WRITE (KILLGT (f32 ZERO), (f32 R600_Reg32:$src0)))
+>;
+
 /* Helper classes for common instructions */
 
 class MUL_LIT_Common <bits<32> inst> : R600_3OP <
@@ -470,6 +534,15 @@ class DOT4_Common <bits<32> inst> : R600_REDUCTION <
   [(set R600_Reg32:$dst, (int_AMDGPU_dp4 R600_Reg128:$src0, R600_Reg128:$src1))]
 >;
 
+class CUBE_Common <bits<32> inst> : InstR600 <
+  inst,
+  (outs R600_Reg128:$dst),
+  (ins R600_Reg128:$src),
+  "CUBE $dst, $src", // operands are comma-separated in the asm string
+  [(set R600_Reg128:$dst, (int_AMDGPU_cube R600_Reg128:$src))],
+  VecALU
+>;
+
 class EXP_IEEE_Common <bits<32> inst> : R600_1OP <
   inst, "EXP_IEEE",
   []> {
@@ -509,6 +582,12 @@ class LSHR_Common <bits<32> inst> : R600_2OP <
   let AMDILOp = AMDILInst.USHR_i32;
 }
 
+class ASHR_Common <bits<32> inst> : R600_2OP <
+  inst, "ASHR", // bare mnemonic, as in ADD_INT and the other R600_2OP defs
+  [] >{
+  let AMDILOp = AMDILInst.SHR_i32;
+}
+
 class MULHI_INT_Common <bits<32> inst> : R600_2OP <
   inst, "MULHI_INT $dst, $src0, $src1",
   [] >{
@@ -608,6 +687,7 @@ let Gen = AMDGPUGen.R600 in {
   def CNDGT_r600 : CNDGT_Common<0x19>;
   def CNDGE_r600 : CNDGE_Common<0x1A>;
   def DOT4_r600 : DOT4_Common<0x50>;
+  def CUBE_r600 : CUBE_Common<0x52>;
   def EXP_IEEE_r600 : EXP_IEEE_Common<0x61>;
   def LOG_CLAMPED_r600 : LOG_CLAMPED_Common<0x62>;
   def LOG_IEEE_r600 : LOG_IEEE_Common<0x63>;
@@ -619,6 +699,7 @@ let Gen = AMDGPUGen.R600 in {
   def INT_TO_FLT_r600 : INT_TO_FLT_Common<0x6c>;
   def SIN_r600 : SIN_Common<0x6E>;
   def COS_r600 : COS_Common<0x6F>;
+  def ASHR_r600 : ASHR_Common<0x70>;
   def LSHR_r600 : LSHR_Common<0x71>;
   def LSHL_r600 : LSHL_Common<0x72>;
   def MULLO_INT_r600 : MULLO_INT_Common<0x73>;
@@ -661,20 +742,12 @@ def RAT_WRITE_CACHELESS_eg :
     EG_CF_RAT <0x57, 0x2, (outs), (ins R600_TReg32_X:$rw_gpr,
                                    R600_TReg32_X:$index_gpr, i32imm:$rat_id), "">
 {
-/*
-  let Inst{3-0}   = RAT_ID;
-  let Inst{21-15} = RW_GPR;
-  let Inst{29-23} = INDEX_GPR;
-  /* Propery of the UAV */
-  let Inst{31-30} = ELEM_SIZE;
-*/
   let RIM         = 0;
   /* XXX: Have a separate instruction for non-indexed writes. */
   let TYPE        = 1;
   let RW_REL      = 0;
   let ELEM_SIZE   = 0;
 
-/*
   let ARRAY_SIZE  = 0;
   let COMP_MASK   = 1;
   let BURST_COUNT = 0;
@@ -682,7 +755,6 @@ def RAT_WRITE_CACHELESS_eg :
   let EOP         = 0;
   let MARK        = 0;
   let BARRIER     = 1;
-*/
 }
 
 def VTX_READ_eg : InstR600ISA < (outs R600_TReg32_X:$dst),
@@ -789,6 +861,7 @@ class TRIG_eg <InstR600 trig, Intrinsic intr> : Pat<
 let Gen = AMDGPUGen.EG_CAYMAN in {
 
   def MULADD_eg : MULADD_Common<0x14>;
+  def ASHR_eg : ASHR_Common<0x15>;
   def LSHR_eg : LSHR_Common<0x16>;
   def LSHL_eg : LSHL_Common<0x17>;
   def CNDE_eg : CNDE_Common<0x19>;
@@ -812,6 +885,7 @@ let Gen = AMDGPUGen.EG_CAYMAN in {
   def RECIP_UINT_eg : RECIP_UINT_Common<0x94>;
   def INT_TO_FLT_eg : INT_TO_FLT_Common<0x9B>;
   def DOT4_eg : DOT4_Common<0xBE>;
+  def CUBE_eg : CUBE_Common<0xC0>;
 
 } // End AMDGPUGen.EG_CAYMAN
 
@@ -905,6 +979,34 @@ def LOCAL_SIZE_Y : R600PreloadInst <"LOCAL_SIZE_Y",
 def LOCAL_SIZE_Z : R600PreloadInst <"LOCAL_SIZE_Z",
                                     int_r600_read_local_size_z>;
 
+def R600_LOAD_CONST : AMDGPUShaderInst <
+  (outs R600_Reg32:$dst),
+  (ins i32imm:$src0),
+  "R600_LOAD_CONST $dst, $src0",
+  [(set R600_Reg32:$dst, (int_AMDGPU_load_const imm:$src0))]
+>;
+
+def LOAD_INPUT : AMDGPUShaderInst <
+  (outs R600_Reg32:$dst),
+  (ins i32imm:$src),
+  "LOAD_INPUT $dst, $src",
+  [(set R600_Reg32:$dst, (int_R600_load_input imm:$src))]
+>;
+
+def RESERVE_REG : AMDGPUShaderInst <
+  (outs),
+  (ins i32imm:$src),
+  "RESERVE_REG $src",
+  [(int_AMDGPU_reserve_reg imm:$src)]
+>;
+
+def STORE_OUTPUT: AMDGPUShaderInst <
+  (outs),
+  (ins R600_Reg32:$src0, i32imm:$src1),
+  "STORE_OUTPUT $src0, $src1",
+  [(int_AMDGPU_store_output R600_Reg32:$src0, imm:$src1)]
+>;
+
 } // End usesCustomInserter = 1, isPseudo = 1
 
 } // End isCodeGenOnly = 1
@@ -933,15 +1035,14 @@ def : Insert_Element <f32, v4f32, R600_Reg32, R600_Reg128, 5, sel_y>;
 def : Insert_Element <f32, v4f32, R600_Reg32, R600_Reg128, 6, sel_z>;
 def : Insert_Element <f32, v4f32, R600_Reg32, R600_Reg128, 7, sel_w>;
 
+def : Extract_Element <i32, v4i32, R600_Reg128, 0, sel_x>;
+def : Extract_Element <i32, v4i32, R600_Reg128, 1, sel_y>;
+def : Extract_Element <i32, v4i32, R600_Reg128, 2, sel_z>;
+def : Extract_Element <i32, v4i32, R600_Reg128, 3, sel_w>;
 
-include "R600ShaderPatterns.td"
-
-// We need this pattern to avoid having real registers in PHI nodes.
-// For some reason this pattern only works when it comes after the other
-// instruction defs.
-def : Pat <
-  (int_R600_load_input imm:$src),
-  (LOAD_INPUT imm:$src)
->;
+def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 4, sel_x>;
+def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 5, sel_y>;
+def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 6, sel_z>;
+def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 7, sel_w>;
 
 } // End isR600toCayman Predicate
diff --git a/src/gallium/drivers/radeon/R600Intrinsics.td b/src/gallium/drivers/radeon/R600Intrinsics.td
deleted file mode 100644
index 8038fee1a3c..00000000000
--- a/src/gallium/drivers/radeon/R600Intrinsics.td
+++ /dev/null
@@ -1,40 +0,0 @@
-//===-- R600Intrinsics.td - TODO: Add brief description -------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// TODO: Add full description
-//
-//===----------------------------------------------------------------------===//
-
-let TargetPrefix = "R600", isTarget = 1 in {
-  def int_R600_load_input : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrReadWriteArgMem]>;
-}
-
-let TargetPrefix = "r600", isTarget = 1 in {
-
-class R600ReadPreloadRegisterIntrinsic<string name>
-  : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
-    GCCBuiltin<name>;
-
-multiclass R600ReadPreloadRegisterIntrinsic_xyz<string prefix> {
-  def _x : R600ReadPreloadRegisterIntrinsic<!strconcat(prefix, "_x")>;
-  def _y : R600ReadPreloadRegisterIntrinsic<!strconcat(prefix, "_y")>;
-  def _z : R600ReadPreloadRegisterIntrinsic<!strconcat(prefix, "_z")>;
-}
-
-defm int_r600_read_global_size : R600ReadPreloadRegisterIntrinsic_xyz <
-                                       "__builtin_r600_read_global_size">;
-defm int_r600_read_local_size : R600ReadPreloadRegisterIntrinsic_xyz <
-                                       "__builtin_r600_read_local_size">;
-defm int_r600_read_ngroups : R600ReadPreloadRegisterIntrinsic_xyz <
-                                       "__builtin_r600_read_ngroups">;
-defm int_r600_read_tgid : R600ReadPreloadRegisterIntrinsic_xyz <
-                                       "__builtin_r600_read_tgid">;
-defm int_r600_read_tidig : R600ReadPreloadRegisterIntrinsic_xyz <
-                                       "__builtin_r600_read_tidig">;
-} // End TargetPrefix = "r600"
diff --git a/src/gallium/drivers/radeon/R600IntrinsicsNoOpenCL.td b/src/gallium/drivers/radeon/R600IntrinsicsNoOpenCL.td
new file mode 100644
index 00000000000..73ef4aae234
--- /dev/null
+++ b/src/gallium/drivers/radeon/R600IntrinsicsNoOpenCL.td
@@ -0,0 +1,40 @@
+//===-- R600Intrinsics.td - R600 Intrinsic defs --------*- tablegen -*-----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// R600 Intrinsic Definitions
+//
+//===----------------------------------------------------------------------===//
+
+let TargetPrefix = "R600", isTarget = 1 in {
+  def int_R600_load_input : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrReadWriteArgMem]>;
+}
+
+let TargetPrefix = "r600", isTarget = 1 in {
+
+class R600ReadPreloadRegisterIntrinsic<string name>
+  : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
+    GCCBuiltin<name>;
+
+multiclass R600ReadPreloadRegisterIntrinsic_xyz<string prefix> {
+  def _x : R600ReadPreloadRegisterIntrinsic<!strconcat(prefix, "_x")>;
+  def _y : R600ReadPreloadRegisterIntrinsic<!strconcat(prefix, "_y")>;
+  def _z : R600ReadPreloadRegisterIntrinsic<!strconcat(prefix, "_z")>;
+}
+
+defm int_r600_read_global_size : R600ReadPreloadRegisterIntrinsic_xyz <
+                                       "__builtin_r600_read_global_size">;
+defm int_r600_read_local_size : R600ReadPreloadRegisterIntrinsic_xyz <
+                                       "__builtin_r600_read_local_size">;
+defm int_r600_read_ngroups : R600ReadPreloadRegisterIntrinsic_xyz <
+                                       "__builtin_r600_read_ngroups">;
+defm int_r600_read_tgid : R600ReadPreloadRegisterIntrinsic_xyz <
+                                       "__builtin_r600_read_tgid">;
+defm int_r600_read_tidig : R600ReadPreloadRegisterIntrinsic_xyz <
+                                       "__builtin_r600_read_tidig">;
+} // End TargetPrefix = "r600"
diff --git a/src/gallium/drivers/radeon/R600IntrinsicsOpenCL.td b/src/gallium/drivers/radeon/R600IntrinsicsOpenCL.td
new file mode 100644
index 00000000000..cd761358475
--- /dev/null
+++ b/src/gallium/drivers/radeon/R600IntrinsicsOpenCL.td
@@ -0,0 +1,16 @@
+//===-- R600Intrinsics.td - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO: Add full description
+//
+//===----------------------------------------------------------------------===//
+
+let TargetPrefix = "R600", isTarget = 1 in {
+  def int_R600_load_input : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrReadWriteArgMem]>;
+}
diff --git a/src/gallium/drivers/radeon/R600KernelParameters.cpp b/src/gallium/drivers/radeon/R600KernelParameters.cpp
index 3fdf48a2bf2..53bfebc7364 100644
--- a/src/gallium/drivers/radeon/R600KernelParameters.cpp
+++ b/src/gallium/drivers/radeon/R600KernelParameters.cpp
@@ -1,4 +1,4 @@
-//===-- R600KernelParameters.cpp - TODO: Add brief description -------===//
+//===-- R600KernelParameters.cpp - Lower kernel function arguments --------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,89 +7,83 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// This pass lowers kernel function arguments to loads from the vertex buffer.
+//
+// Kernel arguments are stored in the vertex buffer at an offset of 9 dwords,
+// so arg0 needs to be loaded from VTX_BUFFER[9] and arg1 is loaded from
+// VTX_BUFFER[10], etc.
 //
 //===----------------------------------------------------------------------===//
 
-#include <llvm-c/Core.h>
-#include "R600KernelParameters.h"
-#include "R600OpenCLUtils.h"
+#include "AMDGPU.h"
+#include "AMDIL.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/Constants.h"
+#include "llvm/Function.h"
 #include "llvm/Intrinsics.h"
+#include "llvm/Metadata.h"
+#include "llvm/Module.h"
+#include "llvm/Target/TargetData.h"
 #include "llvm/Support/IRBuilder.h"
 #include "llvm/Support/TypeBuilder.h"
-// #include "llvm/CodeGen/Function.h"
-
-namespace AMDILAS {
-enum AddressSpaces {
-  PRIVATE_ADDRESS  = 0, // Address space for private memory.
-  GLOBAL_ADDRESS   = 1, // Address space for global memory (RAT0, VTX0).
-  CONSTANT_ADDRESS = 2, // Address space for constant memory.
-  LOCAL_ADDRESS    = 3, // Address space for local memory.
-  REGION_ADDRESS   = 4, // Address space for region memory.
-  ADDRESS_NONE     = 5, // Address space for unknown memory.
-  PARAM_D_ADDRESS  = 6, // Address space for direct addressible parameter memory (CONST0)
-  PARAM_I_ADDRESS  = 7, // Address space for indirect addressible parameter memory (VTX1)
-  LAST_ADDRESS     = 8
-};
-}
-
 
 #include <map>
 #include <set>
 
 using namespace llvm;
-using namespace std;
+
+namespace {
 
 #define CONSTANT_CACHE_SIZE_DW 127
 
-class R600KernelParameters : public llvm::FunctionPass
+class R600KernelParameters : public FunctionPass
 {
-  const llvm::TargetData * TD;
+  const TargetData * TD;
   LLVMContext* Context;
   Module *mod;
-  
+
   struct param
   {
-    param() : val(NULL), ptr_val(NULL), offset_in_dw(0), size_in_dw(0), indirect(false), specialID(0) {}
-    
-    llvm::Value* val;
-    llvm::Value* ptr_val;
+    param() : val(NULL), ptr_val(NULL), offset_in_dw(0), size_in_dw(0),
+              indirect(false), specialID(0) {}
+
+    Value* val;
+    Value* ptr_val;
     int offset_in_dw;
     int size_in_dw;
 
     bool indirect;
-    
-    string specialType;
+
+    std::string specialType;
     int specialID;
-    
+
     int end() { return offset_in_dw + size_in_dw; }
-    /* The first 9 dwords are reserved for the grid sizes. */
+    // The first 9 dwords are reserved for the grid sizes.
     int get_rat_offset() { return 9 + offset_in_dw; }
   };
 
   std::vector<param> params;
 
-  int getLastSpecialID(const string& TypeName);
-  
+  bool isOpenCLKernel(const Function* fun);
+  int getLastSpecialID(const std::string& TypeName);
+
   int getListSize();
-  void AddParam(llvm::Argument* arg);
-  int calculateArgumentSize(llvm::Argument* arg);
-  void RunAna(llvm::Function* fun);
-  void Replace(llvm::Function* fun);
-  bool isIndirect(Value* val, set<Value*>& visited);
-  void Propagate(llvm::Function* fun);
-  void Propagate(llvm::Value* v, const llvm::Twine& name, bool indirect = false);
+  void AddParam(Argument* arg);
+  int calculateArgumentSize(Argument* arg);
+  void RunAna(Function* fun);
+  void Replace(Function* fun);
+  bool isIndirect(Value* val, std::set<Value*>& visited);
+  void Propagate(Function* fun);
+  void Propagate(Value* v, const Twine& name, bool indirect = false);
   Value* ConstantRead(Function* fun, param& p);
   Value* handleSpecial(Function* fun, param& p);
   bool isSpecialType(Type*);
-  string getSpecialTypeName(Type*);
+  std::string getSpecialTypeName(Type*);
 public:
   static char ID;
   R600KernelParameters() : FunctionPass(ID) {};
-  R600KernelParameters(const llvm::TargetData* TD) : FunctionPass(ID), TD(TD) {}
-//   bool runOnFunction (llvm::Function &F);
-  bool runOnFunction (llvm::Function &F);
+  R600KernelParameters(const TargetData* TD) : FunctionPass(ID), TD(TD) {}
+  bool runOnFunction (Function &F);
   void getAnalysisUsage(AnalysisUsage &AU) const;
   const char *getPassName() const;
   bool doInitialization(Module &M);
@@ -98,13 +92,42 @@ public:
 
 char R600KernelParameters::ID = 0;
 
-static RegisterPass<R600KernelParameters> X("kerparam", "OpenCL Kernel Parameter conversion", false, false);
+static RegisterPass<R600KernelParameters> X("kerparam",
+                            "OpenCL Kernel Parameter conversion", false, false);
 
-int R600KernelParameters::getLastSpecialID(const string& TypeName)
+bool R600KernelParameters::isOpenCLKernel(const Function* fun)
+{
+  Module *mod = const_cast<Function*>(fun)->getParent();
+  NamedMDNode * md = mod->getOrInsertNamedMetadata("opencl.kernels");
+
+  if (!md or !md->getNumOperands())
+  {
+    return false;
+  }
+
+  for (int i = 0; i < int(md->getNumOperands()); i++)
+  {
+    if (!md->getOperand(i) or !md->getOperand(i)->getOperand(0))
+    {
+      continue;
+    }
+    
+    assert(md->getOperand(i)->getNumOperands() == 1);
+
+    if (md->getOperand(i)->getOperand(0)->getName() == fun->getName())
+    {
+      return true;
+    }
+  }
+
+  return false;
+}
+
+int R600KernelParameters::getLastSpecialID(const std::string& TypeName)
 {
   int lastID = -1;
-  
-  for (vector<param>::iterator i = params.begin(); i != params.end(); i++)
+
+  for (std::vector<param>::iterator i = params.begin(); i != params.end(); i++)
   {
     if (i->specialType == TypeName)
     {
@@ -125,7 +148,7 @@ int R600KernelParameters::getListSize()
   return params.back().end();
 }
 
-bool R600KernelParameters::isIndirect(Value* val, set<Value*>& visited)
+bool R600KernelParameters::isIndirect(Value* val, std::set<Value*>& visited)
 {
   if (isa<LoadInst>(val))
   {
@@ -144,7 +167,7 @@ bool R600KernelParameters::isIndirect(Value* val, set<Value*>& visited)
   }
 
   visited.insert(val);
-  
+
   if (isa<GetElementPtrInst>(val))
   {
     GetElementPtrInst* GEP = dyn_cast<GetElementPtrInst>(val);
@@ -158,7 +181,7 @@ bool R600KernelParameters::isIndirect(Value* val, set<Value*>& visited)
       }
     }
   }
-  
+
   for (Value::use_iterator i = val->use_begin(); i != val->use_end(); i++)
   {
     Value* v2 = dyn_cast<Value>(*i);
@@ -175,24 +198,24 @@ bool R600KernelParameters::isIndirect(Value* val, set<Value*>& visited)
   return false;
 }
 
-void R600KernelParameters::AddParam(llvm::Argument* arg)
+void R600KernelParameters::AddParam(Argument* arg)
 {
   param p;
-  
+
   p.val = dyn_cast<Value>(arg);
   p.offset_in_dw = getListSize();
   p.size_in_dw = calculateArgumentSize(arg);
 
   if (isa<PointerType>(arg->getType()) and arg->hasByValAttr())
   {
-    set<Value*> visited;
+    std::set<Value*> visited;
     p.indirect = isIndirect(p.val, visited);
   }
-  
+
   params.push_back(p);
 }
 
-int R600KernelParameters::calculateArgumentSize(llvm::Argument* arg)
+int R600KernelParameters::calculateArgumentSize(Argument* arg)
 {
   Type* t = arg->getType();
 
@@ -200,16 +223,16 @@ int R600KernelParameters::calculateArgumentSize(llvm::Argument* arg)
   {
     t = dyn_cast<PointerType>(t)->getElementType();
   }
-  
+
   int store_size_in_dw = (TD->getTypeStoreSize(t) + 3)/4;
 
   assert(store_size_in_dw);
-  
+
   return store_size_in_dw;
 }
 
 
-void R600KernelParameters::RunAna(llvm::Function* fun)
+void R600KernelParameters::RunAna(Function* fun)
 {
   assert(isOpenCLKernel(fun));
 
@@ -220,7 +243,7 @@ void R600KernelParameters::RunAna(llvm::Function* fun)
 
 }
 
-void R600KernelParameters::Replace(llvm::Function* fun)
+void R600KernelParameters::Replace(Function* fun)
 {
   for (std::vector<param>::iterator i = params.begin(); i != params.end(); i++)
   {
@@ -237,11 +260,11 @@ void R600KernelParameters::Replace(llvm::Function* fun)
     if (new_val)
     {
       i->val->replaceAllUsesWith(new_val);
-    }   
+    }
   }
 }
 
-void R600KernelParameters::Propagate(llvm::Function* fun)
+void R600KernelParameters::Propagate(Function* fun)
 {
   for (std::vector<param>::iterator i = params.begin(); i != params.end(); i++)
   {
@@ -256,8 +279,8 @@ void R600KernelParameters::Propagate(Value* v, const Twine& name, bool indirect)
 {
   LoadInst* load = dyn_cast<LoadInst>(v);
   GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(v);
-  
-  unsigned addrspace; 
+
+  unsigned addrspace;
 
   if (indirect)
   {
@@ -274,49 +297,54 @@ void R600KernelParameters::Propagate(Value* v, const Twine& name, bool indirect)
 
     if (dyn_cast<PointerType>(op->getType())->getAddressSpace() != addrspace)
     {
-      op = new BitCastInst(op, PointerType::get(dyn_cast<PointerType>(op->getType())->getElementType(), addrspace), name, dyn_cast<Instruction>(v));
+      op = new BitCastInst(op, PointerType::get(dyn_cast<PointerType>(
+                           op->getType())->getElementType(), addrspace),
+                           name, dyn_cast<Instruction>(v));
     }
 
-    vector<Value*> params(GEP->idx_begin(), GEP->idx_end());
-    
-    GetElementPtrInst* GEP2 = GetElementPtrInst::Create(op, params, name, dyn_cast<Instruction>(v));
+    std::vector<Value*> params(GEP->idx_begin(), GEP->idx_end());
+
+    GetElementPtrInst* GEP2 = GetElementPtrInst::Create(op, params, name,
+                                                      dyn_cast<Instruction>(v));
     GEP2->setIsInBounds(GEP->isInBounds());
     v = dyn_cast<Value>(GEP2);
     GEP->replaceAllUsesWith(GEP2);
     GEP->eraseFromParent();
     load = NULL;
   }
-  
+
   if (load)
   {
-    if (load->getPointerAddressSpace() != addrspace) ///normally at this point we have the right address space
+    ///normally at this point we have the right address space
+    if (load->getPointerAddressSpace() != addrspace)
     {
       Value *orig_ptr = load->getPointerOperand();
       PointerType *orig_ptr_type = dyn_cast<PointerType>(orig_ptr->getType());
-      
-      Type* new_ptr_type = PointerType::get(orig_ptr_type->getElementType(), addrspace);
+
+      Type* new_ptr_type = PointerType::get(orig_ptr_type->getElementType(),
+                                            addrspace);
 
       Value* new_ptr = orig_ptr;
-      
+
       if (orig_ptr->getType() != new_ptr_type)
       {
         new_ptr = new BitCastInst(orig_ptr, new_ptr_type, "prop_cast", load);
       }
-      
+
       Value* new_load = new LoadInst(new_ptr, name, load);
       load->replaceAllUsesWith(new_load);
       load->eraseFromParent();
     }
-    
+
     return;
   }
 
-  vector<User*> users(v->use_begin(), v->use_end());
-  
+  std::vector<User*> users(v->use_begin(), v->use_end());
+
   for (int i = 0; i < int(users.size()); i++)
   {
     Value* v2 = dyn_cast<Value>(users[i]);
-    
+
     if (v2)
     {
       Propagate(v2, name, indirect);
@@ -327,7 +355,7 @@ void R600KernelParameters::Propagate(Value* v, const Twine& name, bool indirect)
 Value* R600KernelParameters::ConstantRead(Function* fun, param& p)
 {
   assert(fun->front().begin() != fun->front().end());
-  
+
   Instruction *first_inst = fun->front().begin();
   IRBuilder <> builder (first_inst);
 /* First 3 dwords are reserved for the dimmension info */
@@ -346,43 +374,54 @@ Value* R600KernelParameters::ConstantRead(Function* fun, param& p)
   {
     addrspace = AMDILAS::PARAM_D_ADDRESS;
   }
-  
+
   Argument *arg = dyn_cast<Argument>(p.val);
   Type * argType = p.val->getType();
   PointerType * argPtrType = dyn_cast<PointerType>(p.val->getType());
-  
+
   if (argPtrType and arg->hasByValAttr())
   {
-    Value* param_addr_space_ptr = ConstantPointerNull::get(PointerType::get(Type::getInt32Ty(*Context), addrspace));
-    Value* param_ptr = GetElementPtrInst::Create(param_addr_space_ptr, ConstantInt::get(Type::getInt32Ty(*Context), p.get_rat_offset()), arg->getName(), first_inst);
-    param_ptr = new BitCastInst(param_ptr, PointerType::get(argPtrType->getElementType(), addrspace), arg->getName(), first_inst);
+    Value* param_addr_space_ptr = ConstantPointerNull::get(
+                                    PointerType::get(Type::getInt32Ty(*Context),
+                                    addrspace));
+    Value* param_ptr = GetElementPtrInst::Create(param_addr_space_ptr,
+                                    ConstantInt::get(Type::getInt32Ty(*Context),
+                                    p.get_rat_offset()), arg->getName(),
+                                    first_inst);
+    param_ptr = new BitCastInst(param_ptr,
+                                PointerType::get(argPtrType->getElementType(),
+                                                 addrspace),
+                                arg->getName(), first_inst);
     p.ptr_val = param_ptr;
     return param_ptr;
   }
   else
   {
-    Value* param_addr_space_ptr = ConstantPointerNull::get(PointerType::get(argType, addrspace));
-    
+    Value* param_addr_space_ptr = ConstantPointerNull::get(PointerType::get(
+                                                        argType, addrspace));
+
     Value* param_ptr = builder.CreateGEP(param_addr_space_ptr,
-             ConstantInt::get(Type::getInt32Ty(*Context), p.get_rat_offset()), arg->getName());
-    
+             ConstantInt::get(Type::getInt32Ty(*Context), p.get_rat_offset()),
+                              arg->getName());
+
     Value* param_value = builder.CreateLoad(param_ptr, arg->getName());
-    
+
     return param_value;
   }
 }
 
 Value* R600KernelParameters::handleSpecial(Function* fun, param& p)
 {
-  string name = getSpecialTypeName(p.val->getType());
+  std::string name = getSpecialTypeName(p.val->getType());
   int ID;
 
   assert(!name.empty());
-  
+
   if (name == "image2d_t" or name == "image3d_t")
   {
-    int lastID = max(getLastSpecialID("image2d_t"), getLastSpecialID("image3d_t"));
-    
+    int lastID = std::max(getLastSpecialID("image2d_t"),
+                     getLastSpecialID("image3d_t"));
+
     if (lastID == -1)
     {
       ID = 2; ///ID0 and ID1 are used internally by the driver
@@ -403,20 +442,22 @@ Value* R600KernelParameters::handleSpecial(Function* fun, param& p)
     else
     {
       ID = lastID + 1;
-    }    
+    }
   }
   else
   {
     ///TODO: give some error message
     return NULL;
   }
-    
+
   p.specialType = name;
   p.specialID = ID;
 
   Instruction *first_inst = fun->front().begin();
 
-  return new IntToPtrInst(ConstantInt::get(Type::getInt32Ty(*Context), p.specialID), p.val->getType(), "resourceID", first_inst);
+  return new IntToPtrInst(ConstantInt::get(Type::getInt32Ty(*Context),
+                                           p.specialID), p.val->getType(),
+                                           "resourceID", first_inst);
 }
 
 
@@ -425,7 +466,7 @@ bool R600KernelParameters::isSpecialType(Type* t)
   return !getSpecialTypeName(t).empty();
 }
 
-string R600KernelParameters::getSpecialTypeName(Type* t)
+std::string R600KernelParameters::getSpecialTypeName(Type* t)
 {
   PointerType *pt = dyn_cast<PointerType>(t);
   StructType *st = NULL;
@@ -437,9 +478,9 @@ string R600KernelParameters::getSpecialTypeName(Type* t)
 
   if (st)
   {
-    string prefix = "struct.opencl_builtin_type_";
-    
-    string name = st->getName().str();
+    std::string prefix = "struct.opencl_builtin_type_";
+
+    std::string name = st->getName().str();
 
     if (name.substr(0, prefix.length()) == prefix)
     {
@@ -458,19 +499,15 @@ bool R600KernelParameters::runOnFunction (Function &F)
     return false;
   }
 
-//  F.dump();
-  
   RunAna(&F);
   Replace(&F);
   Propagate(&F);
-  
-   mod->dump();
+
   return false;
 }
 
 void R600KernelParameters::getAnalysisUsage(AnalysisUsage &AU) const
 {
-//   AU.addRequired<FunctionAnalysis>();
   FunctionPass::getAnalysisUsage(AU);
   AU.setPreservesAll();
 }
@@ -484,7 +521,7 @@ bool R600KernelParameters::doInitialization(Module &M)
 {
   Context = &M.getContext();
   mod = &M;
-  
+
   return false;
 }
 
@@ -493,10 +530,12 @@ bool R600KernelParameters::doFinalization(Module &M)
   return false;
 }
 
-llvm::FunctionPass* createR600KernelParametersPass(const llvm::TargetData* TD)
+} // End anonymous namespace
+
+FunctionPass* llvm::createR600KernelParametersPass(const TargetData* TD)
 {
   FunctionPass *p = new R600KernelParameters(TD);
-  
+
   return p;
 }
 
diff --git a/src/gallium/drivers/radeon/R600KernelParameters.h b/src/gallium/drivers/radeon/R600KernelParameters.h
deleted file mode 100644
index 904a469a5f0..00000000000
--- a/src/gallium/drivers/radeon/R600KernelParameters.h
+++ /dev/null
@@ -1,28 +0,0 @@
-//===-- R600KernelParameters.h - TODO: Add brief description -------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// TODO: Add full description
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef KERNELPARAMETERS_H
-#define KERNELPARAMETERS_H
-
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/Function.h"
-#include "llvm/Pass.h"
-#include "llvm/Target/TargetData.h"
-#include "llvm/Value.h"
-
-#include <vector>
-
-llvm::FunctionPass* createR600KernelParametersPass(const llvm::TargetData* TD);
-
-
-#endif
diff --git a/src/gallium/drivers/radeon/R600LowerInstructions.cpp b/src/gallium/drivers/radeon/R600LowerInstructions.cpp
index fb5431d0eef..dca1fe195cc 100644
--- a/src/gallium/drivers/radeon/R600LowerInstructions.cpp
+++ b/src/gallium/drivers/radeon/R600LowerInstructions.cpp
@@ -1,4 +1,4 @@
-//===-- R600LowerInstructions.cpp - TODO: Add brief description -------===//
+//===-- R600LowerInstructions.cpp - Lower unsupported AMDIL instructions --===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// This pass lowers AMDIL MachineInstrs that aren't supported by the R600
+// target to either supported AMDIL MachineInstrs or R600 MachineInstrs.
 //
 //===----------------------------------------------------------------------===//
 
@@ -93,8 +94,8 @@ bool R600LowerInstructionsPass::runOnMachineFunction(MachineFunction &MF)
                            &AMDIL::R600_TReg32RegClass);
           BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::SETGE_INT),
                   setgt)
-                  .addOperand(MI.getOperand(1))
-                  .addReg(AMDIL::ZERO);
+                  .addReg(AMDIL::ZERO)
+                  .addOperand(MI.getOperand(1));
 
           unsigned add_int = MRI->createVirtualRegister(
                              &AMDIL::R600_TReg32RegClass);
@@ -311,7 +312,8 @@ bool R600LowerInstructionsPass::runOnMachineFunction(MachineFunction &MF)
         MachineInstr * defInstr = MRI->getVRegDef(maskedRegister);
         MachineOperand * def = defInstr->findRegisterDefOperand(maskedRegister);
         def->addTargetFlag(MO_FLAG_MASK);
-        break;
+        /* Continue so the instruction is not erased */
+        continue;
       }
 
       case AMDIL::NEGATE_i32:
@@ -342,6 +344,13 @@ bool R600LowerInstructionsPass::runOnMachineFunction(MachineFunction &MF)
           break;
         }
 
+      case AMDIL::ULT:
+        BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::SETGT_UINT))
+                .addOperand(MI.getOperand(0))
+                .addOperand(MI.getOperand(2))
+                .addOperand(MI.getOperand(1));
+        break;
+
       default:
         continue;
       }
diff --git a/src/gallium/drivers/radeon/R600LowerShaderInstructions.cpp b/src/gallium/drivers/radeon/R600LowerShaderInstructions.cpp
deleted file mode 100644
index 394ee7006ce..00000000000
--- a/src/gallium/drivers/radeon/R600LowerShaderInstructions.cpp
+++ /dev/null
@@ -1,143 +0,0 @@
-//===-- R600LowerShaderInstructions.cpp - TODO: Add brief description -------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// TODO: Add full description
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "AMDGPULowerShaderInstructions.h"
-#include "AMDIL.h"
-#include "AMDILInstrInfo.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-
-using namespace llvm;
-
-namespace {
-  class R600LowerShaderInstructionsPass : public MachineFunctionPass,
-        public AMDGPULowerShaderInstructionsPass {
-
-  private:
-    static char ID;
-    TargetMachine &TM;
-
-    void lowerEXPORT_REG_FAKE(MachineInstr &MI, MachineBasicBlock &MBB,
-        MachineBasicBlock::iterator I);
-    void lowerLOAD_INPUT(MachineInstr & MI);
-    bool lowerSTORE_OUTPUT(MachineInstr & MI, MachineBasicBlock &MBB,
-        MachineBasicBlock::iterator I);
-
-  public:
-    R600LowerShaderInstructionsPass(TargetMachine &tm) :
-      MachineFunctionPass(ID), TM(tm) { }
-
-      bool runOnMachineFunction(MachineFunction &MF);
-
-      const char *getPassName() const { return "R600 Lower Shader Instructions"; }
-    };
-} /* End anonymous namespace */
-
-char R600LowerShaderInstructionsPass::ID = 0;
-
-FunctionPass *llvm::createR600LowerShaderInstructionsPass(TargetMachine &tm) {
-    return new R600LowerShaderInstructionsPass(tm);
-}
-
-#define INSTR_CASE_FLOAT_V(inst) \
-  case AMDIL:: inst##_v4f32: \
-
-#define INSTR_CASE_FLOAT_S(inst) \
-  case AMDIL:: inst##_f32:
-
-#define INSTR_CASE_FLOAT(inst) \
-  INSTR_CASE_FLOAT_V(inst) \
-  INSTR_CASE_FLOAT_S(inst)
-bool R600LowerShaderInstructionsPass::runOnMachineFunction(MachineFunction &MF)
-{
-  MRI = &MF.getRegInfo();
-
-
-  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
-                                                  BB != BB_E; ++BB) {
-    MachineBasicBlock &MBB = *BB;
-    for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end();) {
-      MachineInstr &MI = *I;
-      bool deleteInstr = false;
-      switch (MI.getOpcode()) {
-
-      default: break;
-
-      case AMDIL::RESERVE_REG:
-      case AMDIL::EXPORT_REG:
-        deleteInstr = true;
-        break;
-
-      case AMDIL::LOAD_INPUT:
-        lowerLOAD_INPUT(MI);
-        deleteInstr = true;
-        break;
-
-      case AMDIL::STORE_OUTPUT:
-        deleteInstr = lowerSTORE_OUTPUT(MI, MBB, I);
-        break;
-
-      }
-
-      ++I;
-
-      if (deleteInstr) {
-        MI.eraseFromParent();
-      }
-    }
-  }
-
-  return false;
-}
-
-/* The goal of this function is to replace the virutal destination register of
- * a LOAD_INPUT instruction with the correct physical register that will.
- *
- * XXX: I don't think this is the right way things assign physical registers,
- * but I'm not sure of another way to do this.
- */
-void R600LowerShaderInstructionsPass::lowerLOAD_INPUT(MachineInstr &MI)
-{
-  MachineOperand &dst = MI.getOperand(0);
-  MachineOperand &arg = MI.getOperand(1);
-  int64_t inputIndex = arg.getImm();
-  const TargetRegisterClass * inputClass = TM.getRegisterInfo()->getRegClass(AMDIL::R600_TReg32RegClassID);
-  unsigned newRegister = inputClass->getRegister(inputIndex);
-  unsigned dstReg = dst.getReg();
-
-  preloadRegister(MI.getParent()->getParent(), TM.getInstrInfo(), newRegister,
-                  dstReg);
-}
-
-bool R600LowerShaderInstructionsPass::lowerSTORE_OUTPUT(MachineInstr &MI,
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator I)
-{
-  MachineOperand &valueOp = MI.getOperand(1);
-  MachineOperand &indexOp = MI.getOperand(2);
-  unsigned valueReg = valueOp.getReg();
-  int64_t outputIndex = indexOp.getImm();
-  const TargetRegisterClass * outputClass = TM.getRegisterInfo()->getRegClass(AMDIL::R600_TReg32RegClassID);
-  unsigned newRegister = outputClass->getRegister(outputIndex);
-
-  BuildMI(MBB, I, MBB.findDebugLoc(I), TM.getInstrInfo()->get(AMDIL::COPY),
-                  newRegister)
-                  .addReg(valueReg);
-
-  if (!MRI->isLiveOut(newRegister))
-    MRI->addLiveOut(newRegister);
-
-  return true;
-
-}
diff --git a/src/gallium/drivers/radeon/R600MachineFunctionInfo.cpp b/src/gallium/drivers/radeon/R600MachineFunctionInfo.cpp
new file mode 100644
index 00000000000..48443fb57d8
--- /dev/null
+++ b/src/gallium/drivers/radeon/R600MachineFunctionInfo.cpp
@@ -0,0 +1,16 @@
+//===-- R600MachineFunctionInfo.cpp - R600 Machine Function Info-*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "R600MachineFunctionInfo.h"
+
+using namespace llvm;
+
+R600MachineFunctionInfo::R600MachineFunctionInfo(const MachineFunction &MF)
+  : MachineFunctionInfo()
+  { }
diff --git a/src/gallium/drivers/radeon/R600MachineFunctionInfo.h b/src/gallium/drivers/radeon/R600MachineFunctionInfo.h
new file mode 100644
index 00000000000..948e1924272
--- /dev/null
+++ b/src/gallium/drivers/radeon/R600MachineFunctionInfo.h
@@ -0,0 +1,33 @@
+//===-- R600MachineFunctionInfo.h - R600 Machine Function Info ----*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// R600MachineFunctionInfo is used for keeping track of which registers have
+// been reserved by the llvm.AMDGPU.reserve.reg intrinsic.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef R600MACHINEFUNCTIONINFO_H
+#define R600MACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+#include <vector>
+
+namespace llvm {
+
+class R600MachineFunctionInfo : public MachineFunctionInfo {
+
+public:
+  R600MachineFunctionInfo(const MachineFunction &MF);
+  std::vector<unsigned> ReservedRegs;
+
+};
+
+} // End llvm namespace
+
+#endif //R600MACHINEFUNCTIONINFO_H
diff --git a/src/gallium/drivers/radeon/R600OpenCLUtils.h b/src/gallium/drivers/radeon/R600OpenCLUtils.h
deleted file mode 100644
index 91e41d63d0d..00000000000
--- a/src/gallium/drivers/radeon/R600OpenCLUtils.h
+++ /dev/null
@@ -1,49 +0,0 @@
-//===-- OpenCLUtils.h - TODO: Add brief description -------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// TODO: Add full description
-//
-//===----------------------------------------------------------------------===//
-#ifndef OPENCLUTILS_H
-#define OPENCLUTILS_H
-
-#include "llvm/Function.h"
-
-#include <llvm/Module.h>
-
-static bool isOpenCLKernel(const llvm::Function* fun)
-{
-  llvm::Module *mod = const_cast<llvm::Function*>(fun)->getParent();
-  llvm::NamedMDNode * md = mod->getOrInsertNamedMetadata("opencl.kernels");
-
-  if (!md or !md->getNumOperands())
-  {
-    return false;
-  }
-
-  for (int i = 0; i < int(md->getNumOperands()); i++)
-  {
-    if (!md->getOperand(i) or !md->getOperand(i)->getOperand(0))
-    {
-      continue;
-    }
-    
-    assert(md->getOperand(i)->getNumOperands() == 1);
-
-    if (md->getOperand(i)->getOperand(0)->getName() == fun->getName())
-    {
-      return true;
-    }
-  }
-
-  return false;
-}
-
-
-#endif
diff --git a/src/gallium/drivers/radeon/R600RegisterInfo.cpp b/src/gallium/drivers/radeon/R600RegisterInfo.cpp
index 96507b104cf..de559bd2dfa 100644
--- a/src/gallium/drivers/radeon/R600RegisterInfo.cpp
+++ b/src/gallium/drivers/radeon/R600RegisterInfo.cpp
@@ -1,4 +1,4 @@
-//===-- R600RegisterInfo.cpp - TODO: Add brief description -------===//
+//===-- R600RegisterInfo.cpp - R600 Register Information ------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,12 +7,13 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// The file contains the R600 implementation of the TargetRegisterInfo class.
 //
 //===----------------------------------------------------------------------===//
 
 #include "R600RegisterInfo.h"
 #include "AMDGPUTargetMachine.h"
+#include "R600MachineFunctionInfo.h"
 
 using namespace llvm;
 
@@ -26,6 +27,8 @@ R600RegisterInfo::R600RegisterInfo(AMDGPUTargetMachine &tm,
 BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const
 {
   BitVector Reserved(getNumRegs());
+  const R600MachineFunctionInfo * MFI = MF.getInfo<R600MachineFunctionInfo>();
+
   Reserved.set(AMDIL::ZERO);
   Reserved.set(AMDIL::HALF);
   Reserved.set(AMDIL::ONE);
@@ -40,19 +43,11 @@ BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const
     Reserved.set(*I);
   }
 
-  for (MachineFunction::const_iterator BB = MF.begin(),
-                                 BB_E = MF.end(); BB != BB_E; ++BB) {
-    const MachineBasicBlock &MBB = *BB;
-    for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
-                                                                  I != E; ++I) {
-      const MachineInstr &MI = *I;
-      if (MI.getOpcode() == AMDIL::RESERVE_REG) {
-        if (!TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg())) {
-          Reserved.set(MI.getOperand(0).getReg());
-        }
-      }
-    }
+  for (std::vector<unsigned>::const_iterator I = MFI->ReservedRegs.begin(),
+                                    E = MFI->ReservedRegs.end(); I != E; ++I) {
+    Reserved.set(*I);
   }
+
   return Reserved;
 }
 
diff --git a/src/gallium/drivers/radeon/R600RegisterInfo.h b/src/gallium/drivers/radeon/R600RegisterInfo.h
index 95a44f971a0..89a11f9333b 100644
--- a/src/gallium/drivers/radeon/R600RegisterInfo.h
+++ b/src/gallium/drivers/radeon/R600RegisterInfo.h
@@ -1,4 +1,4 @@
-//===-- R600RegisterInfo.h - TODO: Add brief description -------===//
+//===-- R600RegisterInfo.h - R600 Register Info Interface ------*- C++ -*--===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// Interface definition for R600RegisterInfo
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/src/gallium/drivers/radeon/R600Schedule.td b/src/gallium/drivers/radeon/R600Schedule.td
index c6b1ca61bb5..d1957903d87 100644
--- a/src/gallium/drivers/radeon/R600Schedule.td
+++ b/src/gallium/drivers/radeon/R600Schedule.td
@@ -1,4 +1,4 @@
-//===-- R600Schedule.td - TODO: Add brief description -------===//
+//===-- R600Schedule.td - R600 Scheduling definitions ------*- tablegen -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,9 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// R600 has a VLIW architecture.  On pre-cayman cards there are 5 instruction
+// slots ALU.X, ALU.Y, ALU.Z, ALU.W, and TRANS.  For cayman cards, the TRANS
+// slot has been removed. 
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/src/gallium/drivers/radeon/SIAssignInterpRegs.cpp b/src/gallium/drivers/radeon/SIAssignInterpRegs.cpp
index b0bdf701a74..1ef097f7b1e 100644
--- a/src/gallium/drivers/radeon/SIAssignInterpRegs.cpp
+++ b/src/gallium/drivers/radeon/SIAssignInterpRegs.cpp
@@ -1,4 +1,4 @@
-//===-- SIAssignInterpRegs.cpp - TODO: Add brief description -------===//
+//===-- SIAssignInterpRegs.cpp - Assign interpolation registers -----------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,12 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// This pass maps the pseudo interpolation registers to the correct physical
+// registers.  Prior to executing a fragment shader, the GPU loads interpolation
+// parameters into physical registers.  The specific physical register that each
+// interpolation parameter ends up in depends on the type of the interpolation
+// parameter as well as how many interpolation parameters are used by the
+// shader.
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/src/gallium/drivers/radeon/SICodeEmitter.cpp b/src/gallium/drivers/radeon/SICodeEmitter.cpp
index ad494fae7c6..6970d9f0875 100644
--- a/src/gallium/drivers/radeon/SICodeEmitter.cpp
+++ b/src/gallium/drivers/radeon/SICodeEmitter.cpp
@@ -1,4 +1,4 @@
-//===-- SICodeEmitter.cpp - TODO: Add brief description -------===//
+//===-- SICodeEmitter.cpp - SI Code Emitter -------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// The SI code emitter produces machine code that can be executed directly on
+// the GPU device.
 //
 //===----------------------------------------------------------------------===//
 
@@ -144,8 +145,6 @@ bool SICodeEmitter::runOnMachineFunction(MachineFunction &MF)
 {
   MF.dump();
   TM = &MF.getTarget();
-  const AMDGPUInstrInfo * TII =
-                        static_cast<const AMDGPUInstrInfo*>(TM->getInstrInfo());
 
   emitState(MF);
 
@@ -155,8 +154,7 @@ bool SICodeEmitter::runOnMachineFunction(MachineFunction &MF)
     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
                                                       I != E; ++I) {
       MachineInstr &MI = *I;
-      if (!TII->isRegPreload(MI) && MI.getOpcode() != AMDIL::KILL
-          && MI.getOpcode() != AMDIL::RETURN) {
+      if (MI.getOpcode() != AMDIL::KILL && MI.getOpcode() != AMDIL::RETURN) {
         emitInstr(MI);
       }
     }
diff --git a/src/gallium/drivers/radeon/SIGenRegisterInfo.pl b/src/gallium/drivers/radeon/SIGenRegisterInfo.pl
index 644daa1bc22..bb5ebbd67e6 100644
--- a/src/gallium/drivers/radeon/SIGenRegisterInfo.pl
+++ b/src/gallium/drivers/radeon/SIGenRegisterInfo.pl
@@ -1,16 +1,17 @@
-#===-- SIGenRegisterInfo.pl - TODO: Add brief description -------===#
+#===-- SIGenRegisterInfo.pl - Script for generating register info files ----===#
 #
 #                     The LLVM Compiler Infrastructure
 #
 # This file is distributed under the University of Illinois Open Source
 # License. See LICENSE.TXT for details.
 #
-#===----------------------------------------------------------------------===#
+#===------------------------------------------------------------------------===#
 #
-# TODO: Add full description
+# This perl script prints to stdout .td code to be used as SIRegisterInfo.td.
+# It also generates a file called SIHwRegInfo.include, which contains helper
+# functions for determining the hw encoding of registers.
 #
-#===----------------------------------------------------------------------===#
-
+#===------------------------------------------------------------------------===#
 
 use strict;
 use warnings;
diff --git a/src/gallium/drivers/radeon/SIISelLowering.cpp b/src/gallium/drivers/radeon/SIISelLowering.cpp
index 1a4b47ecbf5..441a4a07290 100644
--- a/src/gallium/drivers/radeon/SIISelLowering.cpp
+++ b/src/gallium/drivers/radeon/SIISelLowering.cpp
@@ -1,4 +1,4 @@
-//===-- SIISelLowering.cpp - TODO: Add brief description -------===//
+//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// Most of the DAG lowering is handled in AMDILISelLowering.cpp.  This file is
+// mostly EmitInstrWithCustomInserter().
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/src/gallium/drivers/radeon/SIISelLowering.h b/src/gallium/drivers/radeon/SIISelLowering.h
index e7a79f8e215..229e682ef51 100644
--- a/src/gallium/drivers/radeon/SIISelLowering.h
+++ b/src/gallium/drivers/radeon/SIISelLowering.h
@@ -1,4 +1,4 @@
-//===-- SIISelLowering.h - TODO: Add brief description -------===//
+//===-- SIISelLowering.h - SI DAG Lowering Interface ------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// SI DAG Lowering interface definition
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/src/gallium/drivers/radeon/SIInstrFormats.td b/src/gallium/drivers/radeon/SIInstrFormats.td
index caf9b0ef120..de0d4fa39d2 100644
--- a/src/gallium/drivers/radeon/SIInstrFormats.td
+++ b/src/gallium/drivers/radeon/SIInstrFormats.td
@@ -1,4 +1,4 @@
-//===-- SIInstrFormats.td - TODO: Add brief description -------===//
+//===-- SIInstrFormats.td - SI Instruction Formats ------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,17 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// SI Instruction format definitions.
+//
+// Instructions with _32 take 32-bit operands.
+// Instructions with _64 take 64-bit operands.
+//
+// VOP_* instructions can use either a 32-bit or 64-bit encoding.  The 32-bit
+// encoding is the standard encoding, but instructions that make use of
+// any of the instruction modifiers must use the 64-bit encoding.
+//
+// Instructions with _e32 use the 32-bit encoding.
+// Instructions with _e64 use the 64-bit encoding.
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/src/gallium/drivers/radeon/SIInstrInfo.cpp b/src/gallium/drivers/radeon/SIInstrInfo.cpp
index 6f92e96c6e7..0cb97643a7f 100644
--- a/src/gallium/drivers/radeon/SIInstrInfo.cpp
+++ b/src/gallium/drivers/radeon/SIInstrInfo.cpp
@@ -1,4 +1,4 @@
-//===-- SIInstrInfo.cpp - TODO: Add brief description -------===//
+//===-- SIInstrInfo.cpp - SI Instruction Information  ---------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// SI Implementation of TargetInstrInfo.
 //
 //===----------------------------------------------------------------------===//
 
@@ -107,6 +107,8 @@ unsigned SIInstrInfo::getISAOpcode(unsigned AMDILopcode) const
 {
   switch (AMDILopcode) {
   case AMDIL::MAD_f32: return AMDIL::V_MAD_LEGACY_F32;
+  //XXX We need a better way of detecting end of program
+  case AMDIL::RETURN: return AMDIL::S_ENDPGM;
   default: return AMDGPUInstrInfo::getISAOpcode(AMDILopcode);
   }
 }
diff --git a/src/gallium/drivers/radeon/SIInstrInfo.h b/src/gallium/drivers/radeon/SIInstrInfo.h
index bd76c3f94aa..68940ea3ca4 100644
--- a/src/gallium/drivers/radeon/SIInstrInfo.h
+++ b/src/gallium/drivers/radeon/SIInstrInfo.h
@@ -1,4 +1,4 @@
-//===-- SIInstrInfo.h - TODO: Add brief description -------===//
+//===-- SIInstrInfo.h - SI Instruction Info Interface ---------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// Interface definition for SIInstrInfo.
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/src/gallium/drivers/radeon/SIInstrInfo.td b/src/gallium/drivers/radeon/SIInstrInfo.td
index 65b28ec84ad..435948ff1de 100644
--- a/src/gallium/drivers/radeon/SIInstrInfo.td
+++ b/src/gallium/drivers/radeon/SIInstrInfo.td
@@ -1,4 +1,4 @@
-//===-- SIInstrInfo.td - TODO: Add brief description -------===//
+//===-- SIInstrInfo.td - SI Instruction Encodings ---------*- tablegen -*--===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -6,10 +6,6 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
-//
-// TODO: Add full description
-//
-//===----------------------------------------------------------------------===//
 
 
 
@@ -464,11 +460,4 @@ def IMM12bit : ImmLeaf <
 
 include "SIInstrFormats.td"
 
-def LOAD_CONST : AMDGPUShaderInst <
-  (outs GPRF32:$dst),
-  (ins i32imm:$src),
-  "LOAD_CONST $dst, $src",
-  [(set GPRF32:$dst, (int_AMDGPU_load_const imm:$src))]
->;
-
 include "SIInstructions.td"
diff --git a/src/gallium/drivers/radeon/SIInstructions.td b/src/gallium/drivers/radeon/SIInstructions.td
index 27a8b31757a..57bbc7a5d5a 100644
--- a/src/gallium/drivers/radeon/SIInstructions.td
+++ b/src/gallium/drivers/radeon/SIInstructions.td
@@ -1,4 +1,4 @@
-//===-- SIInstructions.td - TODO: Add brief description -------===//
+//===-- SIInstructions.td - SI Instruction Definitions --------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -6,10 +6,6 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
-//
-// TODO: Add full description
-//
-//===----------------------------------------------------------------------===//
 
 
 def isSI : Predicate<"Subtarget.device()"
@@ -800,6 +796,13 @@ def CONFIG_WRITE : InstSI <
   field bits<32> Inst = 0;
 }
 
+def LOAD_CONST : AMDGPUShaderInst <
+  (outs GPRF32:$dst),
+  (ins i32imm:$src),
+  "LOAD_CONST $dst, $src",
+  [(set GPRF32:$dst, (int_AMDGPU_load_const imm:$src))]
+>;
+
 let usesCustomInserter = 1 in {
 
 def SI_V_CNDLT : InstSI <
@@ -833,7 +836,6 @@ def USE_SGPR_32 : InstSI <
 
 > {
   field bits<32> Inst = 0;
-  let PreloadReg = 1;
 }
 
 def USE_SGPR_64 : InstSI <
@@ -844,7 +846,6 @@ def USE_SGPR_64 : InstSI <
 
 > {
   field bits<32> Inst = 0;
-  let PreloadReg = 1;
 }
 
 def VS_LOAD_BUFFER_INDEX : InstSI <
@@ -854,7 +855,6 @@ def VS_LOAD_BUFFER_INDEX : InstSI <
   [(set VReg_32:$dst, (int_SI_vs_load_buffer_index))]> {
 
   field bits<32> Inst = 0;
-  let PreloadReg = 1;
 }
 
 } // end usesCustomInserter 
diff --git a/src/gallium/drivers/radeon/SIIntrinsics.td b/src/gallium/drivers/radeon/SIIntrinsics.td
index e3014e13916..4d23072d4f1 100644
--- a/src/gallium/drivers/radeon/SIIntrinsics.td
+++ b/src/gallium/drivers/radeon/SIIntrinsics.td
@@ -1,4 +1,4 @@
-//===-- SIIntrinsics.td - TODO: Add brief description -------===//
+//===-- SIIntrinsics.td - SI Intrinsic defs ----------------*- tablegen -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// SI Intrinsic Definitions
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/src/gallium/drivers/radeon/SILowerShaderInstructions.cpp b/src/gallium/drivers/radeon/SILowerShaderInstructions.cpp
deleted file mode 100644
index 5d49d88dc7c..00000000000
--- a/src/gallium/drivers/radeon/SILowerShaderInstructions.cpp
+++ /dev/null
@@ -1,90 +0,0 @@
-//===-- SILowerShaderInstructions.cpp - TODO: Add brief description -------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// TODO: Add full description
-//
-//===----------------------------------------------------------------------===//
-
-
-#include "AMDGPU.h"
-#include "AMDGPULowerShaderInstructions.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-
-using namespace llvm;
-
-namespace {
-  class SILowerShaderInstructionsPass : public MachineFunctionPass,
-      public AMDGPULowerShaderInstructionsPass {
-
-  private:
-    static char ID;
-    TargetMachine &TM;
-
-  public:
-    SILowerShaderInstructionsPass(TargetMachine &tm) :
-      MachineFunctionPass(ID), TM(tm) { }
-
-    bool runOnMachineFunction(MachineFunction &MF);
-
-    const char *getPassName() const { return "SI Lower Shader Instructions"; }
-
-    void lowerRETURN(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
-    void lowerSET_M0(MachineInstr &MI, MachineBasicBlock &MBB,
-                     MachineBasicBlock::iterator I);
-  };
-} /* End anonymous namespace */
-
-char SILowerShaderInstructionsPass::ID = 0;
-
-FunctionPass *llvm::createSILowerShaderInstructionsPass(TargetMachine &tm) {
-    return new SILowerShaderInstructionsPass(tm);
-}
-
-bool SILowerShaderInstructionsPass::runOnMachineFunction(MachineFunction &MF)
-{
-  MRI = &MF.getRegInfo();
-  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
-                                                  BB != BB_E; ++BB) {
-    MachineBasicBlock &MBB = *BB;
-    for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I);
-         I != MBB.end(); I = Next, Next = llvm::next(I) ) {
-      MachineInstr &MI = *I;
-      switch (MI.getOpcode()) {
-      case AMDIL::RETURN:
-        lowerRETURN(MBB, I);
-        break;
-      case AMDIL::SET_M0:
-        lowerSET_M0(MI, MBB, I);
-        break;
-      default: continue;
-      }
-      MI.removeFromParent();
-    }
-  }
-
-  return false;
-}
-
-void SILowerShaderInstructionsPass::lowerRETURN(MachineBasicBlock &MBB,
-    MachineBasicBlock::iterator I)
-{
-  const struct TargetInstrInfo * TII = TM.getInstrInfo();
-  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::S_ENDPGM));
-}
-
-void SILowerShaderInstructionsPass::lowerSET_M0(MachineInstr &MI,
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator I)
-{
-  const struct TargetInstrInfo * TII = TM.getInstrInfo();
-  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::S_MOV_IMM_I32))
-          .addReg(AMDIL::M0)
-          .addOperand(MI.getOperand(1));
-}
diff --git a/src/gallium/drivers/radeon/SIMachineFunctionInfo.cpp b/src/gallium/drivers/radeon/SIMachineFunctionInfo.cpp
index eace40c226c..40ba76f1f86 100644
--- a/src/gallium/drivers/radeon/SIMachineFunctionInfo.cpp
+++ b/src/gallium/drivers/radeon/SIMachineFunctionInfo.cpp
@@ -1,4 +1,4 @@
-//===-- SIMachineFunctionInfo.cpp - TODO: Add brief description -------===//
+//===-- SIMachineFunctionInfo.cpp - SI Machine Function Info -------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -6,10 +6,6 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
-//
-// TODO: Add full description
-//
-//===----------------------------------------------------------------------===//
 
 
 #include "SIMachineFunctionInfo.h"
diff --git a/src/gallium/drivers/radeon/SIMachineFunctionInfo.h b/src/gallium/drivers/radeon/SIMachineFunctionInfo.h
index 5647de9d81f..46a021f3613 100644
--- a/src/gallium/drivers/radeon/SIMachineFunctionInfo.h
+++ b/src/gallium/drivers/radeon/SIMachineFunctionInfo.h
@@ -1,4 +1,4 @@
-//===-- SIMachineFunctionInfo.h - TODO: Add brief description -------===//
+//===- SIMachineFunctionInfo.h - SIMachineFunctionInfo interface -*- C++ -*-==//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,9 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// SIMachineFunctionInfo is used to keep track of the spi_sp_input_addr config
+// register, which tells the hardware which interpolation parameters to
+// load.
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/src/gallium/drivers/radeon/SIPropagateImmReads.cpp b/src/gallium/drivers/radeon/SIPropagateImmReads.cpp
index 4f925d5de1c..6a165488831 100644
--- a/src/gallium/drivers/radeon/SIPropagateImmReads.cpp
+++ b/src/gallium/drivers/radeon/SIPropagateImmReads.cpp
@@ -1,4 +1,4 @@
-//===-- SIPropagateImmReads.cpp - TODO: Add brief description -------===//
+//===-- SIPropagateImmReads.cpp - Lower Immediate Reads Pass --------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,9 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// We can't do this in the ConvertToISA pass, because later passes might
+// create LOADCONST_* instructions that we would miss.  This is why we need 
+// a separate pass for this.
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/src/gallium/drivers/radeon/SIRegisterInfo.cpp b/src/gallium/drivers/radeon/SIRegisterInfo.cpp
index da2ec36a773..2d530a4f022 100644
--- a/src/gallium/drivers/radeon/SIRegisterInfo.cpp
+++ b/src/gallium/drivers/radeon/SIRegisterInfo.cpp
@@ -1,4 +1,4 @@
-//===-- SIRegisterInfo.cpp - TODO: Add brief description -------===//
+//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// This file contains the SI implementation of the TargetRegisterInfo class.
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/src/gallium/drivers/radeon/SIRegisterInfo.h b/src/gallium/drivers/radeon/SIRegisterInfo.h
index c797e3c8ace..77f3261efc5 100644
--- a/src/gallium/drivers/radeon/SIRegisterInfo.h
+++ b/src/gallium/drivers/radeon/SIRegisterInfo.h
@@ -1,4 +1,4 @@
-//===-- SIRegisterInfo.h - TODO: Add brief description -------===//
+//===-- SIRegisterInfo.h - SI Register Info Interface ----------*- C++ -*--===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// Interface definition for SIRegisterInfo
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/src/gallium/drivers/radeon/SISchedule.td b/src/gallium/drivers/radeon/SISchedule.td
index 9e99268e9ca..28b65b82585 100644
--- a/src/gallium/drivers/radeon/SISchedule.td
+++ b/src/gallium/drivers/radeon/SISchedule.td
@@ -1,4 +1,4 @@
-//===-- SISchedule.td - TODO: Add brief description -------===//
+//===-- SISchedule.td - SI Scheduling definitions -------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// TODO: This is just a placeholder for now.
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/src/gallium/drivers/radeon/radeon_llvm.h b/src/gallium/drivers/radeon/radeon_llvm.h
index 9be7f90c3e6..4a706397fdd 100644
--- a/src/gallium/drivers/radeon/radeon_llvm.h
+++ b/src/gallium/drivers/radeon/radeon_llvm.h
@@ -36,6 +36,8 @@
 #define RADEON_LLVM_MAX_BRANCH_DEPTH 16
 #define RADEON_LLVM_MAX_LOOP_DEPTH 16
 
+#define RADEON_LLVM_MAX_SYSTEM_VALUES 4
+
 struct radeon_llvm_branch {
 	LLVMBasicBlockRef endif_block;
 	LLVMBasicBlockRef if_block;
@@ -78,6 +80,9 @@ struct radeon_llvm_context {
 			unsigned input_index,
 			const struct tgsi_full_declaration *decl);
 
+	void (*load_system_value)(struct radeon_llvm_context *,
+			unsigned index,
+			const struct tgsi_full_declaration *decl);
 
 	/** User data to use with the callbacks */
 	void * userdata;
@@ -90,6 +95,8 @@ struct radeon_llvm_context {
 	LLVMValueRef outputs[RADEON_LLVM_MAX_OUTPUTS][TGSI_NUM_CHANNELS];
 	unsigned output_reg_count;
 
+	LLVMValueRef system_values[RADEON_LLVM_MAX_SYSTEM_VALUES];
+
 	unsigned reserved_reg_count;
 	/*=== Private Members ===*/
 
@@ -105,6 +112,37 @@ struct radeon_llvm_context {
 	struct gallivm_state gallivm;
 };
 
+static inline LLVMValueRef bitcast(
+		struct lp_build_tgsi_context * bld_base,
+		enum tgsi_opcode_type type,
+		LLVMValueRef value
+)
+{
+	LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+	LLVMContextRef ctx = bld_base->base.gallivm->context;
+	LLVMTypeRef dst_type;
+
+	switch (type) {
+	case TGSI_TYPE_UNSIGNED:
+	case TGSI_TYPE_SIGNED:
+		dst_type = LLVMInt32TypeInContext(ctx);
+		break;
+	case TGSI_TYPE_UNTYPED:
+	case TGSI_TYPE_FLOAT:
+		dst_type = LLVMFloatTypeInContext(ctx);
+		break;
+	default:
+		dst_type = 0;
+		break;
+	}
+
+	if (dst_type)
+		return LLVMBuildBitCast(builder, value, dst_type, "");
+	else
+		return value;
+}
+
+
 void radeon_llvm_context_init(struct radeon_llvm_context * ctx);
 
 void radeon_llvm_dispose(struct radeon_llvm_context * ctx);
diff --git a/src/gallium/drivers/radeon/radeon_llvm_emit.cpp b/src/gallium/drivers/radeon/radeon_llvm_emit.cpp
index b409cb2175e..ebc32106b52 100644
--- a/src/gallium/drivers/radeon/radeon_llvm_emit.cpp
+++ b/src/gallium/drivers/radeon/radeon_llvm_emit.cpp
@@ -93,19 +93,20 @@ radeon_llvm_compile(LLVMModuleRef M, unsigned char ** bytes,
    AMDGPUTriple.setArch(Arch);
 
    Module * mod = unwrap(M);
-   std::string FS = gpu_family;
+   std::string FS;
    TargetOptions TO;
 
+   if (dump) {
+      mod->dump();
+      FS += "+DumpCode";
+   }
+
    std::auto_ptr<TargetMachine> tm(AMDGPUTarget->createTargetMachine(
-                     AMDGPUTriple.getTriple(), gpu_family, "" /* Features */,
+                     AMDGPUTriple.getTriple(), gpu_family, FS,
                      TO, Reloc::Default, CodeModel::Default,
                      CodeGenOpt::Default
                      ));
    TargetMachine &AMDGPUTargetMachine = *tm.get();
-   /* XXX: Use TargetMachine.Options in 3.0 */
-   if (dump) {
-      mod->dump();
-   }
    PassManager PM;
    PM.add(new TargetData(*AMDGPUTargetMachine.getTargetData()));
    PM.add(createPromoteMemoryToRegisterPass());
diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
index 62de9da28de..6e6fc3d12cd 100644
--- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
+++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
@@ -29,6 +29,7 @@
 #include "gallivm/lp_bld_gather.h"
 #include "gallivm/lp_bld_flow.h"
 #include "gallivm/lp_bld_init.h"
+#include "gallivm/lp_bld_intr.h"
 #include "gallivm/lp_bld_swizzle.h"
 #include "tgsi/tgsi_info.h"
 #include "tgsi/tgsi_parse.h"
@@ -112,8 +113,25 @@ emit_fetch_immediate(
 	enum tgsi_opcode_type type,
 	unsigned swizzle)
 {
+	LLVMTypeRef ctype;
+	LLVMContextRef ctx = bld_base->base.gallivm->context;
+
+	switch (type) {
+	case TGSI_TYPE_UNSIGNED:
+	case TGSI_TYPE_SIGNED:
+		ctype = LLVMInt32TypeInContext(ctx);
+		break;
+	case TGSI_TYPE_UNTYPED:
+	case TGSI_TYPE_FLOAT:
+		ctype = LLVMFloatTypeInContext(ctx);
+		break;
+	default:
+		ctype = 0;
+		break;
+	}
+
 	struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
-	return bld->immediates[reg->Register.Index][swizzle];
+	return LLVMConstBitCast(bld->immediates[reg->Register.Index][swizzle], ctype);
 }
 
 static LLVMValueRef
@@ -134,7 +152,7 @@ emit_fetch_input(
 		return lp_build_gather_values(bld_base->base.gallivm, values,
 						TGSI_NUM_CHANNELS);
 	} else {
-		return ctx->inputs[radeon_llvm_reg_index_soa(reg->Register.Index, swizzle)];
+		return bitcast(bld_base, type, ctx->inputs[radeon_llvm_reg_index_soa(reg->Register.Index, swizzle)]);
 	}
 }
 
@@ -155,7 +173,7 @@ emit_fetch_temporary(
 	} else {
 		LLVMValueRef temp_ptr;
 		temp_ptr = lp_get_temp_ptr_soa(bld, reg->Register.Index, swizzle);
-		return LLVMBuildLoad(builder, temp_ptr, "");
+		return bitcast(bld_base,type,LLVMBuildLoad(builder, temp_ptr, ""));
 	}
 }
 
@@ -213,6 +231,15 @@ static void emit_declaration(
 	}
 	break;
 
+	case TGSI_FILE_SYSTEM_VALUE:
+	{
+		unsigned idx;
+		for (idx = decl->Range.First; idx <= decl->Range.Last; idx++) {
+			ctx->load_system_value(ctx, idx, decl);
+		}
+	}
+	break;
+
 	case TGSI_FILE_OUTPUT:
 	{
 		unsigned idx;
@@ -304,6 +331,9 @@ emit_store(
 		default:
 			return;
 		}
+
+		value = bitcast(bld_base, TGSI_TYPE_FLOAT, value);
+
 		LLVMBuildStore(builder, value, temp_ptr);
 	}
 }
@@ -444,8 +474,10 @@ static void if_emit(
 	struct gallivm_state * gallivm = bld_base->base.gallivm;
 	LLVMValueRef cond;
 	LLVMBasicBlockRef if_block, else_block, endif_block;
-	cond = LLVMBuildFCmp(gallivm->builder, LLVMRealOEQ, emit_data->args[0],
-							bld_base->base.one, "");
+
+	cond = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
+	        bitcast(bld_base, TGSI_TYPE_UNSIGNED, emit_data->args[0]),
+			bld_base->int_bld.zero, "");
 
 	endif_block = LLVMAppendBasicBlockInContext(gallivm->context,
 						ctx->main_fn, "ENDIF");
@@ -463,6 +495,101 @@ static void if_emit(
 	ctx->branch[ctx->branch_depth - 1].has_else = 0;
 }
 
+static void kil_emit(
+	const struct lp_build_tgsi_action * action,
+	struct lp_build_tgsi_context * bld_base,
+	struct lp_build_emit_data * emit_data)
+{
+	unsigned i;
+	for (i = 0; i < emit_data->arg_count; i++) {
+		emit_data->output[i] = lp_build_intrinsic_unary(
+			bld_base->base.gallivm->builder,
+			action->intr_name,
+			emit_data->dst_type, emit_data->args[i]);
+	}
+}
+
+/* Replace args[0] with cube-map coords built from llvm.AMDGPU.cube(args[0]). */
+static void emit_prepare_cube_coords(
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	boolean shadowcube = (emit_data->inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE);
+	struct gallivm_state * gallivm = bld_base->base.gallivm;
+	LLVMBuilderRef builder = gallivm->builder;
+	LLVMTypeRef type = bld_base->base.elem_type;
+	LLVMValueRef coords[4];
+	LLVMValueRef mad_args[3];
+	unsigned i, cnt;
+
+	LLVMValueRef v = lp_build_intrinsic(builder, "llvm.AMDGPU.cube",
+			LLVMVectorType(type, 4),
+			&emit_data->args[0],1);
+	/* save src.w for shadow cube -- NOTE(review): coords[3] is left unset in
+	 * that case but is still read by the swizzle below; confirm intent. */
+	cnt = shadowcube ? 3 : 4;
+
+	for (i = 0; i < cnt; ++i) {
+		LLVMValueRef idx = lp_build_const_int32(gallivm, i);
+		coords[i] = LLVMBuildExtractElement(builder, v, idx, "");
+	}
+
+	coords[2] = lp_build_intrinsic(builder, "llvm.AMDIL.fabs.",
+			type, &coords[2], 1);
+	coords[2] = lp_build_intrinsic(builder, "llvm.AMDGPU.rcp",
+			type, &coords[2], 1);
+
+	mad_args[1] = coords[2];
+	mad_args[2] = LLVMConstReal(type, 1.5);
+
+	mad_args[0] = coords[0];
+	coords[0] = lp_build_intrinsic(builder, "llvm.AMDIL.mad.",
+			type, mad_args, 3);
+
+	mad_args[0] = coords[1];
+	coords[1] = lp_build_intrinsic(builder, "llvm.AMDIL.mad.",
+			type, mad_args, 3);
+
+	/* apply yxwy swizzle to coords */
+	coords[2] = coords[3];
+	coords[3] = coords[1];
+	coords[1] = coords[0];
+	coords[0] = coords[3];
+
+	emit_data->args[0] = lp_build_gather_values(bld_base->base.gallivm,
+						coords, 4);
+}
+
+static void txp_fetch_args(
+	struct lp_build_tgsi_context * bld_base,
+	struct lp_build_emit_data * emit_data)
+{
+	const struct tgsi_full_instruction * inst = emit_data->inst;
+	LLVMValueRef src_w;
+	unsigned chan;
+	LLVMValueRef coords[4];
+
+	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
+	src_w = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_W);
+
+	for (chan = 0; chan < 3; chan++ ) {
+		LLVMValueRef arg = lp_build_emit_fetch(bld_base,
+						emit_data->inst, 0, chan);
+		coords[chan] = lp_build_emit_llvm_binary(bld_base,
+					TGSI_OPCODE_DIV, arg, src_w);
+	}
+	coords[3] = bld_base->base.one;
+	emit_data->args[0] = lp_build_gather_values(bld_base->base.gallivm,
+						coords, 4);
+	emit_data->arg_count = 1;
+
+	if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
+	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE) &&
+	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ) {
+		emit_prepare_cube_coords(bld_base, emit_data);
+	}
+}
+
 static void tex_fetch_args(
 	struct lp_build_tgsi_context * bld_base,
 	struct lp_build_emit_data * emit_data)
@@ -475,16 +602,261 @@ static void tex_fetch_args(
 
 	*/
 
+	const struct tgsi_full_instruction * inst = emit_data->inst;
+
 	LLVMValueRef coords[4];
 	unsigned chan;
 	for (chan = 0; chan < 4; chan++) {
-		coords[chan] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, chan);
+		coords[chan] = lp_build_emit_fetch(bld_base, inst, 0, chan);
 	}
 
 	emit_data->arg_count = 1;
 	emit_data->args[0] = lp_build_gather_values(bld_base->base.gallivm,
 						coords, 4);
 	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
+
+	if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
+	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE) &&
+	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ) {
+		emit_prepare_cube_coords(bld_base, emit_data);
+	}
+}
+
+/* Emit the integer compare for the opcode; result widened to i32 into output[chan]. */
+static void emit_icmp(
+		const struct lp_build_tgsi_action * action,
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	unsigned pred;
+	LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+	LLVMContextRef context = bld_base->base.gallivm->context;
+
+	switch (emit_data->inst->Instruction.Opcode) {
+	case TGSI_OPCODE_USEQ: pred = LLVMIntEQ; break;
+	case TGSI_OPCODE_USNE: pred = LLVMIntNE; break;
+	case TGSI_OPCODE_USGE: pred = LLVMIntUGE; break;
+	case TGSI_OPCODE_USLT: pred = LLVMIntULT; break;
+	case TGSI_OPCODE_ISGE: pred = LLVMIntSGE; break;
+	case TGSI_OPCODE_ISLT: pred = LLVMIntSLT; break;
+	default:
+		assert(!"unknown instruction");
+		return; /* never build a compare from an uninitialized predicate */
+	}
+
+	LLVMValueRef v = LLVMBuildICmp(builder, pred,
+			emit_data->args[0], emit_data->args[1],"");
+	v = LLVMBuildSExtOrBitCast(builder, v,
+			LLVMInt32TypeInContext(context), "");
+	emit_data->output[emit_data->chan] = v;
+}
+
+static void emit_not(
+		const struct lp_build_tgsi_action * action,
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+	LLVMValueRef v = bitcast(bld_base, TGSI_TYPE_UNSIGNED,
+			emit_data->args[0]);
+	emit_data->output[emit_data->chan] = LLVMBuildNot(builder, v, "");
+}
+
+static void emit_and(
+		const struct lp_build_tgsi_action * action,
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+	emit_data->output[emit_data->chan] = LLVMBuildAnd(builder,
+			emit_data->args[0], emit_data->args[1], "");
+}
+
+static void emit_or(
+		const struct lp_build_tgsi_action * action,
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+	emit_data->output[emit_data->chan] = LLVMBuildOr(builder,
+			emit_data->args[0], emit_data->args[1], "");
+}
+
+static void emit_uadd(
+		const struct lp_build_tgsi_action * action,
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+	emit_data->output[emit_data->chan] = LLVMBuildAdd(builder,
+			emit_data->args[0], emit_data->args[1], "");
+}
+
+static void emit_udiv(
+		const struct lp_build_tgsi_action * action,
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+	emit_data->output[emit_data->chan] = LLVMBuildUDiv(builder,
+			emit_data->args[0], emit_data->args[1], "");
+}
+
+static void emit_idiv(
+		const struct lp_build_tgsi_action * action,
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+	emit_data->output[emit_data->chan] = LLVMBuildSDiv(builder,
+			emit_data->args[0], emit_data->args[1], "");
+}
+
+static void emit_mod(
+		const struct lp_build_tgsi_action * action,
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+	emit_data->output[emit_data->chan] = LLVMBuildSRem(builder,
+			emit_data->args[0], emit_data->args[1], "");
+}
+
+static void emit_umod(
+		const struct lp_build_tgsi_action * action,
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+	emit_data->output[emit_data->chan] = LLVMBuildURem(builder,
+			emit_data->args[0], emit_data->args[1], "");
+}
+
+static void emit_shl(
+		const struct lp_build_tgsi_action * action,
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+	emit_data->output[emit_data->chan] = LLVMBuildShl(builder,
+			emit_data->args[0], emit_data->args[1], "");
+}
+
+static void emit_ushr(
+		const struct lp_build_tgsi_action * action,
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+	emit_data->output[emit_data->chan] = LLVMBuildLShr(builder,
+			emit_data->args[0], emit_data->args[1], "");
+}
+static void emit_ishr(
+		const struct lp_build_tgsi_action * action,
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+	emit_data->output[emit_data->chan] = LLVMBuildAShr(builder,
+			emit_data->args[0], emit_data->args[1], "");
+}
+
+static void emit_xor(
+		const struct lp_build_tgsi_action * action,
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+	emit_data->output[emit_data->chan] = LLVMBuildXor(builder,
+			emit_data->args[0], emit_data->args[1], "");
+}
+
+/* Emit sign(args[0]) for ISSG (integer) and SSG (float) into output[chan]. */
+static void emit_ssg(
+		const struct lp_build_tgsi_action * action,
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+	LLVMValueRef cmp, val;
+
+	/* Two selects: positive inputs become 1, then negative ones become -1. */
+	if (emit_data->inst->Instruction.Opcode == TGSI_OPCODE_ISSG) {
+		cmp = LLVMBuildICmp(builder, LLVMIntSGT, emit_data->args[0], bld_base->int_bld.zero, "");
+		val = LLVMBuildSelect(builder, cmp, bld_base->int_bld.one, emit_data->args[0], "");
+		cmp = LLVMBuildICmp(builder, LLVMIntSGE, val, bld_base->int_bld.zero, "");
+		val = LLVMBuildSelect(builder, cmp, val, LLVMConstInt(bld_base->int_bld.elem_type, -1, true), "");
+	} else { /* float SSG: compare against the float zero, not int_bld.zero */
+		cmp = LLVMBuildFCmp(builder, LLVMRealUGT, emit_data->args[0], bld_base->base.zero, "");
+		val = LLVMBuildSelect(builder, cmp, bld_base->base.one, emit_data->args[0], "");
+		cmp = LLVMBuildFCmp(builder, LLVMRealUGE, val, bld_base->base.zero, "");
+		val = LLVMBuildSelect(builder, cmp, val, LLVMConstReal(bld_base->base.elem_type, -1), "");
+	}
+	emit_data->output[emit_data->chan] = val;
+}
+
+static void emit_ineg(
+		const struct lp_build_tgsi_action * action,
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+	emit_data->output[emit_data->chan] = LLVMBuildNeg(builder,
+			emit_data->args[0], "");
+}
+
+static void emit_f2i(
+		const struct lp_build_tgsi_action * action,
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+	emit_data->output[emit_data->chan] = LLVMBuildFPToSI(builder,
+			emit_data->args[0], bld_base->int_bld.elem_type, "");
+}
+
+static void emit_f2u(
+		const struct lp_build_tgsi_action * action,
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+	emit_data->output[emit_data->chan] = LLVMBuildFPToUI(builder,
+			emit_data->args[0], bld_base->uint_bld.elem_type, "");
+}
+
+static void emit_i2f(
+		const struct lp_build_tgsi_action * action,
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+	emit_data->output[emit_data->chan] = LLVMBuildSIToFP(builder,
+			emit_data->args[0], bld_base->base.elem_type, "");
+}
+
+static void emit_u2f(
+		const struct lp_build_tgsi_action * action,
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+	emit_data->output[emit_data->chan] = LLVMBuildUIToFP(builder,
+			emit_data->args[0], bld_base->base.elem_type, "");
+}
+
+static void emit_immediate(struct lp_build_tgsi_context * bld_base,
+		const struct tgsi_full_immediate *imm)
+{
+	unsigned i;
+	struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base);
+
+	for (i = 0; i < 4; ++i) {
+		ctx->soa.immediates[ctx->soa.num_immediates][i] =
+				LLVMConstInt(bld_base->uint_bld.elem_type, imm->u[i].Uint, false   );
+	}
+
+	ctx->soa.num_immediates++;
 }
 
 void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
@@ -526,12 +898,13 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
 
 	lp_build_context_init(&bld_base->base, &ctx->gallivm, type);
 	lp_build_context_init(&ctx->soa.bld_base.uint_bld, &ctx->gallivm, lp_uint_type(type));
+	lp_build_context_init(&ctx->soa.bld_base.int_bld, &ctx->gallivm, lp_int_type(type));
 
 	bld_base->soa = 1;
 	bld_base->emit_store = emit_store;
 	bld_base->emit_swizzle = emit_swizzle;
 	bld_base->emit_declaration = emit_declaration;
-	bld_base->emit_immediate = lp_emit_immediate_soa;
+	bld_base->emit_immediate = emit_immediate;
 
 	bld_base->emit_fetch_funcs[TGSI_FILE_IMMEDIATE] = emit_fetch_immediate;
 	bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = emit_fetch_input;
@@ -545,6 +918,60 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
 
 	lp_set_default_actions(bld_base);
 
+	bld_base->op_actions[TGSI_OPCODE_IABS].emit = lp_build_tgsi_intrinsic;
+	bld_base->op_actions[TGSI_OPCODE_IABS].intr_name = "llvm.AMDIL.abs.";
+	bld_base->op_actions[TGSI_OPCODE_NOT].emit = emit_not;
+	bld_base->op_actions[TGSI_OPCODE_AND].emit = emit_and;
+	bld_base->op_actions[TGSI_OPCODE_XOR].emit = emit_xor;
+	bld_base->op_actions[TGSI_OPCODE_OR].emit = emit_or;
+	bld_base->op_actions[TGSI_OPCODE_UADD].emit = emit_uadd;
+	bld_base->op_actions[TGSI_OPCODE_UDIV].emit = emit_udiv;
+	bld_base->op_actions[TGSI_OPCODE_IDIV].emit = emit_idiv;
+	bld_base->op_actions[TGSI_OPCODE_MOD].emit = emit_mod;
+	bld_base->op_actions[TGSI_OPCODE_UMOD].emit = emit_umod;
+	bld_base->op_actions[TGSI_OPCODE_INEG].emit = emit_ineg;
+	bld_base->op_actions[TGSI_OPCODE_SHL].emit = emit_shl;
+	bld_base->op_actions[TGSI_OPCODE_ISHR].emit = emit_ishr;
+	bld_base->op_actions[TGSI_OPCODE_USHR].emit = emit_ushr;
+	bld_base->op_actions[TGSI_OPCODE_SSG].emit = emit_ssg;
+	bld_base->op_actions[TGSI_OPCODE_ISSG].emit = emit_ssg;
+	bld_base->op_actions[TGSI_OPCODE_I2F].emit = emit_i2f;
+	bld_base->op_actions[TGSI_OPCODE_U2F].emit = emit_u2f;
+	bld_base->op_actions[TGSI_OPCODE_F2I].emit = emit_f2i;
+	bld_base->op_actions[TGSI_OPCODE_F2U].emit = emit_f2u;
+	bld_base->op_actions[TGSI_OPCODE_DDX].intr_name = "llvm.AMDGPU.ddx";
+	bld_base->op_actions[TGSI_OPCODE_DDX].fetch_args = tex_fetch_args;
+	bld_base->op_actions[TGSI_OPCODE_DDY].intr_name = "llvm.AMDGPU.ddy";
+	bld_base->op_actions[TGSI_OPCODE_DDY].fetch_args = tex_fetch_args;
+	bld_base->op_actions[TGSI_OPCODE_USEQ].emit = emit_icmp;
+	bld_base->op_actions[TGSI_OPCODE_USGE].emit = emit_icmp;
+	bld_base->op_actions[TGSI_OPCODE_USLT].emit = emit_icmp;
+	bld_base->op_actions[TGSI_OPCODE_USNE].emit = emit_icmp;
+	bld_base->op_actions[TGSI_OPCODE_ISGE].emit = emit_icmp;
+	bld_base->op_actions[TGSI_OPCODE_ISLT].emit = emit_icmp;
+	bld_base->op_actions[TGSI_OPCODE_ROUND].emit = lp_build_tgsi_intrinsic;
+	bld_base->op_actions[TGSI_OPCODE_ROUND].intr_name = "llvm.AMDIL.round.nearest.";
+	bld_base->op_actions[TGSI_OPCODE_MIN].emit = lp_build_tgsi_intrinsic;
+	bld_base->op_actions[TGSI_OPCODE_MIN].intr_name = "llvm.AMDIL.min.";
+	bld_base->op_actions[TGSI_OPCODE_MAX].emit = lp_build_tgsi_intrinsic;
+	bld_base->op_actions[TGSI_OPCODE_MAX].intr_name = "llvm.AMDIL.max.";
+	bld_base->op_actions[TGSI_OPCODE_IMIN].emit = lp_build_tgsi_intrinsic;
+	bld_base->op_actions[TGSI_OPCODE_IMIN].intr_name = "llvm.AMDGPU.imin";
+	bld_base->op_actions[TGSI_OPCODE_IMAX].emit = lp_build_tgsi_intrinsic;
+	bld_base->op_actions[TGSI_OPCODE_IMAX].intr_name = "llvm.AMDGPU.imax";
+	bld_base->op_actions[TGSI_OPCODE_UMIN].emit = lp_build_tgsi_intrinsic;
+	bld_base->op_actions[TGSI_OPCODE_UMIN].intr_name = "llvm.AMDGPU.umin";
+	bld_base->op_actions[TGSI_OPCODE_UMAX].emit = lp_build_tgsi_intrinsic;
+	bld_base->op_actions[TGSI_OPCODE_UMAX].intr_name = "llvm.AMDGPU.umax";
+	bld_base->op_actions[TGSI_OPCODE_TXF].fetch_args = tex_fetch_args;
+	bld_base->op_actions[TGSI_OPCODE_TXF].intr_name = "llvm.AMDGPU.txf";
+	bld_base->op_actions[TGSI_OPCODE_TXQ].fetch_args = tex_fetch_args;
+	bld_base->op_actions[TGSI_OPCODE_TXQ].intr_name = "llvm.AMDGPU.txq";
+	/* CEIL rounds toward +infinity; round.neginf would compute floor. */
+	bld_base->op_actions[TGSI_OPCODE_CEIL].emit = lp_build_tgsi_intrinsic;
+	bld_base->op_actions[TGSI_OPCODE_CEIL].intr_name = "llvm.AMDIL.round.posinf.";
+
+
 	bld_base->op_actions[TGSI_OPCODE_ABS].emit = lp_build_tgsi_intrinsic;
 	bld_base->op_actions[TGSI_OPCODE_ABS].intr_name = "llvm.AMDIL.fabs.";
 	bld_base->op_actions[TGSI_OPCODE_ARL].emit = lp_build_tgsi_intrinsic;
@@ -558,10 +985,6 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
 	bld_base->op_actions[TGSI_OPCODE_CMP].intr_name = "llvm.AMDGPU.cndlt";
 	bld_base->op_actions[TGSI_OPCODE_COS].emit = lp_build_tgsi_intrinsic;
 	bld_base->op_actions[TGSI_OPCODE_COS].intr_name = "llvm.AMDGPU.cos";
-	bld_base->op_actions[TGSI_OPCODE_DDX].emit = lp_build_tgsi_intrinsic;
-	bld_base->op_actions[TGSI_OPCODE_DDX].intr_name = "llvm.AMDGPU.ddx";
-	bld_base->op_actions[TGSI_OPCODE_DDY].emit = lp_build_tgsi_intrinsic;
-	bld_base->op_actions[TGSI_OPCODE_DDY].intr_name = "llvm.AMDGPU.ddy";
 	bld_base->op_actions[TGSI_OPCODE_DIV].emit = lp_build_tgsi_intrinsic;
 	bld_base->op_actions[TGSI_OPCODE_DIV].intr_name = "llvm.AMDGPU.div";
 	bld_base->op_actions[TGSI_OPCODE_ELSE].emit = else_emit;
@@ -574,7 +997,7 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
 	bld_base->op_actions[TGSI_OPCODE_FRC].emit = lp_build_tgsi_intrinsic;
 	bld_base->op_actions[TGSI_OPCODE_FRC].intr_name = "llvm.AMDIL.fraction.";
 	bld_base->op_actions[TGSI_OPCODE_IF].emit = if_emit;
-	bld_base->op_actions[TGSI_OPCODE_KIL].emit = lp_build_tgsi_intrinsic;
+	bld_base->op_actions[TGSI_OPCODE_KIL].emit = kil_emit;
 	bld_base->op_actions[TGSI_OPCODE_KIL].intr_name = "llvm.AMDGPU.kill";
 	bld_base->op_actions[TGSI_OPCODE_KILP].emit = lp_build_tgsi_intrinsic;
 	bld_base->op_actions[TGSI_OPCODE_KILP].intr_name = "llvm.AMDGPU.kilp";
@@ -597,7 +1020,7 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
 	bld_base->op_actions[TGSI_OPCODE_SSG].emit = lp_build_tgsi_intrinsic;
 	bld_base->op_actions[TGSI_OPCODE_SSG].intr_name = "llvm.AMDGPU.ssg";
 	bld_base->op_actions[TGSI_OPCODE_SGE].emit = lp_build_tgsi_intrinsic;
-	bld_base->op_actions[TGSI_OPCODE_SGE].intr_name = "llvm.AMDGPU.sge.";
+	bld_base->op_actions[TGSI_OPCODE_SGE].intr_name = "llvm.AMDGPU.sge";
 	bld_base->op_actions[TGSI_OPCODE_SEQ].emit = lp_build_tgsi_intrinsic;
 	bld_base->op_actions[TGSI_OPCODE_SEQ].intr_name = "llvm.AMDGPU.seq";
 	bld_base->op_actions[TGSI_OPCODE_SLE].fetch_args = radeon_llvm_fetch_args_2_reverse_soa;
@@ -620,6 +1043,7 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
 	bld_base->op_actions[TGSI_OPCODE_TXD].intr_name = "llvm.AMDGPU.txd";
 	bld_base->op_actions[TGSI_OPCODE_TXL].fetch_args = tex_fetch_args;
 	bld_base->op_actions[TGSI_OPCODE_TXL].intr_name = "llvm.AMDGPU.txl";
+	bld_base->op_actions[TGSI_OPCODE_TXP].fetch_args = txp_fetch_args;
 	bld_base->op_actions[TGSI_OPCODE_TXP].intr_name = "llvm.AMDGPU.tex";
 	bld_base->op_actions[TGSI_OPCODE_TRUNC].emit = lp_build_tgsi_intrinsic;
 	bld_base->op_actions[TGSI_OPCODE_TRUNC].intr_name = "llvm.AMDGPU.trunc";
diff --git a/src/gallium/drivers/radeonsi/evergreen_state.c b/src/gallium/drivers/radeonsi/evergreen_state.c
index 75d6cadc6cc..b094248fee1 100644
--- a/src/gallium/drivers/radeonsi/evergreen_state.c
+++ b/src/gallium/drivers/radeonsi/evergreen_state.c
@@ -1166,24 +1166,6 @@ static void si_delete_sampler_state(struct pipe_context *ctx,
 	free(state);
 }
 
-static unsigned si_map_swizzle(unsigned swizzle)
-{
-	switch (swizzle) {
-	case UTIL_FORMAT_SWIZZLE_Y:
-		return V_008F1C_SQ_SEL_Y;
-	case UTIL_FORMAT_SWIZZLE_Z:
-		return V_008F1C_SQ_SEL_Z;
-	case UTIL_FORMAT_SWIZZLE_W:
-		return V_008F1C_SQ_SEL_W;
-	case UTIL_FORMAT_SWIZZLE_0:
-		return V_008F1C_SQ_SEL_0;
-	case UTIL_FORMAT_SWIZZLE_1:
-		return V_008F1C_SQ_SEL_1;
-	default: /* UTIL_FORMAT_SWIZZLE_X */
-		return V_008F1C_SQ_SEL_X;
-	}
-}
-
 static struct pipe_sampler_view *evergreen_create_sampler_view(struct pipe_context *ctx,
 							struct pipe_resource *texture,
 							const struct pipe_sampler_view *state)
@@ -1259,9 +1241,9 @@ static struct pipe_sampler_view *evergreen_create_sampler_view(struct pipe_conte
 
 	va = r600_resource_va(ctx->screen, texture);
 	view->state[0] = (va + tmp->offset[0]) >> 8;
-	view->state[1] = ((va + tmp->offset[0]) >> 40) & 0xff;
-	view->state[1] |= (S_008F14_DATA_FORMAT(format) |
-			   S_008F14_NUM_FORMAT(num_format));
+	view->state[1] = (S_008F14_BASE_ADDRESS_HI((va + tmp->offset[0]) >> 40) |
+			  S_008F14_DATA_FORMAT(format) |
+			  S_008F14_NUM_FORMAT(num_format));
 	view->state[2] = (S_008F18_WIDTH(texture->width0 - 1) |
 			  S_008F18_HEIGHT(height - 1));
 	view->state[3] = (S_008F1C_DST_SEL_X(si_map_swizzle(swizzle[0])) |
@@ -2087,9 +2069,9 @@ void si_pipe_shader_vs(struct pipe_context *ctx, struct si_pipe_shader *shader)
 	r600_pipe_state_add_reg(rstate,
 				R_02870C_SPI_SHADER_POS_FORMAT,
 				S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) |
-				S_02870C_POS1_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) |
-				S_02870C_POS2_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) |
-				S_02870C_POS3_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP),
+				S_02870C_POS1_EXPORT_FORMAT(V_02870C_SPI_SHADER_NONE) |
+				S_02870C_POS2_EXPORT_FORMAT(V_02870C_SPI_SHADER_NONE) |
+				S_02870C_POS3_EXPORT_FORMAT(V_02870C_SPI_SHADER_NONE),
 				NULL, 0);
 
 	va = r600_resource_va(ctx->screen, (void *)shader->bo);
diff --git a/src/gallium/drivers/radeonsi/r600_state_common.c b/src/gallium/drivers/radeonsi/r600_state_common.c
index 53a34ef519c..06eb96b9ee8 100644
--- a/src/gallium/drivers/radeonsi/r600_state_common.c
+++ b/src/gallium/drivers/radeonsi/r600_state_common.c
@@ -628,12 +628,15 @@ static void r600_vertex_buffer_update(struct r600_context *rctx)
 		ptr[0] = va & 0xFFFFFFFF;
 		ptr[1] = (S_008F04_BASE_ADDRESS_HI(va >> 32) |
 			  S_008F04_STRIDE(vertex_buffer->stride));
-		ptr[2] = (vertex_buffer->buffer->width0 - offset) / vertex_buffer->stride;
-		/* XXX: Hardcoding RGBA */
-		ptr[3] = (S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
-			  S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
-			  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
-			  S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
+		if (vertex_buffer->stride > 0)
+			ptr[2] = ((vertex_buffer->buffer->width0 - offset) /
+				  vertex_buffer->stride);
+		else
+			ptr[2] = vertex_buffer->buffer->width0 - offset;
+		ptr[3] = (S_008F0C_DST_SEL_X(si_map_swizzle(desc->swizzle[0])) |
+			  S_008F0C_DST_SEL_Y(si_map_swizzle(desc->swizzle[1])) |
+			  S_008F0C_DST_SEL_Z(si_map_swizzle(desc->swizzle[2])) |
+			  S_008F0C_DST_SEL_W(si_map_swizzle(desc->swizzle[3])) |
 			  S_008F0C_NUM_FORMAT(num_format) |
 			  S_008F0C_DATA_FORMAT(data_format));
 
diff --git a/src/gallium/drivers/radeonsi/radeonsi_pipe.h b/src/gallium/drivers/radeonsi/radeonsi_pipe.h
index ab30892d51a..bba4cf23691 100644
--- a/src/gallium/drivers/radeonsi/radeonsi_pipe.h
+++ b/src/gallium/drivers/radeonsi/radeonsi_pipe.h
@@ -37,6 +37,7 @@
 #include "r600.h"
 #include "radeonsi_public.h"
 #include "r600_resource.h"
+#include "sid.h"
 
 #define R600_MAX_CONST_BUFFERS 1
 #define R600_MAX_CONST_BUFFER_SIZE 4096
@@ -467,6 +468,24 @@ static INLINE uint32_t S_FIXED(float value, uint32_t frac_bits)
 }
 #define ALIGN_DIVUP(x, y) (((x) + (y) - 1) / (y))
 
+static INLINE unsigned si_map_swizzle(unsigned swizzle)
+{
+	switch (swizzle) {
+	case UTIL_FORMAT_SWIZZLE_Y:
+		return V_008F0C_SQ_SEL_Y;
+	case UTIL_FORMAT_SWIZZLE_Z:
+		return V_008F0C_SQ_SEL_Z;
+	case UTIL_FORMAT_SWIZZLE_W:
+		return V_008F0C_SQ_SEL_W;
+	case UTIL_FORMAT_SWIZZLE_0:
+		return V_008F0C_SQ_SEL_0;
+	case UTIL_FORMAT_SWIZZLE_1:
+		return V_008F0C_SQ_SEL_1;
+	default: /* UTIL_FORMAT_SWIZZLE_X */
+		return V_008F0C_SQ_SEL_X;
+	}
+}
+
 static inline unsigned r600_tex_aniso_filter(unsigned filter)
 {
 	if (filter <= 1)   return 0;
diff --git a/src/gallium/drivers/radeonsi/radeonsi_shader.c b/src/gallium/drivers/radeonsi/radeonsi_shader.c
index 6425c352d28..0e1a97bba3e 100644
--- a/src/gallium/drivers/radeonsi/radeonsi_shader.c
+++ b/src/gallium/drivers/radeonsi/radeonsi_shader.c
@@ -199,7 +199,7 @@ static void declare_input_fs(
 	LLVMValueRef attr_number = lp_build_const_int32(gallivm, input_index);
 
 	/* XXX: Handle all possible interpolation modes */
-	switch (decl->Declaration.Interpolate) {
+	switch (decl->Interp.Interpolate) {
 	case TGSI_INTERPOLATE_COLOR:
 		if (si_shader_ctx->rctx->rasterizer->flatshade)
 			intr_name = "llvm.SI.fs.interp.constant";
@@ -331,14 +331,14 @@ static void si_llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base)
 			i = shader->ninput++;
 			shader->input[i].name = d->Semantic.Name;
 			shader->input[i].sid = d->Semantic.Index;
-			shader->input[i].interpolate = d->Declaration.Interpolate;
-			shader->input[i].centroid = d->Declaration.Centroid;
+			shader->input[i].interpolate = d->Interp.Interpolate;
+			shader->input[i].centroid = d->Interp.Centroid;
 			break;
 		case TGSI_FILE_OUTPUT:
 			i = shader->noutput++;
 			shader->output[i].name = d->Semantic.Name;
 			shader->output[i].sid = d->Semantic.Index;
-			shader->output[i].interpolate = d->Declaration.Interpolate;
+			shader->output[i].interpolate = d->Interp.Interpolate;
 			break;
 		}
 
diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.c b/src/gallium/drivers/softpipe/sp_tex_sample.c
index d54e02e40cd..d4c01759dbe 100644
--- a/src/gallium/drivers/softpipe/sp_tex_sample.c
+++ b/src/gallium/drivers/softpipe/sp_tex_sample.c
@@ -799,7 +799,8 @@ get_texel_2d_array(const struct sp_sampler_variant *samp,
    const struct pipe_resource *texture = samp->view->texture;
    unsigned level = addr.bits.level;
 
-   assert(layer < texture->array_size);
+   assert(layer < (int) texture->array_size);
+   assert(layer >= 0);
 
    if (x < 0 || x >= (int) u_minify(texture->width0, level) ||
        y < 0 || y >= (int) u_minify(texture->height0, level)) {
@@ -1787,9 +1788,9 @@ img_filter_2d_ewa(struct tgsi_sampler *tgsi_sampler,
    float weight_buffer[TGSI_QUAD_SIZE];
    unsigned buffer_next;
    int j;
-   float den;// = 0.0F;
+   float den; /* = 0.0F; */
    float ddq;
-   float U;// = u0 - tex_u;
+   float U; /* = u0 - tex_u; */
    int v;
 
    /* Scale ellipse formula to directly index the Filter Lookup Table.
@@ -1805,8 +1806,8 @@ img_filter_2d_ewa(struct tgsi_sampler *tgsi_sampler,
     * also the same. Note that texel/image access can only be performed using
     * a quad, i.e. it is not possible to get the pixel value for a single
     * tex coord. In order to have a better performance, the access is buffered
-    * using the s_buffer/t_buffer and weight_buffer. Only when the buffer is full,
-    * then the pixel values are read from the image.
+    * using the s_buffer/t_buffer and weight_buffer. Only when the buffer is
+    * full, then the pixel values are read from the image.
     */
    ddq = 2 * A;
    
@@ -1834,7 +1835,9 @@ img_filter_2d_ewa(struct tgsi_sampler *tgsi_sampler,
 
          int u;
          for (u = u0; u <= u1; ++u) {
-            /* Note that the ellipse has been pre-scaled so F = WEIGHT_LUT_SIZE - 1 */
+            /* Note that the ellipse has been pre-scaled so F =
+             * WEIGHT_LUT_SIZE - 1
+             */
             if (q < WEIGHT_LUT_SIZE) {
                /* as a LUT is used, q must never be negative;
                 * should not happen, though
@@ -1873,10 +1876,11 @@ img_filter_2d_ewa(struct tgsi_sampler *tgsi_sampler,
          }
       }
 
-      /* if the tex coord buffer contains unread values, we will read them now.
-       * Note that in most cases we have to read more pixel values than required,
-       * however, as the img_filter_2d_nearest function(s) does not have a count
-       * parameter, we need to read the whole quad and ignore the unused values
+      /* if the tex coord buffer contains unread values, we will read
+       * them now.  Note that in most cases we have to read more pixel
+       * values than required, however, as the img_filter_2d_nearest
+       * function(s) does not have a count parameter, we need to read
+       * the whole quad and ignore the unused values
        */
       if (buffer_next > 0) {
          unsigned jj;
@@ -1895,11 +1899,9 @@ img_filter_2d_ewa(struct tgsi_sampler *tgsi_sampler,
       }
 
       if (den <= 0.0F) {
-         /* Reaching this place would mean
-          * that no pixels intersected the ellipse.
-          * This should never happen because
-          * the filter we use always
-          * intersects at least one pixel.
+         /* Reaching this place would mean that no pixels intersected
+          * the ellipse.  This should never happen because the filter
+          * we use always intersects at least one pixel.
           */
 
          /*rgba[0]=0;
@@ -1907,7 +1909,8 @@ img_filter_2d_ewa(struct tgsi_sampler *tgsi_sampler,
          rgba[2]=0;
          rgba[3]=0;*/
          /* not enough pixels in resampling, resort to direct interpolation */
-         samp->min_img_filter(tgsi_sampler, s, t, p, NULL, tgsi_sampler_lod_bias, rgba_temp);
+         samp->min_img_filter(tgsi_sampler, s, t, p, NULL,
+                              tgsi_sampler_lod_bias, rgba_temp);
          den = 1;
          num[0] = rgba_temp[0][j];
          num[1] = rgba_temp[1][j];
@@ -2020,7 +2023,6 @@ mip_filter_linear_aniso(struct tgsi_sampler *tgsi_sampler,
 }
 
 
-
 /**
  * Specialized version of mip_filter_linear with hard-wired calls to
  * 2d lambda calculation and 2d_linear_repeat_POT img filters.
@@ -2090,7 +2092,6 @@ mip_filter_linear_2d_linear_repeat_POT(
 }
 
 
-
 /**
  * Do shadow/depth comparisons.
  */
@@ -2287,9 +2288,11 @@ sample_cube(struct tgsi_sampler *tgsi_sampler,
    samp->compare(tgsi_sampler, ssss, tttt, NULL, c0, control, rgba);
 }
 
-static void do_swizzling(const struct sp_sampler_variant *samp,
-                         float in[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE],
-                         float out[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
+
+static void
+do_swizzling(const struct sp_sampler_variant *samp,
+             float in[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE],
+             float out[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
 {
    int j;
    const unsigned swizzle_r = samp->key.bits.swizzle_r;
@@ -2358,6 +2361,7 @@ static void do_swizzling(const struct sp_sampler_variant *samp,
    }
 }
 
+
 static void
 sample_swizzle(struct tgsi_sampler *tgsi_sampler,
                const float s[TGSI_QUAD_SIZE],
@@ -2464,6 +2468,19 @@ get_linear_wrap(unsigned mode)
 }
 
 
+/**
+ * Is swizzling needed for the given state key?
+ */
+static INLINE bool
+any_swizzle(union sp_sampler_key key)
+{
+   return (key.bits.swizzle_r != PIPE_SWIZZLE_RED ||
+           key.bits.swizzle_g != PIPE_SWIZZLE_GREEN ||
+           key.bits.swizzle_b != PIPE_SWIZZLE_BLUE ||
+           key.bits.swizzle_a != PIPE_SWIZZLE_ALPHA);
+}
+
+
 static compute_lambda_func
 get_lambda_func(const union sp_sampler_key key)
 {
@@ -2590,6 +2607,7 @@ sp_sampler_variant_destroy( struct sp_sampler_variant *samp )
    FREE(samp);
 }
 
+
 static void
 sample_get_dims(struct tgsi_sampler *tgsi_sampler, int level,
 		int dims[4])
@@ -2630,35 +2648,43 @@ sample_get_dims(struct tgsi_sampler *tgsi_sampler, int level,
     }
 }
 
-/* this function is only used for unfiltered texel gets
-   via the TGSI TXF opcode. */
+/**
+ * This function is only used for getting unfiltered texels via the
+ * TXF opcode.  The GL spec says that out-of-bounds texel fetches
+ * produce undefined results.  Instead of crashing, let's just clamp
+ * coords to the texture image size.
+ */
 static void
 sample_get_texels(struct tgsi_sampler *tgsi_sampler,
-	   const int v_i[TGSI_QUAD_SIZE],
-	   const int v_j[TGSI_QUAD_SIZE],
-	   const int v_k[TGSI_QUAD_SIZE],
-	   const int lod[TGSI_QUAD_SIZE],
-	   const int8_t offset[3],
-	   float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
+                  const int v_i[TGSI_QUAD_SIZE],
+                  const int v_j[TGSI_QUAD_SIZE],
+                  const int v_k[TGSI_QUAD_SIZE],
+                  const int lod[TGSI_QUAD_SIZE],
+                  const int8_t offset[3],
+                  float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
 {
    const struct sp_sampler_variant *samp = sp_sampler_variant(tgsi_sampler);
    union tex_tile_address addr;
    const struct pipe_resource *texture = samp->view->texture;
    int j, c;
    const float *tx;
-   bool need_swizzle = (samp->key.bits.swizzle_r != PIPE_SWIZZLE_RED ||
-                        samp->key.bits.swizzle_g != PIPE_SWIZZLE_GREEN ||
-                        samp->key.bits.swizzle_b != PIPE_SWIZZLE_BLUE ||
-                        samp->key.bits.swizzle_a != PIPE_SWIZZLE_ALPHA);
+   const bool need_swizzle = any_swizzle(samp->key);
+   int width, height, depth, layers;
 
    addr.value = 0;
    /* TODO write a better test for LOD */
    addr.bits.level = lod[0];
 
+   width = u_minify(texture->width0, addr.bits.level);
+   height = u_minify(texture->height0, addr.bits.level);
+   depth = u_minify(texture->depth0, addr.bits.level);
+   layers = texture->array_size;
+
    switch(texture->target) {
    case PIPE_TEXTURE_1D:
       for (j = 0; j < TGSI_QUAD_SIZE; j++) {
-	 tx = get_texel_2d(samp, addr, v_i[j] + offset[0], 0);
+         int x = CLAMP(v_i[j] + offset[0], 0, width - 1);
+	 tx = get_texel_2d(samp, addr, x, 0);
 	 for (c = 0; c < 4; c++) {
 	    rgba[c][j] = tx[c];
 	 }
@@ -2666,8 +2692,9 @@ sample_get_texels(struct tgsi_sampler *tgsi_sampler,
       break;
    case PIPE_TEXTURE_1D_ARRAY:
       for (j = 0; j < TGSI_QUAD_SIZE; j++) {
-	 tx = get_texel_1d_array(samp, addr, v_i[j] + offset[0],
-				 v_j[j] + offset[1]);
+         int x = CLAMP(v_i[j] + offset[0], 0, width - 1);
+         int y = CLAMP(v_j[j] + offset[1], 0, layers - 1);
+	 tx = get_texel_1d_array(samp, addr, x, y);
 	 for (c = 0; c < 4; c++) {
 	    rgba[c][j] = tx[c];
 	 }
@@ -2676,8 +2703,9 @@ sample_get_texels(struct tgsi_sampler *tgsi_sampler,
    case PIPE_TEXTURE_2D:
    case PIPE_TEXTURE_RECT:
       for (j = 0; j < TGSI_QUAD_SIZE; j++) {
-	 tx = get_texel_2d(samp, addr, v_i[j] + offset[0],
-			   v_j[j] + offset[1]);
+         int x = CLAMP(v_i[j] + offset[0], 0, width - 1);
+         int y = CLAMP(v_j[j] + offset[1], 0, height - 1);
+	 tx = get_texel_2d(samp, addr, x, y);
 	 for (c = 0; c < 4; c++) {
 	    rgba[c][j] = tx[c];
 	 }
@@ -2685,9 +2713,10 @@ sample_get_texels(struct tgsi_sampler *tgsi_sampler,
       break;
    case PIPE_TEXTURE_2D_ARRAY:
       for (j = 0; j < TGSI_QUAD_SIZE; j++) {
-	 tx = get_texel_2d_array(samp, addr, v_i[j] + offset[0],
-				 v_j[j] + offset[1],
-				 v_k[j] + offset[2]);
+         int x = CLAMP(v_i[j] + offset[0], 0, width - 1);
+         int y = CLAMP(v_j[j] + offset[1], 0, height - 1);
+         int layer = CLAMP(v_k[j] + offset[2], 0, layers - 1);
+	 tx = get_texel_2d_array(samp, addr, x, y, layer);
 	 for (c = 0; c < 4; c++) {
 	    rgba[c][j] = tx[c];
 	 }
@@ -2695,9 +2724,11 @@ sample_get_texels(struct tgsi_sampler *tgsi_sampler,
       break;
    case PIPE_TEXTURE_3D:
       for (j = 0; j < TGSI_QUAD_SIZE; j++) {
-	 tx = get_texel_3d(samp, addr, v_i[j] + offset[0], 
-			   v_j[j] + offset[1],
-			   v_k[j] + offset[2]);
+         int x = CLAMP(v_i[j] + offset[0], 0, width - 1);
+         int y = CLAMP(v_j[j] + offset[1], 0, height - 1);
+         int z = CLAMP(v_k[j] + offset[2], 0, depth - 1);
+
+	 tx = get_texel_3d(samp, addr, x, y, z);
 	 for (c = 0; c < 4; c++) {
 	    rgba[c][j] = tx[c];
 	 }
@@ -2715,6 +2746,8 @@ sample_get_texels(struct tgsi_sampler *tgsi_sampler,
       do_swizzling(samp, rgba_temp, rgba);
    }
 }
+
+
 /**
  * Create a sampler variant for a given set of non-orthogonal state.
  */
@@ -2830,10 +2863,7 @@ sp_create_sampler_variant( const struct pipe_sampler_state *sampler,
       samp->sample_target = samp->compare;
    }
 
-   if (key.bits.swizzle_r != PIPE_SWIZZLE_RED ||
-       key.bits.swizzle_g != PIPE_SWIZZLE_GREEN ||
-       key.bits.swizzle_b != PIPE_SWIZZLE_BLUE ||
-       key.bits.swizzle_a != PIPE_SWIZZLE_ALPHA) {
+   if (any_swizzle(key)) {
       samp->base.get_samples = sample_swizzle;
    }
    else {
diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c
index ac2d35e5ea4..64ec658b80e 100644
--- a/src/gallium/drivers/svga/svga_screen.c
+++ b/src/gallium/drivers/svga/svga_screen.c
@@ -241,7 +241,11 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_TGSI_CAN_COMPACT_VARYINGS:
    case PIPE_CAP_TGSI_CAN_COMPACT_CONSTANTS:
    case PIPE_CAP_GLSL_FEATURE_LEVEL:
+   case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY:
+   case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY:
       return 0;
+   case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY:
+      return 1;
 
    default:
       debug_printf("Unexpected PIPE_CAP_ query %u\n", param);
diff --git a/src/gallium/drivers/svga/svga_tgsi_insn.c b/src/gallium/drivers/svga/svga_tgsi_insn.c
index 5e6d1fbc904..a68912608bc 100644
--- a/src/gallium/drivers/svga/svga_tgsi_insn.c
+++ b/src/gallium/drivers/svga/svga_tgsi_insn.c
@@ -871,6 +871,31 @@ static boolean emit_floor(struct svga_shader_emitter *emit,
 }
 
 
+/* Translate the following TGSI CEIL instruction.
+ *    CEIL  DST, SRC
+ * To the following SVGA3D instruction sequence.
+ *    FRC  TMP, -SRC
+ *    ADD  DST, SRC, TMP
+ */
+static boolean emit_ceil(struct svga_shader_emitter *emit,
+                         const struct tgsi_full_instruction *insn)
+{
+   SVGA3dShaderDestToken dst = translate_dst_register(emit, insn, 0);
+   const struct src_register src0 = translate_src_register(emit, &insn->Src[0]);
+   SVGA3dShaderDestToken temp = get_temp(emit);
+
+   /* FRC  TMP, -SRC */
+   if (!submit_op1(emit, inst_token(SVGA3DOP_FRC), temp, negate(src0)))
+      return FALSE;
+
+   /* ADD DST, SRC, TMP */
+   if (!submit_op2(emit, inst_token(SVGA3DOP_ADD), dst, src0, src(temp)))
+      return FALSE;
+
+   return TRUE;
+}
+
+
 /* Translate the following TGSI CMP instruction.
  *    CMP  DST, SRC0, SRC1, SRC2
  * To the following SVGA3D instruction sequence.
@@ -2435,6 +2460,9 @@ static boolean svga_emit_instruction( struct svga_shader_emitter *emit,
    case TGSI_OPCODE_TRUNC:        /* should be TRUNC, not FLR */
       return emit_floor( emit, insn );
 
+   case TGSI_OPCODE_CEIL:
+      return emit_ceil( emit, insn );
+
    case TGSI_OPCODE_CMP:
       return emit_cmp( emit, insn );
 
diff --git a/src/gallium/include/pipe/p_context.h b/src/gallium/include/pipe/p_context.h
index 7709177444f..f59e3881232 100644
--- a/src/gallium/include/pipe/p_context.h
+++ b/src/gallium/include/pipe/p_context.h
@@ -64,6 +64,7 @@ struct pipe_vertex_element;
 struct pipe_video_buffer;
 struct pipe_video_decoder;
 struct pipe_viewport_state;
+struct pipe_compute_state;
 union pipe_color_union;
 union pipe_query_result;
 
@@ -142,6 +143,10 @@ struct pipe_context {
    void   (*bind_geometry_sampler_states)(struct pipe_context *,
                                           unsigned num_samplers,
                                           void **samplers);
+   void   (*bind_compute_sampler_states)(struct pipe_context *,
+                                         unsigned start_slot,
+                                         unsigned num_samplers,
+                                         void **samplers);
    void   (*delete_sampler_state)(struct pipe_context *, void *);
 
    void * (*create_rasterizer_state)(struct pipe_context *,
@@ -221,6 +226,26 @@ struct pipe_context {
                                       unsigned num_views,
                                       struct pipe_sampler_view **);
 
+   void (*set_compute_sampler_views)(struct pipe_context *,
+                                     unsigned start_slot, unsigned num_views,
+                                     struct pipe_sampler_view **);
+
+   /**
+    * Bind an array of shader resources that will be used by the
+    * graphics pipeline.  Any resources that were previously bound to
+    * the specified range will be unbound after this call.
+    *
+    * \param start      first resource to bind.
+    * \param count      number of consecutive resources to bind.
+    * \param resources  array of pointers to the resources to bind, it
+    *                   should contain at least \a count elements
+    *                   unless it's NULL, in which case no new
+    *                   resources will be bound.
+    */
+   void (*set_shader_resources)(struct pipe_context *,
+                                unsigned start, unsigned count,
+                                struct pipe_surface **resources);
+
    void (*set_vertex_buffers)( struct pipe_context *,
                                unsigned num_buffers,
                                const struct pipe_vertex_buffer * );
@@ -410,6 +435,86 @@ struct pipe_context {
     */
    struct pipe_video_buffer *(*create_video_buffer)( struct pipe_context *context,
                                                      const struct pipe_video_buffer *templat );
+
+   /**
+    * Compute kernel execution
+    */
+   /*@{*/
+   /**
+    * Define the compute program and parameters to be used by
+    * pipe_context::launch_grid.
+    */
+   void *(*create_compute_state)(struct pipe_context *context,
+				 const struct pipe_compute_state *);
+   void (*bind_compute_state)(struct pipe_context *, void *);
+   void (*delete_compute_state)(struct pipe_context *, void *);
+
+   /**
+    * Bind an array of shader resources that will be used by the
+    * compute program.  Any resources that were previously bound to
+    * the specified range will be unbound after this call.
+    *
+    * \param start      first resource to bind.
+    * \param count      number of consecutive resources to bind.
+    * \param resources  array of pointers to the resources to bind, it
+    *                   should contain at least \a count elements
+    *                   unless it's NULL, in which case no new
+    *                   resources will be bound.
+    */
+   void (*set_compute_resources)(struct pipe_context *,
+                                 unsigned start, unsigned count,
+                                 struct pipe_surface **resources);
+
+   /**
+    * Bind an array of buffers to be mapped into the address space of
+    * the GLOBAL resource.  Any buffers that were previously bound
+    * between [first, first + count - 1] are unbound after this call.
+    *
+    * \param first      first buffer to map.
+    * \param count      number of consecutive buffers to map.
+    * \param resources  array of pointers to the buffers to map, it
+    *                   should contain at least \a count elements
+    *                   unless it's NULL, in which case no new
+    *                   resources will be bound.
+    * \param handles    array of pointers to the memory locations that
+    *                   will be filled with the respective base
+    *                   addresses each buffer will be mapped to.  It
+    *                   should contain at least \a count elements,
+    *                   unless \a resources is NULL in which case \a
+    *                   handles should be NULL as well.
+    *
+    * Note that the driver isn't required to make any guarantees about
+    * the contents of the \a handles array being valid anytime except
+    * during the subsequent calls to pipe_context::launch_grid.  This
+    * means that the only sensible location handles[i] may point to is
+    * somewhere within the INPUT buffer itself.  This is so to
+    * accommodate implementations that lack virtual memory but
+    * nevertheless migrate buffers on the fly, leading to resource
+    * base addresses that change on each kernel invocation or are
+    * unknown to the pipe driver.
+    */
+   void (*set_global_binding)(struct pipe_context *context,
+                              unsigned first, unsigned count,
+                              struct pipe_resource **resources,
+                              uint32_t **handles);
+
+   /**
+    * Launch the compute kernel starting from instruction \a pc of the
+    * currently bound compute program.
+    *
+    * \a grid_layout and \a block_layout are arrays of size \a
+    * PIPE_COMPUTE_CAP_GRID_DIMENSION that determine the layout of the
+    * grid (in block units) and working block (in thread units) to be
+    * used, respectively.
+    *
+    * \a input will be used to initialize the INPUT resource, and it
+    * should point to a buffer of at least
+    * pipe_compute_state::req_input_mem bytes.
+    */
+   void (*launch_grid)(struct pipe_context *context,
+                       const uint *block_layout, const uint *grid_layout,
+                       uint32_t pc, const void *input);
+   /*@}*/
 };
 
 
diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h
index 398cb98248c..1e05cc4caee 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -304,6 +304,9 @@ enum pipe_transfer_usage {
 #define PIPE_BIND_STREAM_OUTPUT        (1 << 11) /* set_stream_output_buffers */
 #define PIPE_BIND_CURSOR               (1 << 16) /* mouse cursor */
 #define PIPE_BIND_CUSTOM               (1 << 17) /* state-tracker/winsys usages */
+#define PIPE_BIND_GLOBAL               (1 << 18) /* set_global_binding */
+#define PIPE_BIND_SHADER_RESOURCE      (1 << 19) /* set_shader_resources */
+#define PIPE_BIND_COMPUTE_RESOURCE     (1 << 20) /* set_compute_resources */
 
 /* The first two flags above were previously part of the amorphous
  * TEXTURE_USAGE, most of which are now descriptions of the ways a
@@ -346,7 +349,8 @@ enum pipe_transfer_usage {
 #define PIPE_SHADER_VERTEX   0
 #define PIPE_SHADER_FRAGMENT 1
 #define PIPE_SHADER_GEOMETRY 2
-#define PIPE_SHADER_TYPES    3
+#define PIPE_SHADER_COMPUTE  3
+#define PIPE_SHADER_TYPES    4
 
 
 /**
@@ -477,9 +481,10 @@ enum pipe_cap {
    PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY = 65,
    PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY = 66,
    PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY = 67,
-   PIPE_CAP_USER_INDEX_BUFFERS = 68,
-   PIPE_CAP_USER_CONSTANT_BUFFERS = 69,
-   PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT = 70
+   PIPE_CAP_COMPUTE = 68,
+   PIPE_CAP_USER_INDEX_BUFFERS = 69,
+   PIPE_CAP_USER_CONSTANT_BUFFERS = 70,
+   PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT = 71
 };
 
 /**
@@ -522,9 +527,32 @@ enum pipe_shader_cap
    PIPE_SHADER_CAP_INDIRECT_CONST_ADDR = 15,
    PIPE_SHADER_CAP_SUBROUTINES = 16, /* BGNSUB, ENDSUB, CAL, RET */
    PIPE_SHADER_CAP_INTEGERS = 17,
-   PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS = 18
+   PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS = 18,
+   PIPE_SHADER_CAP_PREFERRED_IR = 19
 };
 
+/**
+ * Shader intermediate representation.
+ */
+enum pipe_shader_ir
+{
+   PIPE_SHADER_IR_TGSI
+};
+
+/**
+ * Compute-specific implementation capabilities.  They can be queried
+ * using pipe_screen::get_compute_param.
+ */
+enum pipe_compute_cap
+{
+   PIPE_COMPUTE_CAP_GRID_DIMENSION,
+   PIPE_COMPUTE_CAP_MAX_GRID_SIZE,
+   PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE,
+   PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE,
+   PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE,
+   PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE,
+   PIPE_COMPUTE_CAP_MAX_INPUT_SIZE
+};
 
 /**
  * Composite query types
diff --git a/src/gallium/include/pipe/p_screen.h b/src/gallium/include/pipe/p_screen.h
index 45c441b2fcf..7ae7c9a04e1 100644
--- a/src/gallium/include/pipe/p_screen.h
+++ b/src/gallium/include/pipe/p_screen.h
@@ -98,6 +98,18 @@ struct pipe_screen {
 			   enum pipe_video_profile profile,
 			   enum pipe_video_cap param );
 
+   /**
+    * Query a compute-specific capability/parameter/limit.
+    * \param param  one of PIPE_COMPUTE_CAP_x
+    * \param ret    pointer to a preallocated buffer that will be
+    *               initialized to the parameter value, or NULL.
+    * \return       size in bytes of the parameter value that would be
+    *               returned.
+    */
+   int (*get_compute_param)(struct pipe_screen *,
+			    enum pipe_compute_cap param,
+			    void *ret);
+
    struct pipe_context * (*context_create)( struct pipe_screen *,
 					    void *priv );
 
diff --git a/src/gallium/include/pipe/p_shader_tokens.h b/src/gallium/include/pipe/p_shader_tokens.h
index df2dd5e618e..6b58293f409 100644
--- a/src/gallium/include/pipe/p_shader_tokens.h
+++ b/src/gallium/include/pipe/p_shader_tokens.h
@@ -43,6 +43,7 @@ struct tgsi_header
 #define TGSI_PROCESSOR_FRAGMENT  0
 #define TGSI_PROCESSOR_VERTEX    1
 #define TGSI_PROCESSOR_GEOMETRY  2
+#define TGSI_PROCESSOR_COMPUTE   3
 
 struct tgsi_processor
 {
@@ -76,6 +77,7 @@ enum tgsi_file_type {
    TGSI_FILE_IMMEDIATE_ARRAY     =10,
    TGSI_FILE_TEMPORARY_ARRAY     =11,
    TGSI_FILE_RESOURCE            =12,
+   TGSI_FILE_SAMPLER_VIEW        =13,
    TGSI_FILE_COUNT      /**< how many TGSI_FILE_ types */
 };
 
@@ -114,12 +116,12 @@ struct tgsi_declaration
    unsigned NrTokens    : 8;  /**< UINT */
    unsigned File        : 4;  /**< one of TGSI_FILE_x */
    unsigned UsageMask   : 4;  /**< bitmask of TGSI_WRITEMASK_x flags */
-   unsigned Interpolate : 4;  /**< one of TGSI_INTERPOLATE_x */
    unsigned Dimension   : 1;  /**< any extra dimension info? */
    unsigned Semantic    : 1;  /**< BOOL, any semantic info? */
-   unsigned Centroid    : 1;  /**< centroid sampling? */
+   unsigned Interpolate : 1;  /**< any interpolation info? */
    unsigned Invariant   : 1;  /**< invariant optimization? */
-   unsigned CylindricalWrap:4;   /**< TGSI_CYLINDRICAL_WRAP_x flags */
+   unsigned Local       : 1;  /**< optimize as subroutine local variable? */
+   unsigned Padding     : 7;
 };
 
 struct tgsi_declaration_range
@@ -134,6 +136,14 @@ struct tgsi_declaration_dimension
    unsigned Padding:16;
 };
 
+struct tgsi_declaration_interp
+{
+   unsigned Interpolate : 4;   /**< one of TGSI_INTERPOLATE_x */
+   unsigned Centroid    : 1;   /**< centroid sampling? */
+   unsigned CylindricalWrap:4; /**< TGSI_CYLINDRICAL_WRAP_x flags */
+   unsigned Padding     : 23;
+};
+
 #define TGSI_SEMANTIC_POSITION   0
 #define TGSI_SEMANTIC_COLOR      1
 #define TGSI_SEMANTIC_BCOLOR     2  /**< back-face color */
@@ -149,7 +159,11 @@ struct tgsi_declaration_dimension
 #define TGSI_SEMANTIC_STENCIL    12
 #define TGSI_SEMANTIC_CLIPDIST   13
 #define TGSI_SEMANTIC_CLIPVERTEX 14
-#define TGSI_SEMANTIC_COUNT      15 /**< number of semantic values */
+#define TGSI_SEMANTIC_GRID_SIZE  15 /**< grid size in blocks */
+#define TGSI_SEMANTIC_BLOCK_ID   16 /**< id of the current block */
+#define TGSI_SEMANTIC_BLOCK_SIZE 17 /**< block size in threads */
+#define TGSI_SEMANTIC_THREAD_ID  18 /**< block-relative id of the current thread */
+#define TGSI_SEMANTIC_COUNT      19 /**< number of semantic values */
 
 struct tgsi_declaration_semantic
 {
@@ -159,6 +173,13 @@ struct tgsi_declaration_semantic
 };
 
 struct tgsi_declaration_resource {
+   unsigned Resource    : 8; /**< one of TGSI_TEXTURE_ */
+   unsigned Raw         : 1;
+   unsigned Writable    : 1;
+   unsigned Padding     : 22;
+};
+
+struct tgsi_declaration_sampler_view {
    unsigned Resource    : 8; /**< one of TGSI_TEXTURE_ */
    unsigned ReturnTypeX : 6; /**< one of enum pipe_type */
    unsigned ReturnTypeY : 6; /**< one of enum pipe_type */
@@ -166,6 +187,15 @@ struct tgsi_declaration_resource {
    unsigned ReturnTypeW : 6; /**< one of enum pipe_type */
 };
 
+/*
+ * Special resources that don't need to be declared.  They map to the
+ * GLOBAL/LOCAL/PRIVATE/INPUT compute memory spaces.
+ */
+#define TGSI_RESOURCE_GLOBAL	0x7fff
+#define TGSI_RESOURCE_LOCAL	0x7ffe
+#define TGSI_RESOURCE_PRIVATE	0x7ffd
+#define TGSI_RESOURCE_INPUT	0x7ffc
+
 #define TGSI_IMM_FLOAT32   0
 #define TGSI_IMM_UINT32    1
 #define TGSI_IMM_INT32     2
@@ -363,16 +393,16 @@ struct tgsi_property_data {
 #define TGSI_OPCODE_ENDSWITCH           144
 
 /* resource related opcodes */
-#define TGSI_OPCODE_LOAD                145
-#define TGSI_OPCODE_LOAD_MS             146
-#define TGSI_OPCODE_SAMPLE              147
+#define TGSI_OPCODE_SAMPLE              145
+#define TGSI_OPCODE_SAMPLE_I            146
+#define TGSI_OPCODE_SAMPLE_I_MS         147
 #define TGSI_OPCODE_SAMPLE_B            148
 #define TGSI_OPCODE_SAMPLE_C            149
 #define TGSI_OPCODE_SAMPLE_C_LZ         150
 #define TGSI_OPCODE_SAMPLE_D            151
 #define TGSI_OPCODE_SAMPLE_L            152
 #define TGSI_OPCODE_GATHER4             153
-#define TGSI_OPCODE_RESINFO             154
+#define TGSI_OPCODE_SVIEWINFO           154
 #define TGSI_OPCODE_SAMPLE_POS          155
 #define TGSI_OPCODE_SAMPLE_INFO         156
 
@@ -381,7 +411,26 @@ struct tgsi_property_data {
 #define TGSI_OPCODE_IABS                159
 #define TGSI_OPCODE_ISSG                160
 
-#define TGSI_OPCODE_LAST                161
+#define TGSI_OPCODE_LOAD                161
+#define TGSI_OPCODE_STORE               162
+
+#define TGSI_OPCODE_MFENCE              163
+#define TGSI_OPCODE_LFENCE              164
+#define TGSI_OPCODE_SFENCE              165
+#define TGSI_OPCODE_BARRIER             166
+
+#define TGSI_OPCODE_ATOMUADD            167
+#define TGSI_OPCODE_ATOMXCHG            168
+#define TGSI_OPCODE_ATOMCAS             169
+#define TGSI_OPCODE_ATOMAND             170
+#define TGSI_OPCODE_ATOMOR              171
+#define TGSI_OPCODE_ATOMXOR             172
+#define TGSI_OPCODE_ATOMUMIN            173
+#define TGSI_OPCODE_ATOMUMAX            174
+#define TGSI_OPCODE_ATOMIMIN            175
+#define TGSI_OPCODE_ATOMIMAX            176
+
+#define TGSI_OPCODE_LAST                177
 
 #define TGSI_SAT_NONE            0  /* do not saturate */
 #define TGSI_SAT_ZERO_ONE        1  /* clamp to [0,1] */
@@ -441,7 +490,7 @@ struct tgsi_instruction_label
    unsigned Padding  : 8;
 };
 
-#define TGSI_TEXTURE_UNKNOWN        0
+#define TGSI_TEXTURE_BUFFER         0
 #define TGSI_TEXTURE_1D             1
 #define TGSI_TEXTURE_2D             2
 #define TGSI_TEXTURE_3D             3
@@ -455,7 +504,8 @@ struct tgsi_instruction_label
 #define TGSI_TEXTURE_SHADOW1D_ARRAY 11
 #define TGSI_TEXTURE_SHADOW2D_ARRAY 12
 #define TGSI_TEXTURE_SHADOWCUBE     13
-#define TGSI_TEXTURE_COUNT          14
+#define TGSI_TEXTURE_UNKNOWN        14
+#define TGSI_TEXTURE_COUNT          15
 
 struct tgsi_instruction_texture
 {
diff --git a/src/gallium/include/pipe/p_state.h b/src/gallium/include/pipe/p_state.h
index 3bc35bc77ce..51a956d9532 100644
--- a/src/gallium/include/pipe/p_state.h
+++ b/src/gallium/include/pipe/p_state.h
@@ -62,6 +62,7 @@ extern "C" {
 #define PIPE_MAX_GEOMETRY_SAMPLERS  16
 #define PIPE_MAX_SHADER_INPUTS    32
 #define PIPE_MAX_SHADER_OUTPUTS   32
+#define PIPE_MAX_SHADER_SAMPLER_VIEWS 32
 #define PIPE_MAX_SHADER_RESOURCES 32
 #define PIPE_MAX_TEXTURE_LEVELS   16
 #define PIPE_MAX_SO_BUFFERS        4
@@ -337,6 +338,7 @@ struct pipe_surface
    unsigned height;              /**< logical height in pixels */
 
    unsigned usage;               /**< bitmask of PIPE_BIND_x */
+   unsigned writable:1;          /**< writable shader resource */
 
    union {
       struct {
@@ -591,6 +593,13 @@ struct pipe_resolve_info
    unsigned mask; /**< PIPE_MASK_RGBA, Z, S or ZS */
 };
 
+struct pipe_compute_state
+{
+   const void *prog; /**< Compute program to be executed. */
+   unsigned req_local_mem; /**< Required size of the LOCAL resource. */
+   unsigned req_private_mem; /**< Required size of the PRIVATE resource. */
+   unsigned req_input_mem; /**< Required size of the INPUT resource. */
+};
 
 #ifdef __cplusplus
 }
diff --git a/src/gallium/state_trackers/Makefile b/src/gallium/state_trackers/Makefile
index 0900efc664f..d5162c17507 100644
--- a/src/gallium/state_trackers/Makefile
+++ b/src/gallium/state_trackers/Makefile
@@ -17,7 +17,7 @@ subdirs:
 
 
 clean:
-	rm -f `find . -name \*.[oa]`
+	rm -f `find . -regex '.*\.l?[oa]'`
 	rm -f `find . -name depend`
 
 
diff --git a/src/gallium/state_trackers/clover/Doxyfile b/src/gallium/state_trackers/clover/Doxyfile
new file mode 100644
index 00000000000..50250e75672
--- /dev/null
+++ b/src/gallium/state_trackers/clover/Doxyfile
@@ -0,0 +1,1716 @@
+# Doxyfile 1.7.4
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project.
+#
+# All text after a hash (#) is considered a comment and will be ignored.
+# The format is:
+#       TAG = value [value, ...]
+# For lists, items can also be appended using:
+#       TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (" ").
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the config file
+# that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
+# http://www.gnu.org/software/libiconv for the list of possible encodings.
+
+DOXYFILE_ENCODING      = UTF-8
+
+# The PROJECT_NAME tag is a single word (or a sequence of words surrounded
+# by quotes) that should identify the project.
+
+PROJECT_NAME           = Clover
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number.
+# This could be handy for archiving the generated documentation or
+# if some version control system is used.
+
+PROJECT_NUMBER         =
+
+# Using the PROJECT_BRIEF tag one can provide an optional one line description
+# for a project that appears at the top of each page and should give viewer
+# a quick idea about the purpose of the project. Keep the description short.
+
+PROJECT_BRIEF          =
+
+# With the PROJECT_LOGO tag one can specify a logo or icon that is
+# included in the documentation. The maximum height of the logo should not
+# exceed 55 pixels and the maximum width should not exceed 200 pixels.
+# Doxygen will copy the logo to the output directory.
+
+PROJECT_LOGO           =
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
+# base path where the generated documentation will be put.
+# If a relative path is entered, it will be relative to the location
+# where doxygen was started. If left blank the current directory will be used.
+
+OUTPUT_DIRECTORY       =
+
+# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create
+# 4096 sub-directories (in 2 levels) under the output directory of each output
+# format and will distribute the generated files over these directories.
+# Enabling this option can be useful when feeding doxygen a huge amount of
+# source files, where putting all generated files in the same directory would
+# otherwise cause performance problems for the file system.
+
+CREATE_SUBDIRS         = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# The default language is English, other supported languages are:
+# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional,
+# Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German,
+# Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English
+# messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian,
+# Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak,
+# Slovene, Spanish, Swedish, Ukrainian, and Vietnamese.
+
+OUTPUT_LANGUAGE        = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will
+# include brief member descriptions after the members that are listed in
+# the file and class documentation (similar to JavaDoc).
+# Set to NO to disable this.
+
+BRIEF_MEMBER_DESC      = YES
+
+# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend
+# the brief description of a member or function before the detailed description.
+# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+
+REPEAT_BRIEF           = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator
+# that is used to form the text in various listings. Each string
+# in this list, if found as the leading text of the brief description, will be
+# stripped from the text and the result after processing the whole list, is
+# used as the annotated text. Otherwise, the brief description is used as-is.
+# If left blank, the following values are used ("$name" is automatically
+# replaced with the name of the entity): "The $name class" "The $name widget"
+# "The $name file" "is" "provides" "specifies" "contains"
+# "represents" "a" "an" "the"
+
+ABBREVIATE_BRIEF       =
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# Doxygen will generate a detailed section even if there is only a brief
+# description.
+
+ALWAYS_DETAILED_SEC    = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+
+INLINE_INHERITED_MEMB  = NO
+
+# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full
+# path before files name in the file list and in the header files. If set
+# to NO the shortest path that makes the file name unique will be used.
+
+FULL_PATH_NAMES        = YES
+
+# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag
+# can be used to strip a user-defined part of the path. Stripping is
+# only done if one of the specified strings matches the left-hand part of
+# the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the
+# path to strip.
+
+STRIP_FROM_PATH        =
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of
+# the path mentioned in the documentation of a class, which tells
+# the reader which header file to include in order to use a class.
+# If left blank only the name of the header file containing the class
+# definition is used. Otherwise one should specify the include paths that
+# are normally passed to the compiler using the -I flag.
+
+STRIP_FROM_INC_PATH    =
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter
+# (but less readable) file names. This can be useful if your file system
+# doesn't support long names like on DOS, Mac, or CD-ROM.
+
+SHORT_NAMES            = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen
+# will interpret the first line (until the first dot) of a JavaDoc-style
+# comment as the brief description. If set to NO, the JavaDoc
+# comments will behave just like regular Qt-style comments
+# (thus requiring an explicit @brief command for a brief description.)
+
+JAVADOC_AUTOBRIEF      = YES
+
+# If the QT_AUTOBRIEF tag is set to YES then Doxygen will
+# interpret the first line (until the first dot) of a Qt-style
+# comment as the brief description. If set to NO, the comments
+# will behave just like regular Qt-style comments (thus requiring
+# an explicit \brief command for a brief description.)
+
+QT_AUTOBRIEF           = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen
+# treat a multi-line C++ special comment block (i.e. a block of //! or ///
+# comments) as a brief description. This used to be the default behaviour.
+# The new default is to treat a multi-line C++ comment block as a detailed
+# description. Set this tag to YES if you prefer the old behaviour instead.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented
+# member inherits the documentation from any documented member that it
+# re-implements.
+
+INHERIT_DOCS           = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce
+# a new page for each member. If set to NO, the documentation of a member will
+# be part of the file/class/namespace that contains it.
+
+SEPARATE_MEMBER_PAGES  = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab.
+# Doxygen uses this value to replace tabs by spaces in code fragments.
+
+TAB_SIZE               = 8
+
+# This tag can be used to specify a number of aliases that act
+# as commands in the documentation. An alias has the form "name=value".
+# For example adding "sideeffect=\par Side Effects:\n" will allow you to
+# put the command \sideeffect (or @sideeffect) in the documentation, which
+# will result in a user-defined paragraph with heading "Side Effects:".
+# You can put \n's in the value part of an alias to insert newlines.
+
+ALIASES                =
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C
+# sources only. Doxygen will then generate output that is more tailored for C.
+# For instance, some of the names that are used will be different. The list
+# of all members will be omitted, etc.
+
+OPTIMIZE_OUTPUT_FOR_C  = NO
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java
+# sources only. Doxygen will then generate output that is more tailored for
+# Java. For instance, namespaces will be presented as packages, qualified
+# scopes will look different, etc.
+
+OPTIMIZE_OUTPUT_JAVA   = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources only. Doxygen will then generate output that is more tailored for
+# Fortran.
+
+OPTIMIZE_FOR_FORTRAN   = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for
+# VHDL.
+
+OPTIMIZE_OUTPUT_VHDL   = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it
+# parses. With this tag you can assign which parser to use for a given extension.
+# Doxygen has a built-in mapping, but you can override or extend it using this
+# tag. The format is ext=language, where ext is a file extension, and language
+# is one of the parsers supported by doxygen: IDL, Java, Javascript, CSharp, C,
+# C++, D, PHP, Objective-C, Python, Fortran, VHDL. For instance, to make
+# doxygen treat .inc files as Fortran files (default is PHP), and .f files as C
+# (default is Fortran), use: inc=Fortran f=C. Note that for custom extensions
+# you also need to set FILE_PATTERNS otherwise the files are not read by doxygen.
+
+EXTENSION_MAPPING      =
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should
+# set this tag to YES in order to let doxygen match functions declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string); vs.
+# func(std::string) {}). This also makes the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+
+BUILTIN_STL_SUPPORT    = YES
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+
+CPP_CLI_SUPPORT        = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only.
+# Doxygen will parse them like normal C++ but will assume all classes use public
+# instead of private inheritance when no explicit protection keyword is present.
+
+SIP_SUPPORT            = NO
+
+# For Microsoft's IDL there are propget and propput attributes to indicate getter
+# and setter methods for a property. Setting this option to YES (the default)
+# will make doxygen replace the get and set methods by a property in the
+# documentation. This will only work if the methods are indeed getting or
+# setting a simple type. If this is not the case, or you want to show the
+# methods anyway, you should set this option to NO.
+
+IDL_PROPERTY_SUPPORT   = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES, then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+
+DISTRIBUTE_GROUP_DOC   = NO
+
+# Set the SUBGROUPING tag to YES (the default) to allow class member groups of
+# the same type (for instance a group of public functions) to be put as a
+# subgroup of that type (e.g. under the Public Functions section). Set it to
+# NO to prevent subgrouping. Alternatively, this can be done per class using
+# the \nosubgrouping command.
+
+SUBGROUPING            = YES
+
+# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and
+# unions are shown inside the group in which they are included (e.g. using
+# @ingroup) instead of on a separate page (for HTML and Man pages) or
+# section (for LaTeX and RTF).
+
+INLINE_GROUPED_CLASSES = NO
+
+# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum
+# is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically
+# be useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+
+TYPEDEF_HIDES_STRUCT   = NO
+
+# The SYMBOL_CACHE_SIZE determines the size of the internal cache used to
+# determine which symbols to keep in memory and which to flush to disk.
+# When the cache is full, less often used symbols will be written to disk.
+# For small to medium size projects (<1000 input files) the default value is
+# probably good enough. For larger projects a too small cache size can cause
+# doxygen to be busy swapping symbols to and from disk most of the time
+# causing a significant performance penalty.
+# If the system has enough physical memory increasing the cache will improve the
+# performance by keeping more symbols in memory. Note that the value works on
+# a logarithmic scale so increasing the size by one will roughly double the
+# memory usage. The cache size is given by this formula:
+# 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0,
+# corresponding to a cache size of 2^16 = 65536 symbols
+
+SYMBOL_CACHE_SIZE      = 0
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
+# documentation are documented, even if no documentation was available.
+# Private class members and static file members will be hidden unless
+# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES
+
+EXTRACT_ALL            = NO
+
+# If the EXTRACT_PRIVATE tag is set to YES all private members of a class
+# will be included in the documentation.
+
+EXTRACT_PRIVATE        = NO
+
+# If the EXTRACT_STATIC tag is set to YES all static members of a file
+# will be included in the documentation.
+
+EXTRACT_STATIC         = NO
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs)
+# defined locally in source files will be included in the documentation.
+# If set to NO only classes defined in header files are included.
+
+EXTRACT_LOCAL_CLASSES  = YES
+
+# This flag is only useful for Objective-C code. When set to YES local
+# methods, which are defined in the implementation section but not in
+# the interface are included in the documentation.
+# If set to NO (the default) only methods in the interface are included.
+
+EXTRACT_LOCAL_METHODS  = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base
+# name of the file that contains the anonymous namespace. By default
+# anonymous namespaces are hidden.
+
+EXTRACT_ANON_NSPACES   = YES
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all
+# undocumented members of documented classes, files or namespaces.
+# If set to NO (the default) these members will be included in the
+# various overviews, but no documentation section is generated.
+# This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_MEMBERS     = NO
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy.
+# If set to NO (the default) these classes will be included in the various
+# overviews. This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_CLASSES     = NO
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all
+# friend (class|struct|union) declarations.
+# If set to NO (the default) these declarations will be included in the
+# documentation.
+
+HIDE_FRIEND_COMPOUNDS  = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any
+# documentation blocks found inside the body of a function.
+# If set to NO (the default) these blocks will be appended to the
+# function's detailed documentation block.
+
+HIDE_IN_BODY_DOCS      = NO
+
+# The INTERNAL_DOCS tag determines if documentation
+# that is typed after a \internal command is included. If the tag is set
+# to NO (the default) then the documentation will be excluded.
+# Set it to YES to include the internal documentation.
+
+INTERNAL_DOCS          = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate
+# file names in lower-case letters. If set to YES upper-case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
+# and Mac users are advised to set this option to NO.
+
+CASE_SENSE_NAMES       = YES
+
+# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen
+# will show members with their full class and namespace scopes in the
+# documentation. If set to YES the scope will be hidden.
+
+HIDE_SCOPE_NAMES       = NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen
+# will put a list of the files that are included by a file in the documentation
+# of that file.
+
+SHOW_INCLUDE_FILES     = YES
+
+# If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen
+# will list include files with double quotes in the documentation
+# rather than with sharp brackets.
+
+FORCE_LOCAL_INCLUDES   = NO
+
+# If the INLINE_INFO tag is set to YES (the default) then a tag [inline]
+# is inserted in the documentation for inline members.
+
+INLINE_INFO            = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen
+# will sort the (detailed) documentation of file and class members
+# alphabetically by member name. If set to NO the members will appear in
+# declaration order.
+
+SORT_MEMBER_DOCS       = YES
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the
+# brief documentation of file, namespace and class members alphabetically
+# by member name. If set to NO (the default) the members will appear in
+# declaration order.
+
+SORT_BRIEF_DOCS        = NO
+
+# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen
+# will sort the (brief and detailed) documentation of class members so that
+# constructors and destructors are listed first. If set to NO (the default)
+# the constructors will appear in the respective orders defined by
+# SORT_MEMBER_DOCS and SORT_BRIEF_DOCS.
+# This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO
+# and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO.
+
+SORT_MEMBERS_CTORS_1ST = NO
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the
+# hierarchy of group names into alphabetical order. If set to NO (the default)
+# the group names will appear in their defined order.
+
+SORT_GROUP_NAMES       = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be
+# sorted by fully-qualified names, including namespaces. If set to
+# NO (the default), the class list will be sorted only by class name,
+# not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the
+# alphabetical list.
+
+SORT_BY_SCOPE_NAME     = NO
+
+# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to
+# do proper type resolution of all parameters of a function it will reject a
+# match between the prototype and the implementation of a member function even
+# if there is only one candidate or it is obvious which candidate to choose
+# by doing a simple string match. By disabling STRICT_PROTO_MATCHING doxygen
+# will still accept a match between prototype and implementation in such cases.
+
+STRICT_PROTO_MATCHING  = NO
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or
+# disable (NO) the todo list. This list is created by putting \todo
+# commands in the documentation.
+
+GENERATE_TODOLIST      = YES
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or
+# disable (NO) the test list. This list is created by putting \test
+# commands in the documentation.
+
+GENERATE_TESTLIST      = YES
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or
+# disable (NO) the bug list. This list is created by putting \bug
+# commands in the documentation.
+
+GENERATE_BUGLIST       = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or
+# disable (NO) the deprecated list. This list is created by putting
+# \deprecated commands in the documentation.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional
+# documentation sections, marked by \if sectionname ... \endif.
+
+ENABLED_SECTIONS       =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines
+# the initial value of a variable or macro consists of for it to appear in
+# the documentation. If the initializer consists of more lines than specified
+# here it will be hidden. Use a value of 0 to hide initializers completely.
+# The appearance of the initializer of individual variables and macros in the
+# documentation can be controlled using \showinitializer or \hideinitializer
+# command in the documentation regardless of this setting.
+
+MAX_INITIALIZER_LINES  = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated
+# at the bottom of the documentation of classes and structs. If set to YES the
+# list will mention the files that were used to generate the documentation.
+
+SHOW_USED_FILES        = YES
+
+# If the sources in your project are distributed over multiple directories
+# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy
+# in the documentation. The default is NO.
+
+SHOW_DIRECTORIES       = NO
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page.
+# This will remove the Files entry from the Quick Index and from the
+# Folder Tree View (if specified). The default is YES.
+
+SHOW_FILES             = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the
+# Namespaces page.
+# This will remove the Namespaces entry from the Quick Index
+# and from the Folder Tree View (if specified). The default is YES.
+
+SHOW_NAMESPACES        = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command <command> <input-file>, where <command> is the value of
+# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
+# provided by doxygen. Whatever the program writes to standard output
+# is used as the file version. See the manual for examples.
+
+FILE_VERSION_FILTER    =
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
+# by doxygen. The layout file controls the global structure of the generated
+# output files in an output format independent way. To create the layout file
+# that represents doxygen's defaults, run doxygen with the -l option.
+# You can optionally specify a file name after the option, if omitted
+# DoxygenLayout.xml will be used as the name of the layout file.
+
+LAYOUT_FILE            =
+
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated
+# by doxygen. Possible values are YES and NO. If left blank NO is used.
+
+QUIET                  = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated by doxygen. Possible values are YES and NO. If left blank
+# NO is used.
+
+WARNINGS               = YES
+
+# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings
+# for undocumented members. If EXTRACT_ALL is set to YES then this flag will
+# automatically be disabled.
+
+WARN_IF_UNDOCUMENTED   = NO
+
+# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some
+# parameters in a documented function, or documenting parameters that
+# don't exist or using markup commands wrongly.
+
+WARN_IF_DOC_ERROR      = YES
+
+# The WARN_NO_PARAMDOC option can be enabled to get warnings for
+# functions that are documented, but have no documentation for their parameters
+# or return value. If set to NO (the default) doxygen will only warn about
+# wrong or incomplete parameter documentation, but not about the absence of
+# documentation.
+
+WARN_NO_PARAMDOC       = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that
+# doxygen can produce. The string should contain the $file, $line, and $text
+# tags, which will be replaced by the file and line number from which the
+# warning originated and the warning text. Optionally the format may contain
+# $version, which will be replaced by the version of the file (if it could
+# be obtained via FILE_VERSION_FILTER)
+
+WARN_FORMAT            = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning
+# and error messages should be written. If left blank the output is written
+# to stderr.
+
+WARN_LOGFILE           =
+
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag can be used to specify the files and/or directories that contain
+# documented source files. You may enter file names like "myfile.cpp" or
+# directories like "/usr/src/myproject". Separate the files or directories
+# with spaces.
+
+INPUT                  = api/ core/
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is
+# also the default input encoding. Doxygen uses libiconv (or the iconv built
+# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for
+# the list of possible encodings.
+
+INPUT_ENCODING         = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank the following patterns are tested:
+# *.c *.cc *.cxx *.cpp *.c++ *.d *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh
+# *.hxx *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.dox *.py
+# *.f90 *.f *.for *.vhd *.vhdl
+
+FILE_PATTERNS          =
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories
+# should be searched for input files as well. Possible values are YES and NO.
+# If left blank NO is used.
+
+RECURSIVE              = NO
+
+# The EXCLUDE tag can be used to specify files and/or directories that should
+# be excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+
+EXCLUDE                =
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix file system feature) are excluded
+# from the input.
+
+EXCLUDE_SYMLINKS       = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories. Note that the wildcards are matched
+# against the file with absolute path, so to exclude all test directories
+# for example use the pattern */test/*
+
+EXCLUDE_PATTERNS       =
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# AClass::ANamespace, ANamespace::*Test
+
+EXCLUDE_SYMBOLS        =
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or
+# directories that contain example code fragments that are included (see
+# the \include command).
+
+EXAMPLE_PATH           =
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank all files are included.
+
+EXAMPLE_PATTERNS       =
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude
+# commands irrespective of the value of the RECURSIVE tag.
+# Possible values are YES and NO. If left blank NO is used.
+
+EXAMPLE_RECURSIVE      = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or
+# directories that contain images that are included in the documentation (see
+# the \image command).
+
+IMAGE_PATH             =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command <filter> <input-file>, where <filter>
+# is the value of the INPUT_FILTER tag, and <input-file> is the name of an
+# input file. Doxygen will then use the output that the filter program writes
+# to standard output.
+# If FILTER_PATTERNS is specified, this tag will be
+# ignored.
+
+INPUT_FILTER           =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis.
+# Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match.
+# The filters are a list of the form:
+# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further
+# info on how filters are used. If FILTER_PATTERNS is empty or if
+# none of the patterns match the file name, INPUT_FILTER is applied.
+
+FILTER_PATTERNS        =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will be used to filter the input files when producing source
+# files to browse (i.e. when SOURCE_BROWSER is set to YES).
+
+FILTER_SOURCE_FILES    = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERNS (if any)
+# and it is also possible to disable source filtering for a specific pattern
+# using *.ext= (so without naming a filter). This option only has effect when
+# FILTER_SOURCE_FILES is enabled.
+
+FILTER_SOURCE_PATTERNS =
+
+#---------------------------------------------------------------------------
+# configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will
+# be generated. Documented entities will be cross-referenced with these sources.
+# Note: To get rid of all source code in the generated output, make sure also
+# VERBATIM_HEADERS is set to NO.
+
+SOURCE_BROWSER         = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body
+# of functions and classes directly in the documentation.
+
+INLINE_SOURCES         = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct
+# doxygen to hide any special comment blocks from generated source code
+# fragments. Normal C and C++ comments will always remain visible.
+
+STRIP_CODE_COMMENTS    = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES
+# then for each documented function all documented
+# functions referencing it will be listed.
+
+REFERENCED_BY_RELATION = NO
+
+# If the REFERENCES_RELATION tag is set to YES
+# then for each documented function all documented entities
+# called/used by that function will be listed.
+
+REFERENCES_RELATION    = NO
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES (the default)
+# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from
+# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will
+# link to the source code.
+# Otherwise they will link to the documentation.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code
+# will point to the HTML generated by the htags(1) tool instead of doxygen
+# built-in source browser. The htags tool is part of GNU's global source
+# tagging system (see http://www.gnu.org/software/global/global.html). You
+# will need version 4.8.6 or higher.
+
+USE_HTAGS              = NO
+
+# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen
+# will generate a verbatim copy of the header file for each class for
+# which an include is specified. Set to NO to disable this.
+
+VERBATIM_HEADERS       = YES
+
+#---------------------------------------------------------------------------
+# configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index
+# of all compounds will be generated. Enable this if the project
+# contains a lot of classes, structs, unions or interfaces.
+
+ALPHABETICAL_INDEX     = YES
+
+# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then
+# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns
+# in which this list will be split (can be a number in the range [1..20])
+
+COLS_IN_ALPHA_INDEX    = 5
+
+# In case all classes in a project start with a common prefix, all
+# classes will be put under the same header in the alphabetical index.
+# The IGNORE_PREFIX tag can be used to specify one or more prefixes that
+# should be ignored while generating the index headers.
+
+IGNORE_PREFIX          =
+
+#---------------------------------------------------------------------------
+# configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES (the default) Doxygen will
+# generate HTML output.
+
+GENERATE_HTML          = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `html' will be used as the default path.
+
+HTML_OUTPUT            = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for
+# each generated HTML page (for example: .htm,.php,.asp). If it is left blank
+# doxygen will generate files with .html extension.
+
+HTML_FILE_EXTENSION    = .html
+
+# The HTML_HEADER tag can be used to specify a personal HTML header for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard header. Note that when using a custom header you are responsible
+# for the proper inclusion of any scripts and style sheets that doxygen
+# needs, which is dependent on the configuration options used.
+# It is advised to generate a default header using "doxygen -w html
+# header.html footer.html stylesheet.css YourConfigFile" and then modify
+# that header. Note that the header is subject to change so you typically
+# have to redo this when upgrading to a newer version of doxygen or when changing the value of configuration settings such as GENERATE_TREEVIEW!
+
+HTML_HEADER            =
+
+# The HTML_FOOTER tag can be used to specify a personal HTML footer for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard footer.
+
+HTML_FOOTER            =
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading
+# style sheet that is used by each HTML page. It can be used to
+# fine-tune the look of the HTML output. If the tag is left blank doxygen
+# will generate a default style sheet. Note that doxygen will try to copy
+# the style sheet file to the HTML output directory, so don't put your own
+# stylesheet in the HTML output directory as well, or it will be erased!
+
+HTML_STYLESHEET        =
+
+# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the HTML output directory. Note
+# that these files will be copied to the base HTML output directory. Use the
+# $relpath$ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
+# files. In the HTML_STYLESHEET file, use the file name only. Also note that
+# the files will be copied as-is; there are no commands or markers available.
+
+HTML_EXTRA_FILES       =
+
+# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output.
+# Doxygen will adjust the colors in the stylesheet and background images
+# according to this color. Hue is specified as an angle on a colorwheel,
+# see http://en.wikipedia.org/wiki/Hue for more information.
+# For instance the value 0 represents red, 60 is yellow, 120 is green,
+# 180 is cyan, 240 is blue, 300 purple, and 360 is red again.
+# The allowed range is 0 to 359.
+
+HTML_COLORSTYLE_HUE    = 220
+
+# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of
+# the colors in the HTML output. For a value of 0 the output will use
+# grayscales only. A value of 255 will produce the most vivid colors.
+
+HTML_COLORSTYLE_SAT    = 100
+
+# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to
+# the luminance component of the colors in the HTML output. Values below
+# 100 gradually make the output lighter, whereas values above 100 make
+# the output darker. The value divided by 100 is the actual gamma applied,
+# so 80 represents a gamma of 0.8, the value 220 represents a gamma of 2.2,
+# and 100 does not change the gamma.
+
+HTML_COLORSTYLE_GAMMA  = 80
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
+# page will contain the date and time when the page was generated. Setting
+# this to NO can help when comparing the output of multiple runs.
+
+HTML_TIMESTAMP         = YES
+
+# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes,
+# files or namespaces will be aligned in HTML using tables. If set to
+# NO a bullet list will be used.
+
+HTML_ALIGN_MEMBERS     = YES
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded. For this to work a browser that supports
+# JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox,
+# Netscape 6.0+, Internet explorer 5.0+, Konqueror, or Safari).
+
+HTML_DYNAMIC_SECTIONS  = NO
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files
+# will be generated that can be used as input for Apple's Xcode 3
+# integrated development environment, introduced with OSX 10.5 (Leopard).
+# To create a documentation set, doxygen will generate a Makefile in the
+# HTML output directory. Running make will produce the docset in that
+# directory and running "make install" will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find
+# it at startup.
+# See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
+# for more information.
+
+GENERATE_DOCSET        = NO
+
+# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the
+# feed. A documentation feed provides an umbrella under which multiple
+# documentation sets from a single provider (such as a company or product suite)
+# can be grouped.
+
+DOCSET_FEEDNAME        = "Doxygen generated docs"
+
+# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that
+# should uniquely identify the documentation set bundle. This should be a
+# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen
+# will append .docset to the name.
+
+DOCSET_BUNDLE_ID       = org.doxygen.Project
+
+# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
+# the documentation publisher. This should be a reverse domain-name style
+# string, e.g. com.mycompany.MyDocSet.documentation.
+
+DOCSET_PUBLISHER_ID    = org.doxygen.Publisher
+
+# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
+
+DOCSET_PUBLISHER_NAME  = Publisher
+
+# If the GENERATE_HTMLHELP tag is set to YES, additional index files
+# will be generated that can be used as input for tools like the
+# Microsoft HTML help workshop to generate a compiled HTML help file (.chm)
+# of the generated HTML documentation.
+
+GENERATE_HTMLHELP      = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can
+# be used to specify the file name of the resulting .chm file. You
+# can add a path in front of the file if the result should not be
+# written to the html output directory.
+
+CHM_FILE               =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can
+# be used to specify the location (absolute path including file name) of
+# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run
+# the HTML help compiler on the generated index.hhp.
+
+HHC_LOCATION           =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag
+# controls if a separate .chi index file is generated (YES) or that
+# it should be included in the master .chm file (NO).
+
+GENERATE_CHI           = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING
+# is used to encode HtmlHelp index (hhk), content (hhc) and project file
+# content.
+
+CHM_INDEX_ENCODING     =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag
+# controls whether a binary table of contents is generated (YES) or a
+# normal table of contents (NO) in the .chm file.
+
+BINARY_TOC             = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members
+# to the contents of the HTML help documentation and to the tree view.
+
+TOC_EXPAND             = NO
+
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
+# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated
+# that can be used as input for Qt's qhelpgenerator to generate a
+# Qt Compressed Help (.qch) of the generated HTML documentation.
+
+GENERATE_QHP           = NO
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can
+# be used to specify the file name of the resulting .qch file.
+# The path specified is relative to the HTML output folder.
+
+QCH_FILE               =
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating
+# Qt Help Project output. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#namespace
+
+QHP_NAMESPACE          = org.doxygen.Project
+
+# The QHP_VIRTUAL_FOLDER tag specifies the virtual folder to use when generating
+# Qt Help Project output. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#virtual-folders
+
+QHP_VIRTUAL_FOLDER     = doc
+
+# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to
+# add. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#custom-filters
+
+QHP_CUST_FILTER_NAME   =
+
+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
+# custom filter to add. For more information please see
+# <a href="http://doc.trolltech.com/qthelpproject.html#custom-filters">
+# Qt Help Project / Custom Filters</a>.
+
+QHP_CUST_FILTER_ATTRS  =
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
+# project's
+# filter section matches.
+# <a href="http://doc.trolltech.com/qthelpproject.html#filter-attributes">
+# Qt Help Project / Filter Attributes</a>.
+
+QHP_SECT_FILTER_ATTRS  =
+
+# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can
+# be used to specify the location of Qt's qhelpgenerator.
+# If non-empty doxygen will try to run qhelpgenerator on the generated
+# .qhp file.
+
+QHG_LOCATION           =
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files
+# will be generated, which together with the HTML files, form an Eclipse help
+# plugin. To install this plugin and make it available under the help contents
+# menu in Eclipse, the contents of the directory containing the HTML and XML
+# files needs to be copied into the plugins directory of eclipse. The name of
+# the directory within the plugins directory should be the same as
+# the ECLIPSE_DOC_ID value. After copying Eclipse needs to be restarted before
+# the help appears.
+
+GENERATE_ECLIPSEHELP   = NO
+
+# A unique identifier for the eclipse help plugin. When installing the plugin
+# the directory name containing the HTML and XML files should also have
+# this name.
+
+ECLIPSE_DOC_ID         = org.doxygen.Project
+
+# The DISABLE_INDEX tag can be used to turn on/off the condensed index at
+# top of each HTML page. The value NO (the default) enables the index and
+# the value YES disables it.
+
+DISABLE_INDEX          = NO
+
+# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values
+# (range [0,1..20]) that doxygen will group on one line in the generated HTML
+# documentation. Note that a value of 0 will completely suppress the enum
+# values from appearing in the overview section.
+
+ENUM_VALUES_PER_LINE   = 4
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information.
+# If the tag value is set to YES, a side panel will be generated
+# containing a tree-like index structure (just like the one that
+# is generated for HTML Help). For this to work a browser that supports
+# JavaScript, DHTML, CSS and frames is required (i.e. any modern browser).
+# Windows users are probably better off using the HTML help feature.
+
+GENERATE_TREEVIEW      = NO
+
+# By enabling USE_INLINE_TREES, doxygen will generate the Groups, Directories,
+# and Class Hierarchy pages using a tree view instead of an ordered list.
+
+USE_INLINE_TREES       = NO
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be
+# used to set the initial width (in pixels) of the frame in which the tree
+# is shown.
+
+TREEVIEW_WIDTH         = 250
+
+# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open
+# links to external symbols imported via tag files in a separate window.
+
+EXT_LINKS_IN_WINDOW    = NO
+
+# Use this tag to change the font size of Latex formulas included
+# as images in the HTML documentation. The default is 10. Note that
+# when you change the font size after a successful doxygen run you need
+# to manually remove any form_*.png images from the HTML output directory
+# to force them to be regenerated.
+
+FORMULA_FONTSIZE       = 10
+
+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
+# generated for formulas are transparent PNGs. Transparent PNGs are
+# not supported properly for IE 6.0, but are supported on all modern browsers.
+# Note that when changing this option you need to delete any form_*.png files
+# in the HTML output before the changes have effect.
+
+FORMULA_TRANSPARENT    = YES
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax
+# (see http://www.mathjax.org) which uses client side Javascript for the
+# rendering instead of using prerendered bitmaps. Use this if you do not
+# have LaTeX installed or if you want formulas to look prettier in the HTML
+# output. When enabled you also need to install MathJax separately and
+# configure the path to it using the MATHJAX_RELPATH option.
+
+USE_MATHJAX            = NO
+
+# When MathJax is enabled you need to specify the location relative to the
+# HTML output directory using the MATHJAX_RELPATH option. The destination
+# directory should contain the MathJax.js script. For instance, if the mathjax
+# directory is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to the
+# mathjax.org site, so you can quickly see the result without installing
+# MathJax, but it is strongly recommended to install a local copy of MathJax
+# before deployment.
+
+MATHJAX_RELPATH        = http://www.mathjax.org/mathjax
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box
+# for the HTML output. The underlying search engine uses javascript
+# and DHTML and should work on any modern browser. Note that when using
+# HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets
+# (GENERATE_DOCSET) there is already a search function so this one should
+# typically be disabled. For large projects the javascript based search engine
+# can be slow, then enabling SERVER_BASED_SEARCH may provide a better solution.
+
+SEARCHENGINE           = YES
+
+# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
+# implemented using a PHP enabled web server instead of at the web client
+# using Javascript. Doxygen will generate the search PHP script and index
+# file to put on the web server. The advantage of the server
+# based approach is that it scales better to large projects and allows
+# full text search. The disadvantages are that it is more difficult to setup
+# and does not have live searching capabilities.
+
+SERVER_BASED_SEARCH    = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will
+# generate Latex output.
+
+GENERATE_LATEX         = YES
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `latex' will be used as the default path.
+
+LATEX_OUTPUT           = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked. If left blank `latex' will be used as the default command name.
+# Note that when enabling USE_PDFLATEX this option is only used for
+# generating bitmaps for formulas in the HTML output, but not in the
+# Makefile that is written to the output directory.
+
+LATEX_CMD_NAME         = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to
+# generate index for LaTeX. If left blank `makeindex' will be used as the
+# default command name.
+
+MAKEINDEX_CMD_NAME     = makeindex
+
+# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact
+# LaTeX documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_LATEX          = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used
+# by the printer. Possible values are: a4, letter, legal and
+# executive. If left blank a4wide will be used.
+
+PAPER_TYPE             = a4
+
+# The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX
+# packages that should be included in the LaTeX output.
+
+EXTRA_PACKAGES         =
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for
+# the generated latex document. The header should contain everything until
+# the first chapter. If it is left blank doxygen will generate a
+# standard header. Notice: only use this tag if you know what you are doing!
+
+LATEX_HEADER           =
+
+# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for
+# the generated latex document. The footer should contain everything after
+# the last chapter. If it is left blank doxygen will generate a
+# standard footer. Notice: only use this tag if you know what you are doing!
+
+LATEX_FOOTER           =
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated
+# is prepared for conversion to pdf (using ps2pdf). The pdf file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using a pdf viewer.
+
+PDF_HYPERLINKS         = YES
+
+# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of
+# plain latex in the generated Makefile. Set this option to YES to get a
+# higher quality PDF documentation.
+
+USE_PDFLATEX           = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep
+# running if errors occur, instead of asking the user for help.
+# This option is also used when generating formulas in HTML.
+
+LATEX_BATCHMODE        = NO
+
+# If LATEX_HIDE_INDICES is set to YES then doxygen will not
+# include the index chapters (such as File Index, Compound Index, etc.)
+# in the output.
+
+LATEX_HIDE_INDICES     = NO
+
+# If LATEX_SOURCE_CODE is set to YES then doxygen will include
+# source code with syntax highlighting in the LaTeX output.
+# Note that which sources are shown also depends on other settings
+# such as SOURCE_BROWSER.
+
+LATEX_SOURCE_CODE      = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output.
+# The RTF output is optimized for Word 97 and may not look very pretty with
+# other RTF readers or editors.
+
+GENERATE_RTF           = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `rtf' will be used as the default path.
+
+RTF_OUTPUT             = rtf
+
+# If the COMPACT_RTF tag is set to YES Doxygen generates more compact
+# RTF documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_RTF            = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated
+# will contain hyperlink fields. The RTF file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using WORD or other
+# programs which support those fields.
+# Note: wordpad (write) and others do not support links.
+
+RTF_HYPERLINKS         = NO
+
+# Load stylesheet definitions from file. Syntax is similar to doxygen's
+# config file, i.e. a series of assignments. You only have to provide
+# replacements, missing definitions are set to their default value.
+
+RTF_STYLESHEET_FILE    =
+
+# Set optional variables used in the generation of an rtf document.
+# Syntax is similar to doxygen's config file.
+
+RTF_EXTENSIONS_FILE    =
+
+#---------------------------------------------------------------------------
+# configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES (the default) Doxygen will
+# generate man pages.
+
+GENERATE_MAN           = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `man' will be used as the default path.
+
+MAN_OUTPUT             = man
+
+# The MAN_EXTENSION tag determines the extension that is added to
+# the generated man pages (default is the subroutine's section .3)
+
+MAN_EXTENSION          = .3
+
+# If the MAN_LINKS tag is set to YES and Doxygen generates man output,
+# then it will generate one additional man file for each entity
+# documented in the real man page(s). These additional files
+# only source the real man page, but without them the man command
+# would be unable to find the correct page. The default is NO.
+
+MAN_LINKS              = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES Doxygen will
+# generate an XML file that captures the structure of
+# the code including all documentation.
+
+GENERATE_XML           = NO
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `xml' will be used as the default path.
+
+XML_OUTPUT             = xml
+
+# The XML_SCHEMA tag can be used to specify an XML schema,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_SCHEMA             =
+
+# The XML_DTD tag can be used to specify an XML DTD,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_DTD                =
+
+# If the XML_PROGRAMLISTING tag is set to YES Doxygen will
+# dump the program listings (including syntax highlighting
+# and cross-referencing information) to the XML output. Note that
+# enabling this will significantly increase the size of the XML output.
+
+XML_PROGRAMLISTING     = YES
+
+#---------------------------------------------------------------------------
+# configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will
+# generate an AutoGen Definitions (see autogen.sf.net) file
+# that captures the structure of the code including all
+# documentation. Note that this feature is still experimental
+# and incomplete at the moment.
+
+GENERATE_AUTOGEN_DEF   = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES Doxygen will
+# generate a Perl module file that captures the structure of
+# the code including all documentation. Note that this
+# feature is still experimental and incomplete at the
+# moment.
+
+GENERATE_PERLMOD       = NO
+
+# If the PERLMOD_LATEX tag is set to YES Doxygen will generate
+# the necessary Makefile rules, Perl scripts and LaTeX code to be able
+# to generate PDF and DVI output from the Perl module output.
+
+PERLMOD_LATEX          = NO
+
+# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be
+# nicely formatted so it can be parsed by a human reader.
+# This is useful
+# if you want to understand what is going on.
+# On the other hand, if this
+# tag is set to NO the size of the Perl module output will be much smaller
+# and Perl will parse it just the same.
+
+PERLMOD_PRETTY         = YES
+
+# The names of the make variables in the generated doxyrules.make file
+# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX.
+# This is useful so different doxyrules.make files included by the same
+# Makefile don't overwrite each other's variables.
+
+PERLMOD_MAKEVAR_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will
+# evaluate all C-preprocessor directives found in the sources and include
+# files.
+
+ENABLE_PREPROCESSING   = YES
+
+# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro
+# names in the source code. If set to NO (the default) only conditional
+# compilation will be performed. Macro expansion can be done in a controlled
+# way by setting EXPAND_ONLY_PREDEF to YES.
+
+MACRO_EXPANSION        = NO
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES
+# then the macro expansion is limited to the macros specified with the
+# PREDEFINED and EXPAND_AS_DEFINED tags.
+
+EXPAND_ONLY_PREDEF     = NO
+
+# If the SEARCH_INCLUDES tag is set to YES (the default) the include files
+# pointed to by INCLUDE_PATH will be searched when a #include is found.
+
+SEARCH_INCLUDES        = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by
+# the preprocessor.
+
+INCLUDE_PATH           =
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will
+# be used.
+
+INCLUDE_FILE_PATTERNS  =
+
+# The PREDEFINED tag can be used to specify one or more macro names that
+# are defined before the preprocessor is started (similar to the -D option of
+# gcc). The argument of the tag is a list of macros of the form: name
+# or name=definition (no spaces). If the definition and the = are
+# omitted =1 is assumed. To prevent a macro definition from being
+# undefined via #undef or recursively expanded use the := operator
+# instead of the = operator.
+
+PREDEFINED             =
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then
+# this tag can be used to specify a list of macro names that should be expanded.
+# The macro definition that is found in the sources will be used.
+# Use the PREDEFINED tag if you want to use a different macro definition that
+# overrules the definition found in the source code.
+
+EXPAND_AS_DEFINED      =
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then
+# doxygen's preprocessor will remove all references to function-like macros
+# that are alone on a line, have an all uppercase name, and do not end with a
+# semicolon, because these will confuse the parser if not removed.
+
+SKIP_FUNCTION_MACROS   = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES option can be used to specify one or more tagfiles.
+# Optionally an initial location of the external documentation
+# can be added for each tagfile. The format of a tag file without
+# this location is as follows:
+#
+# TAGFILES = file1 file2 ...
+# Adding location for the tag files is done as follows:
+#
+# TAGFILES = file1=loc1 "file2 = loc2" ...
+# where "loc1" and "loc2" can be relative or absolute paths or
+# URLs. If a location is present for each tag, the installdox tool
+# does not have to be run to correct the links.
+# Note that each tag file must have a unique name
+# (where the name does NOT include the path)
+# If a tag file is not located in the directory in which doxygen
+# is run, you must also specify the path to the tagfile here.
+
+TAGFILES               =
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create
+# a tag file that is based on the input files it reads.
+
+GENERATE_TAGFILE       =
+
+# If the ALLEXTERNALS tag is set to YES all external classes will be listed
+# in the class index. If set to NO only the inherited external classes
+# will be listed.
+
+ALLEXTERNALS           = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will
+# be listed.
+
+EXTERNAL_GROUPS        = YES
+
+# The PERL_PATH should be the absolute path and name of the perl script
+# interpreter (i.e. the result of `which perl').
+
+PERL_PATH              = /usr/bin/perl
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will
+# generate an inheritance diagram (in HTML, RTF and LaTeX) for classes with base
+# or super classes. Setting the tag to NO turns the diagrams off. Note that
+# this option also works with HAVE_DOT disabled, but it is recommended to
+# install and use dot, since it yields more powerful graphs.
+
+CLASS_DIAGRAMS         = YES
+
+# You can define message sequence charts within doxygen comments using the \msc
+# command. Doxygen will then run the mscgen tool (see
+# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the
+# documentation. The MSCGEN_PATH tag allows you to specify the directory where
+# the mscgen tool resides. If left empty the tool is assumed to be found in the
+# default search path.
+
+MSCGEN_PATH            =
+
+# If set to YES, the inheritance and collaboration graphs will hide
+# inheritance and usage relations if the target is undocumented
+# or is not a class.
+
+HIDE_UNDOC_RELATIONS   = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz, a graph visualization
+# toolkit from AT&T and Lucent Bell Labs. The other options in this section
+# have no effect if this option is set to NO (the default)
+
+HAVE_DOT               = NO
+
+# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is
+# allowed to run in parallel. When set to 0 (the default) doxygen will
+# base this on the number of processors available in the system. You can set it
+# explicitly to a value larger than 0 to get control over the balance
+# between CPU load and processing speed.
+
+DOT_NUM_THREADS        = 0
+
+# By default doxygen will write a font called Helvetica to the output
+# directory and reference it in all dot files that doxygen generates.
+# When you want a differently looking font you can specify the font name
+# using DOT_FONTNAME. You need to make sure dot is able to find the font,
+# which can be done by putting it in a standard location or by setting the
+# DOTFONTPATH environment variable or by setting DOT_FONTPATH to the directory
+# containing the font.
+
+DOT_FONTNAME           = Helvetica
+
+# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs.
+# The default size is 10pt.
+
+DOT_FONTSIZE           = 10
+
+# By default doxygen will tell dot to use the output directory to look for the
+# FreeSans.ttf font (which doxygen will put there itself). If you specify a
+# different font using DOT_FONTNAME you can set the path where dot
+# can find it using this tag.
+
+DOT_FONTPATH           =
+
+# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect inheritance relations. Setting this tag to YES will force the
+# CLASS_DIAGRAMS tag to NO.
+
+CLASS_GRAPH            = YES
+
+# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect implementation dependencies (inheritance, containment, and
+# class references variables) of the class with other documented classes.
+
+COLLABORATION_GRAPH    = YES
+
+# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for groups, showing the direct group dependencies.
+
+GROUP_GRAPHS           = YES
+
+# If the UML_LOOK tag is set to YES doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+
+UML_LOOK               = NO
+
+# If set to YES, the inheritance and collaboration graphs will show the
+# relations between templates and their instances.
+
+TEMPLATE_RELATIONS     = NO
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT
+# tags are set to YES then doxygen will generate a graph for each documented
+# file showing the direct and indirect include dependencies of the file with
+# other documented files.
+
+INCLUDE_GRAPH          = YES
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and
+# HAVE_DOT tags are set to YES then doxygen will generate a graph for each
+# documented header file showing the documented files that directly or
+# indirectly include this file.
+
+INCLUDED_BY_GRAPH      = YES
+
+# If the CALL_GRAPH and HAVE_DOT options are set to YES then
+# doxygen will generate a call dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable call graphs
+# for selected functions only using the \callgraph command.
+
+CALL_GRAPH             = NO
+
+# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then
+# doxygen will generate a caller dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable caller
+# graphs for selected functions only using the \callergraph command.
+
+CALLER_GRAPH           = NO
+
+# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen
+# will generate a graphical hierarchy of all classes instead of a textual one.
+
+GRAPHICAL_HIERARCHY    = YES
+
+# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES
+# then doxygen will show the dependencies a directory has on other directories
+# in a graphical way. The dependency relations are determined by the #include
+# relations between the files in the directories.
+
+DIRECTORY_GRAPH        = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot. Possible values are svg, png, jpg, or gif.
+# If left blank png will be used.
+
+DOT_IMAGE_FORMAT       = png
+
+# The tag DOT_PATH can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+
+DOT_PATH               =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the
+# \dotfile command).
+
+DOTFILE_DIRS           =
+
+# The MSCFILE_DIRS tag can be used to specify one or more directories that
+# contain msc files that are included in the documentation (see the
+# \mscfile command).
+
+MSCFILE_DIRS           =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of
+# nodes that will be shown in the graph. If the number of nodes in a graph
+# becomes larger than this value, doxygen will truncate the graph, which is
+# visualized by representing a node as a red box. Note that if the
+# number of direct children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note
+# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+
+DOT_GRAPH_MAX_NODES    = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the
+# graphs generated by dot. A depth value of 3 means that only nodes reachable
+# from the root by following a path via at most 3 edges will be shown. Nodes
+# that lie further from the root node will be omitted. Note that setting this
+# option to 1 or 2 may greatly reduce the computation time needed for large
+# code bases. Also note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+
+MAX_DOT_GRAPH_DEPTH    = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is disabled by default, because dot on Windows does not
+# seem to support this out of the box. Warning: Depending on the platform used,
+# enabling this option may lead to badly anti-aliased labels on the edges of
+# a graph (i.e. they become hard to read).
+
+DOT_TRANSPARENT        = NO
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10)
+# support this, this feature is disabled by default.
+
+DOT_MULTI_TARGETS      = NO
+
+# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will
+# generate a legend page explaining the meaning of the various boxes and
+# arrows in the dot generated graphs.
+
+GENERATE_LEGEND        = YES
+
+# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will
+# remove the intermediate dot files that are used to generate
+# the various graphs.
+
+DOT_CLEANUP            = YES
diff --git a/src/gallium/state_trackers/clover/Makefile.am b/src/gallium/state_trackers/clover/Makefile.am
new file mode 100644
index 00000000000..da9f3bb92da
--- /dev/null
+++ b/src/gallium/state_trackers/clover/Makefile.am
@@ -0,0 +1,71 @@
+AUTOMAKE_OPTIONS = subdir-objects
+# Preprocessor defines and include search paths.
+AM_CPPFLAGS = \
+	$(GALLIUM_PIPE_LOADER_DEFINES) \
+	-DMESA_VERSION=\"$(MESA_VERSION)\" \
+	-DPIPE_SEARCH_DIR=\"$(OPENCL_LIB_INSTALL_DIR)\" \
+	-I$(top_srcdir)/include \
+	-I$(top_srcdir)/src/gallium/include \
+	-I$(top_srcdir)/src/gallium/drivers \
+	-I$(top_srcdir)/src/gallium/auxiliary \
+	-I$(top_srcdir)/src/gallium/winsys \
+	-I$(srcdir)
+# Libraries that are built but not installed; libclover.la links the other two.
+noinst_LTLIBRARIES = libclover.la libcltgsi.la libclllvm.la
+
+libcltgsi_la_CXXFLAGS = \
+	-std=c++0x
+
+libcltgsi_la_SOURCES = \
+	tgsi/compiler.cpp
+# Unlike the other two libraries, the LLVM invocation code is built as C++98.
+libclllvm_la_CXXFLAGS = \
+	-std=c++98
+
+libclllvm_la_SOURCES = \
+	llvm/invocation.cpp
+
+libclover_la_CXXFLAGS = \
+	-std=c++0x
+
+libclover_la_LIBADD = \
+	libcltgsi.la libclllvm.la
+
+libclover_la_SOURCES = \
+	core/base.hpp \
+	core/compat.hpp \
+	core/compiler.hpp \
+	core/geometry.hpp \
+	core/device.hpp \
+	core/device.cpp \
+	core/context.hpp \
+	core/context.cpp \
+	core/queue.hpp \
+	core/queue.cpp \
+	core/format.hpp \
+	core/format.cpp \
+	core/memory.hpp \
+	core/memory.cpp \
+	core/resource.hpp \
+	core/resource.cpp \
+	core/sampler.hpp \
+	core/sampler.cpp \
+	core/event.hpp \
+	core/event.cpp \
+	core/program.hpp \
+	core/program.cpp \
+	core/kernel.hpp \
+	core/kernel.cpp \
+	core/module.hpp \
+	core/module.cpp \
+	api/util.hpp \
+	api/platform.cpp \
+	api/device.cpp \
+	api/context.cpp \
+	api/queue.cpp \
+	api/memory.cpp \
+	api/transfer.cpp \
+	api/sampler.cpp \
+	api/event.cpp \
+	api/program.cpp \
+	api/kernel.cpp
diff --git a/src/gallium/state_trackers/clover/api/context.cpp b/src/gallium/state_trackers/clover/api/context.cpp
new file mode 100644
index 00000000000..c8d668933e5
--- /dev/null
+++ b/src/gallium/state_trackers/clover/api/context.cpp
@@ -0,0 +1,120 @@
+//
+// Copyright 2012 Francisco Jerez
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+// OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+
+#include "api/util.hpp"
+#include "core/context.hpp"
+
+using namespace clover;
+
+// Create a context for the devices in \a devs.  The only property accepted
+// is CL_CONTEXT_PLATFORM naming the NULL platform.  On failure the error
+// code goes to \a errcode_ret and NULL is returned.
+PUBLIC cl_context
+clCreateContext(const cl_context_properties *props, cl_uint num_devs,
+                const cl_device_id *devs,
+                void (CL_CALLBACK *pfn_notify)(const char *, const void *,
+                                               size_t, void *),
+                void *user_data, cl_int *errcode_ret) try {
+   auto mprops = property_map(props);
+
+   if (devs == NULL || num_devs == 0 || (pfn_notify == NULL && user_data))
+      throw error(CL_INVALID_VALUE);
+
+   if (any_of(is_zero<cl_device_id>(), devs, devs + num_devs))
+      throw error(CL_INVALID_DEVICE);
+
+   for (const auto &prop : mprops) {
+      if (prop.first != CL_CONTEXT_PLATFORM ||
+          (cl_platform_id)prop.second != NULL)
+         throw error(CL_INVALID_PROPERTY);
+   }
+
+   ret_error(errcode_ret, CL_SUCCESS);
+   return new context(property_vector(mprops),
+                      std::vector<cl_device_id>(devs, devs + num_devs));
+} catch(error &e) {
+   ret_error(errcode_ret, e);
+   return NULL;
+}
+
+// Create a context on a single device of the requested \a type.  A failed
+// device lookup is reported through \a errcode_ret and yields NULL.
+PUBLIC cl_context
+clCreateContextFromType(const cl_context_properties *props,
+                        cl_device_type type,
+                        void (CL_CALLBACK *pfn_notify)(
+                           const char *, const void *, size_t, void *),
+                        void *user_data, cl_int *errcode_ret) {
+   cl_device_id dev;
+   cl_int err = clGetDeviceIDs(0, type, 1, &dev, 0);
+
+   if (err != CL_SUCCESS) {
+      ret_error(errcode_ret, err);
+      return NULL;
+   }
+   return clCreateContext(props, 1, &dev, pfn_notify, user_data, errcode_ret);
+}
+
+// Take an additional reference on \a ctx.
+PUBLIC cl_int
+clRetainContext(cl_context ctx) {
+   if (ctx == NULL)
+      return CL_INVALID_CONTEXT;
+   ctx->retain();
+   return CL_SUCCESS;
+}
+
+// Drop one reference from \a ctx and destroy the context once release()
+// returns true.
+PUBLIC cl_int
+clReleaseContext(cl_context ctx) {
+   if (ctx == NULL)
+      return CL_INVALID_CONTEXT;
+   if (ctx->release())
+      delete ctx;
+   return CL_SUCCESS;
+}
+
+// Query one property of \a ctx, selected by \a param, through the
+// buf/size/size_ret triple.
+PUBLIC cl_int
+clGetContextInfo(cl_context ctx, cl_context_info param,
+                 size_t size, void *buf, size_t *size_ret) {
+   if (ctx == NULL)
+      return CL_INVALID_CONTEXT;
+
+   if (param == CL_CONTEXT_REFERENCE_COUNT)
+      return scalar_property<cl_uint>(buf, size, size_ret, ctx->ref_count());
+
+   if (param == CL_CONTEXT_NUM_DEVICES)
+      return scalar_property<cl_uint>(buf, size, size_ret, ctx->devs.size());
+
+   if (param == CL_CONTEXT_DEVICES)
+      return vector_property<cl_device_id>(buf, size, size_ret, ctx->devs);
+
+   if (param == CL_CONTEXT_PROPERTIES)
+      return vector_property<cl_context_properties>(buf, size, size_ret,
+                                                    ctx->props());
+
+   // Any other selector is not a context property handled here.
+   return CL_INVALID_VALUE;
+}
diff --git a/src/gallium/state_trackers/clover/api/device.cpp b/src/gallium/state_trackers/clover/api/device.cpp
new file mode 100644
index 00000000000..03767519aaf
--- /dev/null
+++ b/src/gallium/state_trackers/clover/api/device.cpp
@@ -0,0 +1,262 @@
+//
+// Copyright 2012 Francisco Jerez
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+// OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+
+#include "api/util.hpp"
+#include "core/device.hpp"
+
+using namespace clover;
+// Device list that clGetDeviceIDs() iterates over to find matching devices.
+static device_registry registry;
+
+// List the devices matching \a device_type.  The first registered device
+// also matches CL_DEVICE_TYPE_DEFAULT.  At most \a num_entries handles are
+// copied to \a devices; \a num_devices receives the total match count.
+PUBLIC cl_int
+clGetDeviceIDs(cl_platform_id platform, cl_device_type device_type,
+               cl_uint num_entries, cl_device_id *devices,
+               cl_uint *num_devices) {
+   if (platform != NULL)
+      return CL_INVALID_PLATFORM;
+
+   if ((devices && !num_entries) || (!devices && !num_devices))
+      return CL_INVALID_VALUE;
+
+   // Gather every device selected by the type mask.
+   std::vector<cl_device_id> matches;
+   const bool wants_default = device_type & CL_DEVICE_TYPE_DEFAULT;
+
+   for (device &dev : registry) {
+      if ((wants_default && &dev == &registry.front()) ||
+          (device_type & dev.type()))
+         matches.push_back(&dev);
+   }
+
+   if (matches.empty())
+      return CL_DEVICE_NOT_FOUND;
+
+   // Hand back the count and as many handles as the caller has room for.
+   if (num_devices)
+      *num_devices = matches.size();
+   if (devices)
+      std::copy_n(matches.begin(),
+                  std::min((cl_uint)matches.size(), num_entries), devices);
+   return CL_SUCCESS;
+}
+// Report the property of \a dev selected by \a param via buf/size/size_ret.
+PUBLIC cl_int
+clGetDeviceInfo(cl_device_id dev, cl_device_info param,
+                size_t size, void *buf, size_t *size_ret) {
+   if (!dev)
+      return CL_INVALID_DEVICE;
+   // Parameters not listed below yield CL_INVALID_VALUE (default case).
+   switch (param) {
+   case CL_DEVICE_TYPE:
+      return scalar_property<cl_device_type>(buf, size, size_ret, dev->type());
+
+   case CL_DEVICE_VENDOR_ID:
+      return scalar_property<cl_uint>(buf, size, size_ret, dev->vendor_id());
+
+   case CL_DEVICE_MAX_COMPUTE_UNITS:
+      return scalar_property<cl_uint>(buf, size, size_ret, 1);
+
+   case CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:
+      return scalar_property<cl_uint>(buf, size, size_ret,
+                                      dev->max_block_size().size());
+
+   case CL_DEVICE_MAX_WORK_ITEM_SIZES:
+      return vector_property<size_t>(buf, size, size_ret,
+                                     dev->max_block_size());
+   // NOTE(review): SIZE_MAX looks like a placeholder, not a real limit.
+   case CL_DEVICE_MAX_WORK_GROUP_SIZE:
+      return scalar_property<size_t>(buf, size, size_ret, SIZE_MAX);
+
+   case CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR:
+      return scalar_property<cl_uint>(buf, size, size_ret, 16);
+
+   case CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT:
+      return scalar_property<cl_uint>(buf, size, size_ret, 8);
+
+   case CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT:
+      return scalar_property<cl_uint>(buf, size, size_ret, 4);
+
+   case CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG:
+      return scalar_property<cl_uint>(buf, size, size_ret, 2);
+
+   case CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT:
+      return scalar_property<cl_uint>(buf, size, size_ret, 4);
+
+   case CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE:
+      return scalar_property<cl_uint>(buf, size, size_ret, 2);
+
+   case CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF:
+      return scalar_property<cl_uint>(buf, size, size_ret, 0);
+
+   case CL_DEVICE_MAX_CLOCK_FREQUENCY:
+      return scalar_property<cl_uint>(buf, size, size_ret, 0);
+
+   case CL_DEVICE_ADDRESS_BITS:
+      return scalar_property<cl_uint>(buf, size, size_ret, 32);
+
+   case CL_DEVICE_MAX_READ_IMAGE_ARGS:
+      return scalar_property<cl_uint>(buf, size, size_ret,
+                                      dev->max_images_read());
+
+   case CL_DEVICE_MAX_WRITE_IMAGE_ARGS:
+      return scalar_property<cl_uint>(buf, size, size_ret,
+                                      dev->max_images_write());
+
+   case CL_DEVICE_MAX_MEM_ALLOC_SIZE:
+      return scalar_property<cl_ulong>(buf, size, size_ret, 0);
+
+   case CL_DEVICE_IMAGE2D_MAX_WIDTH:
+   case CL_DEVICE_IMAGE2D_MAX_HEIGHT:
+      return scalar_property<size_t>(buf, size, size_ret,
+                                     1 << dev->max_image_levels_2d());
+
+   case CL_DEVICE_IMAGE3D_MAX_WIDTH:
+   case CL_DEVICE_IMAGE3D_MAX_HEIGHT:
+   case CL_DEVICE_IMAGE3D_MAX_DEPTH:
+      return scalar_property<size_t>(buf, size, size_ret,
+                                     1 << dev->max_image_levels_3d());
+
+   case CL_DEVICE_IMAGE_SUPPORT:
+      return scalar_property<cl_bool>(buf, size, size_ret, CL_TRUE);
+
+   case CL_DEVICE_MAX_PARAMETER_SIZE:
+      return scalar_property<size_t>(buf, size, size_ret,
+                                     dev->max_mem_input());
+
+   case CL_DEVICE_MAX_SAMPLERS:
+      return scalar_property<cl_uint>(buf, size, size_ret,
+                                      dev->max_samplers());
+
+   case CL_DEVICE_MEM_BASE_ADDR_ALIGN:
+   case CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE:
+      return scalar_property<cl_uint>(buf, size, size_ret, 128);
+
+   case CL_DEVICE_SINGLE_FP_CONFIG:
+      return scalar_property<cl_device_fp_config>(buf, size, size_ret,
+         CL_FP_DENORM | CL_FP_INF_NAN | CL_FP_ROUND_TO_NEAREST);
+
+   case CL_DEVICE_GLOBAL_MEM_CACHE_TYPE:
+      return scalar_property<cl_device_mem_cache_type>(buf, size, size_ret,
+                                                       CL_NONE);
+
+   case CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE:
+      return scalar_property<cl_uint>(buf, size, size_ret, 0);
+
+   case CL_DEVICE_GLOBAL_MEM_CACHE_SIZE:
+      return scalar_property<cl_ulong>(buf, size, size_ret, 0);
+
+   case CL_DEVICE_GLOBAL_MEM_SIZE:
+      return scalar_property<cl_ulong>(buf, size, size_ret,
+                                       dev->max_mem_global());
+
+   case CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE:
+      return scalar_property<cl_ulong>(buf, size, size_ret,
+                                       dev->max_const_buffer_size());
+
+   case CL_DEVICE_MAX_CONSTANT_ARGS:
+      return scalar_property<cl_uint>(buf, size, size_ret,
+                                      dev->max_const_buffers());
+
+   case CL_DEVICE_LOCAL_MEM_TYPE:
+      return scalar_property<cl_device_local_mem_type>(buf, size, size_ret,
+                                                       CL_LOCAL);
+
+   case CL_DEVICE_LOCAL_MEM_SIZE:
+      return scalar_property<cl_ulong>(buf, size, size_ret,
+                                       dev->max_mem_local());
+
+   case CL_DEVICE_ERROR_CORRECTION_SUPPORT:
+      return scalar_property<cl_bool>(buf, size, size_ret, CL_FALSE);
+
+   case CL_DEVICE_PROFILING_TIMER_RESOLUTION:
+      return scalar_property<size_t>(buf, size, size_ret, 0);
+
+   case CL_DEVICE_ENDIAN_LITTLE:
+      return scalar_property<cl_bool>(buf, size, size_ret, CL_TRUE);
+
+   case CL_DEVICE_AVAILABLE:
+   case CL_DEVICE_COMPILER_AVAILABLE:
+      return scalar_property<cl_bool>(buf, size, size_ret, CL_TRUE);
+
+   case CL_DEVICE_EXECUTION_CAPABILITIES:
+      return scalar_property<cl_device_exec_capabilities>(buf, size, size_ret,
+                                                          CL_EXEC_KERNEL);
+   // NOTE(review): advertised, yet clGetEventProfilingInfo() never has data.
+   case CL_DEVICE_QUEUE_PROPERTIES:
+      return scalar_property<cl_command_queue_properties>(buf, size, size_ret,
+         CL_QUEUE_PROFILING_ENABLE);
+
+   case CL_DEVICE_NAME:
+      return string_property(buf, size, size_ret, dev->device_name());
+
+   case CL_DEVICE_VENDOR:
+      return string_property(buf, size, size_ret, dev->vendor_name());
+
+   case CL_DRIVER_VERSION:
+      return string_property(buf, size, size_ret, MESA_VERSION);
+
+   case CL_DEVICE_PROFILE:
+      return string_property(buf, size, size_ret, "FULL_PROFILE");
+
+   case CL_DEVICE_VERSION:
+      return string_property(buf, size, size_ret, "OpenCL 1.1 MESA " MESA_VERSION);
+
+   case CL_DEVICE_EXTENSIONS:
+      return string_property(buf, size, size_ret, "");
+
+   case CL_DEVICE_PLATFORM:
+      return scalar_property<cl_platform_id>(buf, size, size_ret, NULL);
+
+   case CL_DEVICE_HOST_UNIFIED_MEMORY:
+      return scalar_property<cl_bool>(buf, size, size_ret, CL_TRUE);
+
+   case CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR:
+      return scalar_property<cl_uint>(buf, size, size_ret, 16);
+
+   case CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT:
+      return scalar_property<cl_uint>(buf, size, size_ret, 8);
+
+   case CL_DEVICE_NATIVE_VECTOR_WIDTH_INT:
+      return scalar_property<cl_uint>(buf, size, size_ret, 4);
+
+   case CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG:
+      return scalar_property<cl_uint>(buf, size, size_ret, 2);
+
+   case CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT:
+      return scalar_property<cl_uint>(buf, size, size_ret, 4);
+
+   case CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE:
+      return scalar_property<cl_uint>(buf, size, size_ret, 2);
+
+   case CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF:
+      return scalar_property<cl_uint>(buf, size, size_ret, 0);
+
+   case CL_DEVICE_OPENCL_C_VERSION:
+      return string_property(buf, size, size_ret, "OpenCL C 1.1");
+
+   default:
+      return CL_INVALID_VALUE;
+   }
+}
diff --git a/src/gallium/state_trackers/clover/api/event.cpp b/src/gallium/state_trackers/clover/api/event.cpp
new file mode 100644
index 00000000000..d6c37f6aef2
--- /dev/null
+++ b/src/gallium/state_trackers/clover/api/event.cpp
@@ -0,0 +1,239 @@
+//
+// Copyright 2012 Francisco Jerez
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+// OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+
+#include "api/util.hpp"
+#include "core/event.hpp"
+
+using namespace clover;
+
+// Create a user event: a soft_event in \a ctx with an empty dependency list.
+PUBLIC cl_event
+clCreateUserEvent(cl_context ctx, cl_int *errcode_ret) try {
+   if (ctx == NULL)
+      throw error(CL_INVALID_CONTEXT);
+
+   ret_error(errcode_ret, CL_SUCCESS);
+   return new soft_event(*ctx, {}, false);
+} catch(error &e) {
+   ret_error(errcode_ret, e);
+   return NULL;
+}
+
+// Set the status of a user event: zero triggers it, a negative value
+// aborts it.  An event whose status() is already <= 0 cannot be set again.
+PUBLIC cl_int
+clSetUserEventStatus(cl_event ev, cl_int status) {
+   if (dynamic_cast<soft_event *>(ev) == NULL)
+      return CL_INVALID_EVENT;
+   if (status > 0)
+      return CL_INVALID_VALUE;
+   if (ev->status() <= 0)
+      return CL_INVALID_OPERATION;
+
+   if (status == 0)
+      ev->trigger();
+   else
+      ev->abort(status);
+
+   return CL_SUCCESS;
+}
+
+// Wait on the events in \a evs.  Each one must be non-NULL, share the
+// context of evs[0] and not report a negative (error) status.
+PUBLIC cl_int
+clWaitForEvents(cl_uint num_evs, const cl_event *evs) try {
+   if (!num_evs || !evs)
+      throw error(CL_INVALID_VALUE);
+
+   for (cl_uint i = 0; i < num_evs; ++i) {
+      const cl_event ev = evs[i];
+
+      if (!ev)
+         throw error(CL_INVALID_EVENT);
+      if (&ev->ctx != &evs[0]->ctx)
+         throw error(CL_INVALID_CONTEXT);
+      if (ev->status() < 0)
+         throw error(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
+   }
+
+   // A temporary soft event depending on the whole wait list stands in
+   // for all of them, so a single wait() suffices.
+   ref_ptr<soft_event> sev = transfer(
+      new soft_event(evs[0]->ctx, { evs, evs + num_evs }, true));
+   sev->wait();
+
+   return CL_SUCCESS;
+
+} catch(error &e) {
+   return e.get();
+}
+
+// Query one property of \a ev, selected by \a param, through the
+// buf/size/size_ret triple.
+PUBLIC cl_int
+clGetEventInfo(cl_event ev, cl_event_info param,
+               size_t size, void *buf, size_t *size_ret) {
+   if (ev == NULL)
+      return CL_INVALID_EVENT;
+
+   if (param == CL_EVENT_COMMAND_QUEUE)
+      return scalar_property<cl_command_queue>(buf, size, size_ret, ev->queue());
+
+   if (param == CL_EVENT_CONTEXT)
+      return scalar_property<cl_context>(buf, size, size_ret, &ev->ctx);
+
+   if (param == CL_EVENT_COMMAND_TYPE)
+      return scalar_property<cl_command_type>(buf, size, size_ret, ev->command());
+
+   if (param == CL_EVENT_COMMAND_EXECUTION_STATUS)
+      return scalar_property<cl_int>(buf, size, size_ret, ev->status());
+
+   if (param == CL_EVENT_REFERENCE_COUNT)
+      return scalar_property<cl_uint>(buf, size, size_ret, ev->ref_count());
+
+   // Any other selector is not an event property handled here.
+   return CL_INVALID_VALUE;
+}
+
+// Register \a pfn_event_notify as a callback for \a ev.  CL_COMPLETE is
+// the only callback \a type accepted.
+PUBLIC cl_int
+clSetEventCallback(cl_event ev, cl_int type,
+                   void (CL_CALLBACK *pfn_event_notify)(cl_event, cl_int,
+                                                        void *),
+                   void *user_data) try {
+   if (!ev)
+      throw error(CL_INVALID_EVENT);
+   if (!pfn_event_notify || type != CL_COMPLETE)
+      throw error(CL_INVALID_VALUE);
+
+   // Completion action: wait on ev, then report its status to the user.
+   auto notify = [=](event &) {
+      ev->wait();
+      pfn_event_notify(ev, ev->status(), user_data);
+   };
+
+   // The action is attached to a temporary soft event that depends on ev.
+   ref_ptr<soft_event> sev = transfer(
+      new soft_event(ev->ctx, { ev }, true, notify));
+   return CL_SUCCESS;
+} catch(error &e) {
+   return e.get();
+}
+
+// Take an additional reference on \a ev.
+PUBLIC cl_int
+clRetainEvent(cl_event ev) {
+   if (ev == NULL)
+      return CL_INVALID_EVENT;
+   ev->retain();
+   return CL_SUCCESS;
+}
+
+// Drop one reference from \a ev and destroy the event once release()
+// returns true.
+PUBLIC cl_int
+clReleaseEvent(cl_event ev) {
+   if (ev == NULL)
+      return CL_INVALID_EVENT;
+   if (ev->release())
+      delete ev;
+   return CL_SUCCESS;
+}
+
+// Create a CL_COMMAND_MARKER hard event on \a q and hand it back through
+// \a ev, which must not be NULL.
+PUBLIC cl_int
+clEnqueueMarker(cl_command_queue q, cl_event *ev) try {
+   if (q == NULL)
+      throw error(CL_INVALID_COMMAND_QUEUE);
+   if (ev == NULL)
+      throw error(CL_INVALID_VALUE);
+
+   *ev = new hard_event(*q, CL_COMMAND_MARKER, {});
+   return CL_SUCCESS;
+
+} catch(error &e) {
+   return e.get();
+}
+
+// Enqueue a barrier on \a q.  Only the queue handle is validated: nothing
+// else needs doing because q preserves data ordering strictly.
+PUBLIC cl_int
+clEnqueueBarrier(cl_command_queue q) {
+   if (q == NULL)
+      return CL_INVALID_COMMAND_QUEUE;
+   return CL_SUCCESS;
+}
+
+// Make later commands on \a q wait for the events in \a evs, all of which
+// must be non-NULL and belong to the context of \a q.
+PUBLIC cl_int
+clEnqueueWaitForEvents(cl_command_queue q, cl_uint num_evs,
+                       const cl_event *evs) try {
+   if (!q)
+      throw error(CL_INVALID_COMMAND_QUEUE);
+
+   if (!num_evs || !evs)
+      throw error(CL_INVALID_VALUE);
+
+   for (const cl_event *it = evs; it != evs + num_evs; ++it) {
+      if (!*it)
+         throw error(CL_INVALID_EVENT);
+      if (&(*it)->ctx != &q->ctx)
+         throw error(CL_INVALID_CONTEXT);
+   }
+
+   // Subsequent commands in the same queue are implicitly serialized
+   // with respect to a hard event, so creating one that depends on the
+   // wait list is all that is required.
+   ref_ptr<hard_event> hev = transfer(
+      new hard_event(*q, 0, { evs, evs + num_evs }));
+
+   return CL_SUCCESS;
+} catch(error &e) {
+   return e.get();
+}
+// No profiling data is ever recorded; a NULL event is still rejected first.
+PUBLIC cl_int
+clGetEventProfilingInfo(cl_event ev, cl_profiling_info param,
+                        size_t size, void *buf, size_t *size_ret) {
+   return ev ? CL_PROFILING_INFO_NOT_AVAILABLE : CL_INVALID_EVENT;
+}
+
+// Wait on \a q by queueing a temporary hard event and blocking until it
+// is done.
+PUBLIC cl_int
+clFinish(cl_command_queue q) try {
+   if (q == NULL)
+      throw error(CL_INVALID_COMMAND_QUEUE);
+
+   // A freshly created hard event implicitly depends on all the hard
+   // events queued before it, so it is the only thing to wait on.
+   ref_ptr<hard_event> hev = transfer(new hard_event(*q, 0, { }));
+   hev->wait();
+
+   return CL_SUCCESS;
+
+} catch(error &e) {
+   return e.get();
+}
diff --git a/src/gallium/state_trackers/clover/api/kernel.cpp b/src/gallium/state_trackers/clover/api/kernel.cpp
new file mode 100644
index 00000000000..44eeb277127
--- /dev/null
+++ b/src/gallium/state_trackers/clover/api/kernel.cpp
@@ -0,0 +1,318 @@
+//
+// Copyright 2012 Francisco Jerez
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+// OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+
+#include "api/util.hpp"
+#include "core/kernel.hpp"
+#include "core/event.hpp"
+
+using namespace clover;
+
+PUBLIC cl_kernel
+clCreateKernel(cl_program prog, const char *name,
+               cl_int *errcode_ret) try {
+   if (!prog)
+      throw error(CL_INVALID_PROGRAM);
+
+   if (!name)
+      throw error(CL_INVALID_VALUE);
+
+   if (prog->binaries().empty())
+      throw error(CL_INVALID_PROGRAM_EXECUTABLE);
+
+   auto sym = prog->binaries().begin()->second.sym(name);
+
+   ret_error(errcode_ret, CL_SUCCESS);
+   return new kernel(*prog, name, { sym.args.begin(), sym.args.end() });
+
+} catch (module::noent_error &e) {
+   ret_error(errcode_ret, CL_INVALID_KERNEL_NAME);
+   return NULL;
+
+} catch(error &e) {
+   ret_error(errcode_ret, e);
+   return NULL;
+}
+
+PUBLIC cl_int
+clCreateKernelsInProgram(cl_program prog, cl_uint count,
+                         cl_kernel *kerns, cl_uint *count_ret) try {
+   // Errors thrown below are turned into a CL status by the handler at the end.
+   if (!prog)
+      throw error(CL_INVALID_PROGRAM);
+   if (prog->binaries().empty())
+      throw error(CL_INVALID_PROGRAM_EXECUTABLE);
+
+   auto &syms = prog->binaries().begin()->second.syms;
+
+   if (kerns && count < syms.size())
+      throw error(CL_INVALID_VALUE);
+   if (kerns)
+      std::transform(syms.begin(), syms.end(), kerns,
+                     [=](const module::symbol &sym) {
+                        return new kernel(*prog, compat::string(sym.name),
+                                          { sym.args.begin(), sym.args.end() });
+                     });
+   if (count_ret)
+      *count_ret = syms.size();
+
+   return CL_SUCCESS;
+} catch(error &e) {
+   return e.get();
+}
+
+PUBLIC cl_int
+clRetainKernel(cl_kernel kern) {
+   if (!kern)
+      return CL_INVALID_KERNEL;
+
+   kern->retain();
+   return CL_SUCCESS;
+}
+
+PUBLIC cl_int
+clReleaseKernel(cl_kernel kern) {
+   if (!kern)
+      return CL_INVALID_KERNEL;
+
+   if (kern->release())
+      delete kern;
+
+   return CL_SUCCESS;
+}
+
+PUBLIC cl_int
+clSetKernelArg(cl_kernel kern, cl_uint idx, size_t size,
+               const void *value) try {
+   if (!kern)
+      throw error(CL_INVALID_KERNEL);
+
+   if (idx >= kern->args.size())
+      throw error(CL_INVALID_ARG_INDEX);
+
+   kern->args[idx]->set(size, value);
+
+   return CL_SUCCESS;
+
+} catch(error &e) {
+   return e.get();
+}
+
+PUBLIC cl_int
+clGetKernelInfo(cl_kernel kern, cl_kernel_info param,
+                size_t size, void *buf, size_t *size_ret) {
+   if (!kern)
+      return CL_INVALID_KERNEL;
+
+   switch (param) {
+   case CL_KERNEL_FUNCTION_NAME:
+      return string_property(buf, size, size_ret, kern->name());
+
+   case CL_KERNEL_NUM_ARGS:
+      return scalar_property<cl_uint>(buf, size, size_ret,
+                                      kern->args.size());
+
+   case CL_KERNEL_REFERENCE_COUNT:
+      return scalar_property<cl_uint>(buf, size, size_ret,
+                                      kern->ref_count());
+
+   case CL_KERNEL_CONTEXT:
+      return scalar_property<cl_context>(buf, size, size_ret,
+                                         &kern->prog.ctx);
+
+   case CL_KERNEL_PROGRAM:
+      return scalar_property<cl_program>(buf, size, size_ret,
+                                         &kern->prog);
+
+   default:
+      return CL_INVALID_VALUE;
+   }
+}
+
+PUBLIC cl_int
+clGetKernelWorkGroupInfo(cl_kernel kern, cl_device_id dev,
+                         cl_kernel_work_group_info param,
+                         size_t size, void *buf, size_t *size_ret) {
+   if (!kern)
+      return CL_INVALID_KERNEL;
+
+   if ((!dev && kern->prog.binaries().size() != 1) ||
+       (dev && !kern->prog.binaries().count(dev)))
+      return CL_INVALID_DEVICE;
+
+   switch (param) {
+   case CL_KERNEL_WORK_GROUP_SIZE:
+      return scalar_property<size_t>(buf, size, size_ret,
+                                     kern->max_block_size());
+
+   case CL_KERNEL_COMPILE_WORK_GROUP_SIZE:
+      return vector_property<size_t>(buf, size, size_ret,
+                                     kern->block_size());
+
+   case CL_KERNEL_LOCAL_MEM_SIZE:
+      return scalar_property<cl_ulong>(buf, size, size_ret,
+                                       kern->mem_local());
+
+   case CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE:
+      return scalar_property<size_t>(buf, size, size_ret, 1);
+
+   case CL_KERNEL_PRIVATE_MEM_SIZE:
+      return scalar_property<cl_ulong>(buf, size, size_ret,
+                                       kern->mem_private());
+
+   default:
+      return CL_INVALID_VALUE;
+   }
+}
+
+namespace {
+   /// Common argument checking shared by kernel invocation commands;
+   /// throws an error carrying the CL status of the first failed check.
+   /// The wait list is vetted before the context test dereferences it.
+   void
+   kernel_validate(cl_command_queue q, cl_kernel kern,
+                   cl_uint dims, const size_t *grid_offset,
+                   const size_t *grid_size, const size_t *block_size,
+                   cl_uint num_deps, const cl_event *deps,
+                   cl_event *ev) {
+      if (!q)
+         throw error(CL_INVALID_COMMAND_QUEUE);
+
+      if (!kern)
+         throw error(CL_INVALID_KERNEL);
+
+      if (bool(num_deps) != bool(deps) ||
+          any_of(is_zero<cl_event>(), deps, deps + num_deps))
+         throw error(CL_INVALID_EVENT_WAIT_LIST);
+
+      if (&kern->prog.ctx != &q->ctx ||
+          any_of([&](const cl_event ev) {
+                return &ev->ctx != &q->ctx;
+             }, deps, deps + num_deps))
+         throw error(CL_INVALID_CONTEXT);
+
+      if (any_of([](std::unique_ptr<kernel::argument> &arg) {
+               return !arg->set();
+            }, kern->args.begin(), kern->args.end()))
+         throw error(CL_INVALID_KERNEL_ARGS);
+
+      if (!kern->prog.binaries().count(&q->dev))
+         throw error(CL_INVALID_PROGRAM_EXECUTABLE);
+
+      if (dims < 1 || dims > q->dev.max_block_size().size())
+         throw error(CL_INVALID_WORK_DIMENSION);
+
+      if (!grid_size || any_of(is_zero<size_t>(), grid_size, grid_size + dims))
+         throw error(CL_INVALID_GLOBAL_WORK_SIZE);
+
+      if (block_size && any_of([](size_t b, size_t max) {
+               return b == 0 || b > max;
+            }, block_size, block_size + dims,
+            q->dev.max_block_size().begin()))
+         throw error(CL_INVALID_WORK_ITEM_SIZE);
+
+      if (block_size && any_of([](size_t b, size_t g) {
+               return g % b;
+            }, block_size, block_size + dims, grid_size))
+         throw error(CL_INVALID_WORK_GROUP_SIZE);
+   }
+
+   /// Common event action shared by kernel invocation commands: the
+   /// returned closure launches \a kern on \a q, passing grid_size
+   /// divided element-wise by block_size as the reduced grid size.
+   std::function<void (event &)>
+   kernel_op(cl_command_queue q, cl_kernel kern,
+             const std::vector<size_t> &grid_offset,
+             const std::vector<size_t> &grid_size,
+             const std::vector<size_t> &block_size) {
+      const std::vector<size_t> reduced_grid_size = map(
+         std::divides<size_t>(), grid_size.begin(), grid_size.end(),
+         block_size.begin());
+
+      return [=](event &) {
+         kern->launch(*q, grid_offset, reduced_grid_size, block_size);
+      };
+   }
+
+   template<typename T, typename S>
+   std::vector<T>
+   opt_vector(const T *p, S n, T x = T()) {
+      if (p)
+         return { p, p + n };           // copy the n caller-supplied values
+      else
+         return std::vector<T>(n, x);   // n copies of x, not the lone value { n }
+   }
+}
+
+PUBLIC cl_int
+clEnqueueNDRangeKernel(cl_command_queue q, cl_kernel kern,
+                       cl_uint dims, const size_t *pgrid_offset,
+                       const size_t *pgrid_size, const size_t *pblock_size,
+                       cl_uint num_deps, const cl_event *deps,
+                       cl_event *ev) try {
+   // Validate before reading the arrays: dims is untrusted until then.
+   kernel_validate(q, kern, dims, pgrid_offset, pgrid_size, pblock_size,
+                   num_deps, deps, ev);
+   // NULL offset -> all zeros; NULL block size -> 1 per dimension.
+   const auto grid_offset = opt_vector(pgrid_offset, dims);
+   const auto grid_size = opt_vector(pgrid_size, dims);
+   const auto block_size = opt_vector(pblock_size, dims, size_t(1));
+
+   hard_event *hev = new hard_event(
+      *q, CL_COMMAND_NDRANGE_KERNEL, { deps, deps + num_deps },
+      kernel_op(q, kern, grid_offset, grid_size, block_size));
+   ret_object(ev, hev);
+   return CL_SUCCESS;
+
+} catch(error &e) {
+   return e.get();
+}
+
+PUBLIC cl_int
+clEnqueueTask(cl_command_queue q, cl_kernel kern,
+              cl_uint num_deps, const cl_event *deps,
+              cl_event *ev) try {
+   const std::vector<size_t> grid_offset = { 0 };
+   const std::vector<size_t> grid_size = { 1 };
+   const std::vector<size_t> block_size = { 1 };
+
+   kernel_validate(q, kern, 1, grid_offset.data(), grid_size.data(),
+                   block_size.data(), num_deps, deps, ev);
+
+   hard_event *hev = new hard_event(
+      *q, CL_COMMAND_TASK, { deps, deps + num_deps },
+      kernel_op(q, kern, grid_offset, grid_size, block_size));
+
+   ret_object(ev, hev);
+   return CL_SUCCESS;
+
+} catch(error &e) {
+   return e.get();
+}
+
+PUBLIC cl_int
+clEnqueueNativeKernel(cl_command_queue q, void (*func)(void *),
+                      void *args, size_t args_size,
+                      cl_uint obj_count, const cl_mem *obj_list,
+                      const void **obj_args, cl_uint num_deps,
+                      const cl_event *deps, cl_event *ev) {
+   return CL_INVALID_OPERATION;
+}
diff --git a/src/gallium/state_trackers/clover/api/memory.cpp b/src/gallium/state_trackers/clover/api/memory.cpp
new file mode 100644
index 00000000000..1b1ae73796f
--- /dev/null
+++ b/src/gallium/state_trackers/clover/api/memory.cpp
@@ -0,0 +1,305 @@
+//
+// Copyright 2012 Francisco Jerez
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+// OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+
+#include "api/util.hpp"
+#include "core/memory.hpp"
+#include "core/format.hpp"
+
+using namespace clover;
+
+PUBLIC cl_mem
+clCreateBuffer(cl_context ctx, cl_mem_flags flags, size_t size,
+               void *host_ptr, cl_int *errcode_ret) try {
+   if (!ctx)
+      throw error(CL_INVALID_CONTEXT);
+
+   if (bool(host_ptr) != bool(flags & (CL_MEM_USE_HOST_PTR |
+                                       CL_MEM_COPY_HOST_PTR)))
+      throw error(CL_INVALID_HOST_PTR);
+
+   if (!size)
+      throw error(CL_INVALID_BUFFER_SIZE);
+
+   if (flags & ~(CL_MEM_WRITE_ONLY | CL_MEM_READ_ONLY |
+                 CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR |
+                 CL_MEM_COPY_HOST_PTR))
+      throw error(CL_INVALID_VALUE);
+
+   ret_error(errcode_ret, CL_SUCCESS);
+   return new root_buffer(*ctx, flags, size, host_ptr);
+
+} catch (error &e) {
+   ret_error(errcode_ret, e);
+   return NULL;
+}
+
+PUBLIC cl_mem
+clCreateSubBuffer(cl_mem obj, cl_mem_flags flags, cl_buffer_create_type op,
+                  const void *op_info, cl_int *errcode_ret) try {
+   root_buffer *parent = dynamic_cast<root_buffer *>(obj);
+
+   if (!parent)
+      throw error(CL_INVALID_MEM_OBJECT);
+
+   if ((flags & (CL_MEM_USE_HOST_PTR |
+                 CL_MEM_ALLOC_HOST_PTR |
+                 CL_MEM_COPY_HOST_PTR)) ||
+       (~flags & parent->flags() & (CL_MEM_READ_ONLY |
+                                    CL_MEM_WRITE_ONLY)))
+      throw error(CL_INVALID_VALUE);
+
+   if (op != CL_BUFFER_CREATE_TYPE_REGION)
+      throw error(CL_INVALID_VALUE);
+
+   const cl_buffer_region *reg = (const cl_buffer_region *)op_info;
+
+   // The region must lie inside the parent.  The size is compared against
+   // the space left after origin so that origin + size cannot wrap around.
+   if (!reg ||
+       reg->origin > parent->size() ||
+       reg->size > parent->size() - reg->origin)
+      throw error(CL_INVALID_VALUE);
+
+   if (!reg->size)
+      throw error(CL_INVALID_BUFFER_SIZE);
+
+   ret_error(errcode_ret, CL_SUCCESS);
+   return new sub_buffer(*parent, flags, reg->origin, reg->size);
+
+} catch (error &e) {
+   ret_error(errcode_ret, e);
+   return NULL;
+}
+
+PUBLIC cl_mem
+clCreateImage2D(cl_context ctx, cl_mem_flags flags,
+                const cl_image_format *format,
+                size_t width, size_t height, size_t row_pitch,
+                void *host_ptr, cl_int *errcode_ret) try {
+   if (!ctx)
+      throw error(CL_INVALID_CONTEXT);
+
+   if (flags & ~(CL_MEM_WRITE_ONLY | CL_MEM_READ_ONLY |
+                 CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR |
+                 CL_MEM_COPY_HOST_PTR))
+      throw error(CL_INVALID_VALUE);
+
+   if (!format)
+      throw error(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR);
+
+   if (width < 1 || height < 1)
+      throw error(CL_INVALID_IMAGE_SIZE);
+
+   if (bool(host_ptr) != bool(flags & (CL_MEM_USE_HOST_PTR |
+                                       CL_MEM_COPY_HOST_PTR)))
+      throw error(CL_INVALID_HOST_PTR);
+
+   if (!supported_formats(ctx, CL_MEM_OBJECT_IMAGE2D).count(*format))
+      throw error(CL_IMAGE_FORMAT_NOT_SUPPORTED);
+
+   ret_error(errcode_ret, CL_SUCCESS);
+   return new image2d(*ctx, flags, format, width, height,
+                      row_pitch, host_ptr);
+
+} catch (error &e) {
+   ret_error(errcode_ret, e);
+   return NULL;
+}
+
+PUBLIC cl_mem
+clCreateImage3D(cl_context ctx, cl_mem_flags flags,
+                const cl_image_format *format,
+                size_t width, size_t height, size_t depth,
+                size_t row_pitch, size_t slice_pitch,
+                void *host_ptr, cl_int *errcode_ret) try {
+   if (!ctx)
+      throw error(CL_INVALID_CONTEXT);
+
+   if (flags & ~(CL_MEM_WRITE_ONLY | CL_MEM_READ_ONLY |
+                 CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR |
+                 CL_MEM_COPY_HOST_PTR))
+      throw error(CL_INVALID_VALUE);
+
+   if (!format)
+      throw error(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR);
+
+   if (width < 1 || height < 1 || depth < 2)
+      throw error(CL_INVALID_IMAGE_SIZE);
+
+   if (bool(host_ptr) != bool(flags & (CL_MEM_USE_HOST_PTR |
+                                       CL_MEM_COPY_HOST_PTR)))
+      throw error(CL_INVALID_HOST_PTR);
+
+   if (!supported_formats(ctx, CL_MEM_OBJECT_IMAGE3D).count(*format))
+      throw error(CL_IMAGE_FORMAT_NOT_SUPPORTED);
+
+   ret_error(errcode_ret, CL_SUCCESS);
+   return new image3d(*ctx, flags, format, width, height, depth,
+                      row_pitch, slice_pitch, host_ptr);
+
+} catch (error &e) {
+   ret_error(errcode_ret, e);
+   return NULL;
+}
+
+PUBLIC cl_int
+clGetSupportedImageFormats(cl_context ctx, cl_mem_flags flags,
+                           cl_mem_object_type type, cl_uint count,
+                           cl_image_format *buf, cl_uint *count_ret) try {
+   if (!ctx)
+      throw error(CL_INVALID_CONTEXT);
+
+   if (flags & ~(CL_MEM_WRITE_ONLY | CL_MEM_READ_ONLY |
+                 CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR |
+                 CL_MEM_COPY_HOST_PTR))
+      throw error(CL_INVALID_VALUE);
+
+   if (!count && buf)
+      throw error(CL_INVALID_VALUE);
+
+   auto formats = supported_formats(ctx, type);
+
+   if (buf)
+      std::copy_n(formats.begin(), std::min((cl_uint)formats.size(), count),
+                  buf);
+   if (count_ret)
+      *count_ret = formats.size();
+
+   return CL_SUCCESS;
+
+} catch (error &e) {
+   return e.get();
+}
+
+PUBLIC cl_int
+clGetMemObjectInfo(cl_mem obj, cl_mem_info param,
+                   size_t size, void *buf, size_t *size_ret) {
+   if (!obj)
+      return CL_INVALID_MEM_OBJECT;
+
+   switch (param) {
+   case CL_MEM_TYPE:
+      return scalar_property<cl_mem_object_type>(buf, size, size_ret,
+                                                 obj->type());
+
+   case CL_MEM_FLAGS:
+      return scalar_property<cl_mem_flags>(buf, size, size_ret, obj->flags());
+
+   case CL_MEM_SIZE:
+      return scalar_property<size_t>(buf, size, size_ret, obj->size());
+
+   case CL_MEM_HOST_PTR:
+      return scalar_property<void *>(buf, size, size_ret, obj->host_ptr());
+
+   case CL_MEM_MAP_COUNT:
+      return scalar_property<cl_uint>(buf, size, size_ret, 0);
+
+   case CL_MEM_REFERENCE_COUNT:
+      return scalar_property<cl_uint>(buf, size, size_ret, obj->ref_count());
+
+   case CL_MEM_CONTEXT:
+      return scalar_property<cl_context>(buf, size, size_ret, &obj->ctx);
+
+   case CL_MEM_ASSOCIATED_MEMOBJECT: {
+      sub_buffer *sub = dynamic_cast<sub_buffer *>(obj);
+      return scalar_property<cl_mem>(buf, size, size_ret,
+                                     (sub ? &sub->parent : NULL));
+   }
+   case CL_MEM_OFFSET: {
+      sub_buffer *sub = dynamic_cast<sub_buffer *>(obj);
+      return scalar_property<size_t>(buf, size, size_ret,
+                                     (sub ? sub->offset() : 0));
+   }
+   default:
+      return CL_INVALID_VALUE;
+   }
+}
+
+PUBLIC cl_int
+clGetImageInfo(cl_mem obj, cl_image_info param,
+               size_t size, void *buf, size_t *size_ret) {
+   image *img = dynamic_cast<image *>(obj);
+   if (!img)
+      return CL_INVALID_MEM_OBJECT;
+
+   switch (param) {
+   case CL_IMAGE_FORMAT:
+      return scalar_property<cl_image_format>(buf, size, size_ret,
+                                              img->format());
+
+   case CL_IMAGE_ELEMENT_SIZE:
+      return scalar_property<size_t>(buf, size, size_ret, 0);
+
+   case CL_IMAGE_ROW_PITCH:
+      return scalar_property<size_t>(buf, size, size_ret, img->row_pitch());
+
+   case CL_IMAGE_SLICE_PITCH:
+      return scalar_property<size_t>(buf, size, size_ret, img->slice_pitch());
+
+   case CL_IMAGE_WIDTH:
+      return scalar_property<size_t>(buf, size, size_ret, img->width());
+
+   case CL_IMAGE_HEIGHT:
+      return scalar_property<size_t>(buf, size, size_ret, img->height());
+
+   case CL_IMAGE_DEPTH:
+      return scalar_property<size_t>(buf, size, size_ret, img->depth());
+
+   default:
+      return CL_INVALID_VALUE;
+   }
+}
+
+PUBLIC cl_int
+clRetainMemObject(cl_mem obj) {
+   if (!obj)
+      return CL_INVALID_MEM_OBJECT;
+
+   obj->retain();
+   return CL_SUCCESS;
+}
+
+PUBLIC cl_int
+clReleaseMemObject(cl_mem obj) {
+   if (!obj)
+      return CL_INVALID_MEM_OBJECT;
+
+   if (obj->release())
+      delete obj;
+
+   return CL_SUCCESS;
+}
+
+PUBLIC cl_int
+clSetMemObjectDestructorCallback(cl_mem obj,
+                                 void (CL_CALLBACK *pfn_notify)(cl_mem, void *),
+                                 void *user_data) {
+   if (!obj)
+      return CL_INVALID_MEM_OBJECT;
+
+   if (!pfn_notify)
+      return CL_INVALID_VALUE;
+
+   obj->destroy_notify([=]{ pfn_notify(obj, user_data); });
+
+   return CL_SUCCESS;
+}
diff --git a/src/gallium/state_trackers/clover/api/platform.cpp b/src/gallium/state_trackers/clover/api/platform.cpp
new file mode 100644
index 00000000000..e5e80b85256
--- /dev/null
+++ b/src/gallium/state_trackers/clover/api/platform.cpp
@@ -0,0 +1,68 @@
+//
+// Copyright 2012 Francisco Jerez
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+// OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+
+#include "api/util.hpp"
+
+using namespace clover;
+
+PUBLIC cl_int
+clGetPlatformIDs(cl_uint num_entries, cl_platform_id *platforms,
+                 cl_uint *num_platforms) {
+   if ((!num_entries && platforms) ||
+       (!num_platforms && !platforms))
+      return CL_INVALID_VALUE;
+
+   if (num_platforms)
+      *num_platforms = 1;
+   if (platforms)
+      *platforms = NULL;
+
+   return CL_SUCCESS;
+}
+
+PUBLIC cl_int
+clGetPlatformInfo(cl_platform_id platform, cl_platform_info param_name,
+                  size_t size, void *buf, size_t *size_ret) {
+   if (platform != NULL)
+      return CL_INVALID_PLATFORM;
+
+   switch (param_name) {
+   case CL_PLATFORM_PROFILE:
+      return string_property(buf, size, size_ret, "FULL_PROFILE");
+
+   case CL_PLATFORM_VERSION:
+      return string_property(buf, size, size_ret,
+                             "OpenCL 1.1 MESA " MESA_VERSION);
+
+   case CL_PLATFORM_NAME:
+      return string_property(buf, size, size_ret, "Default");
+
+   case CL_PLATFORM_VENDOR:
+      return string_property(buf, size, size_ret, "Mesa");
+
+   case CL_PLATFORM_EXTENSIONS:
+      return string_property(buf, size, size_ret, "");
+
+   default:
+      return CL_INVALID_VALUE;
+   }
+}
diff --git a/src/gallium/state_trackers/clover/api/program.cpp b/src/gallium/state_trackers/clover/api/program.cpp
new file mode 100644
index 00000000000..e874c51ad7d
--- /dev/null
+++ b/src/gallium/state_trackers/clover/api/program.cpp
@@ -0,0 +1,241 @@
+//
+// Copyright 2012 Francisco Jerez
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+// OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+
+#include "api/util.hpp"
+#include "core/program.hpp"
+
+using namespace clover;
+
+PUBLIC cl_program
+clCreateProgramWithSource(cl_context ctx, cl_uint count,
+                          const char **strings, const size_t *lengths,
+                          cl_int *errcode_ret) try {
+   std::string source;
+
+   if (!ctx)
+      throw error(CL_INVALID_CONTEXT);
+
+   if (!count || !strings ||
+       any_of(is_zero<const char *>(), strings, strings + count))
+      throw error(CL_INVALID_VALUE);
+
+   // Concatenate all the provided fragments together
+   for (unsigned i = 0; i < count; ++i)
+         source += (lengths && lengths[i] ?
+                    std::string(strings[i], strings[i] + lengths[i]) :
+                    std::string(strings[i]));
+
+   // ...and create a program object for them.
+   ret_error(errcode_ret, CL_SUCCESS);
+   return new program(*ctx, source);
+
+} catch (error &e) {
+   ret_error(errcode_ret, e);
+   return NULL;
+}
+
+PUBLIC cl_program
+clCreateProgramWithBinary(cl_context ctx, cl_uint count,
+                          const cl_device_id *devs, const size_t *lengths,
+                          const unsigned char **binaries, cl_int *status_ret,
+                          cl_int *errcode_ret) try {
+   if (!ctx)
+      throw error(CL_INVALID_CONTEXT);
+
+   if (!count || !devs || !lengths || !binaries)
+      throw error(CL_INVALID_VALUE);
+
+   if (any_of([&](const cl_device_id dev) {
+            return !ctx->has_device(dev);
+         }, devs, devs + count))
+      throw error(CL_INVALID_DEVICE);
+
+   // Deserialize the provided binaries,
+   auto modules = map(
+      [](const unsigned char *p, size_t l) -> std::pair<cl_int, module> {
+         if (!p || !l)
+            return { CL_INVALID_VALUE, {} };
+
+         try {
+            compat::istream::buffer_t bin(p, l);
+            compat::istream s(bin);
+
+            return { CL_SUCCESS, module::deserialize(s) };
+
+         } catch (compat::istream::error &e) {
+            return { CL_INVALID_BINARY, {} };
+         }
+      },
+      binaries, binaries + count, lengths);
+
+   // update the status array,
+   if (status_ret)
+      std::transform(modules.begin(), modules.end(), status_ret,
+                     keys<cl_int, module>);
+
+   if (any_of(key_equals<cl_int, module>(CL_INVALID_VALUE),
+              modules.begin(), modules.end()))
+      throw error(CL_INVALID_VALUE);
+
+   if (any_of(key_equals<cl_int, module>(CL_INVALID_BINARY),
+              modules.begin(), modules.end()))
+      throw error(CL_INVALID_BINARY);
+
+   // initialize a program object with them.
+   ret_error(errcode_ret, CL_SUCCESS);
+   return new program(*ctx, { devs, devs + count },
+                      map(values<cl_int, module>,
+                          modules.begin(), modules.end()));
+
+} catch (error &e) {
+   ret_error(errcode_ret, e);
+   return NULL;
+}
+
+PUBLIC cl_int
+clRetainProgram(cl_program prog) {
+   if (!prog)
+      return CL_INVALID_PROGRAM;
+
+   prog->retain();
+   return CL_SUCCESS;
+}
+
+PUBLIC cl_int
+clReleaseProgram(cl_program prog) {
+   if (!prog)
+      return CL_INVALID_PROGRAM;
+
+   if (prog->release())
+      delete prog;
+
+   return CL_SUCCESS;
+}
+
+PUBLIC cl_int
+clBuildProgram(cl_program prog, cl_uint count, const cl_device_id *devs,
+               const char *opts, void (*pfn_notify)(cl_program, void *),
+               void *user_data) try {
+   if (!prog)
+      throw error(CL_INVALID_PROGRAM);
+
+   if (bool(count) != bool(devs) ||
+       (!pfn_notify && user_data))
+      throw error(CL_INVALID_VALUE);
+
+   if (any_of([&](const cl_device_id dev) {
+            return !prog->ctx.has_device(dev);
+         }, devs, devs + count))
+      throw error(CL_INVALID_DEVICE);
+
+   prog->build({ devs, devs + count });
+   return CL_SUCCESS;
+
+} catch (error &e) {
+   return e.get();
+}
+
+PUBLIC cl_int
+clUnloadCompiler() {
+   return CL_SUCCESS;
+}
+
+PUBLIC cl_int
+clGetProgramInfo(cl_program prog, cl_program_info param,
+                 size_t size, void *buf, size_t *size_ret) {
+   if (!prog)
+      return CL_INVALID_PROGRAM;
+
+   switch (param) {
+   case CL_PROGRAM_REFERENCE_COUNT:
+      return scalar_property<cl_uint>(buf, size, size_ret,
+                                      prog->ref_count());
+
+   case CL_PROGRAM_CONTEXT:
+      return scalar_property<cl_context>(buf, size, size_ret,
+                                         &prog->ctx);
+
+   case CL_PROGRAM_NUM_DEVICES:
+      return scalar_property<cl_uint>(buf, size, size_ret,
+                                      prog->binaries().size());
+
+   case CL_PROGRAM_DEVICES:
+      return vector_property<cl_device_id>(
+         buf, size, size_ret,
+         map(keys<device *, module>,
+             prog->binaries().begin(), prog->binaries().end()));
+
+   case CL_PROGRAM_SOURCE:
+      return string_property(buf, size, size_ret, prog->source());
+
+   case CL_PROGRAM_BINARY_SIZES:
+      return vector_property<size_t>(
+         buf, size, size_ret,
+         map([](const std::pair<device *, module> &ent) {
+               compat::ostream::buffer_t bin;
+               compat::ostream s(bin);
+               ent.second.serialize(s);
+               return bin.size();
+            },
+            prog->binaries().begin(), prog->binaries().end()));
+
+   case CL_PROGRAM_BINARIES:
+      return matrix_property<unsigned char>(
+         buf, size, size_ret,
+         map([](const std::pair<device *, module> &ent) {
+               compat::ostream::buffer_t bin;
+               compat::ostream s(bin);
+               ent.second.serialize(s);
+               return bin;
+            },
+            prog->binaries().begin(), prog->binaries().end()));
+
+   default:
+      return CL_INVALID_VALUE;
+   }
+}
+
+PUBLIC cl_int
+clGetProgramBuildInfo(cl_program prog, cl_device_id dev,
+                      cl_program_build_info param,
+                      size_t size, void *buf, size_t *size_ret) {
+   if (!prog)
+      return CL_INVALID_PROGRAM;
+
+   if (!prog->ctx.has_device(dev))
+      return CL_INVALID_DEVICE;
+
+   switch (param) {
+   case CL_PROGRAM_BUILD_STATUS:
+      return scalar_property<cl_build_status>(buf, size, size_ret,
+                                              prog->build_status(dev));
+
+   case CL_PROGRAM_BUILD_OPTIONS:
+      return string_property(buf, size, size_ret, prog->build_opts(dev));
+
+   case CL_PROGRAM_BUILD_LOG:
+      return string_property(buf, size, size_ret, prog->build_log(dev));
+
+   default:
+      return CL_INVALID_VALUE;
+   }
+}
diff --git a/src/gallium/state_trackers/clover/api/queue.cpp b/src/gallium/state_trackers/clover/api/queue.cpp
new file mode 100644
index 00000000000..a7905bc4396
--- /dev/null
+++ b/src/gallium/state_trackers/clover/api/queue.cpp
@@ -0,0 +1,102 @@
+//
+// Copyright 2012 Francisco Jerez
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+// OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+
+#include "api/util.hpp"
+#include "core/queue.hpp"
+
+using namespace clover;
+
+PUBLIC cl_command_queue
+clCreateCommandQueue(cl_context ctx, cl_device_id dev,
+                     cl_command_queue_properties props,
+                     cl_int *errcode_ret) try {
+   if (!ctx)
+      throw error(CL_INVALID_CONTEXT);
+
+   if (!ctx->has_device(dev))
+      throw error(CL_INVALID_DEVICE);
+   // Any property bit other than the two named below is rejected.
+   if (props & ~(CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
+                 CL_QUEUE_PROFILING_ENABLE))
+      throw error(CL_INVALID_VALUE);
+   // Success is stored first; the handler below overwrites it on error.
+   ret_error(errcode_ret, CL_SUCCESS);
+   return new command_queue(*ctx, *dev, props);
+
+} catch (error &e) {
+   ret_error(errcode_ret, e);
+   return NULL;
+}
+
+PUBLIC cl_int
+clRetainCommandQueue(cl_command_queue q) {
+   if (!q)
+      return CL_INVALID_COMMAND_QUEUE;
+
+   q->retain();
+   return CL_SUCCESS;
+}
+
+PUBLIC cl_int
+clReleaseCommandQueue(cl_command_queue q) {
+   if (!q)
+      return CL_INVALID_COMMAND_QUEUE;
+   // The queue is deleted once release() returns true.
+   if (q->release())
+      delete q;
+
+   return CL_SUCCESS;
+}
+
+PUBLIC cl_int
+clGetCommandQueueInfo(cl_command_queue q, cl_command_queue_info param,
+                      size_t size, void *buf, size_t *size_ret) {
+   if (!q)
+      return CL_INVALID_COMMAND_QUEUE;
+
+   switch (param) {
+   case CL_QUEUE_CONTEXT:
+      return scalar_property<cl_context>(buf, size, size_ret, &q->ctx);
+
+   case CL_QUEUE_DEVICE:
+      return scalar_property<cl_device_id>(buf, size, size_ret, &q->dev);
+
+   case CL_QUEUE_REFERENCE_COUNT:
+      return scalar_property<cl_uint>(buf, size, size_ret, q->ref_count());
+
+   case CL_QUEUE_PROPERTIES:
+      return scalar_property<cl_command_queue_properties>(buf, size, size_ret,
+                                                          q->props());
+
+   default:
+      return CL_INVALID_VALUE;
+   }
+}
+
+PUBLIC cl_int
+clFlush(cl_command_queue q) {
+   if (!q)
+      return CL_INVALID_COMMAND_QUEUE;
+
+   q->flush();
+   return CL_SUCCESS;
+}
diff --git a/src/gallium/state_trackers/clover/api/sampler.cpp b/src/gallium/state_trackers/clover/api/sampler.cpp
new file mode 100644
index 00000000000..32ce22ef90f
--- /dev/null
+++ b/src/gallium/state_trackers/clover/api/sampler.cpp
@@ -0,0 +1,90 @@
+//
+// Copyright 2012 Francisco Jerez
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+// OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+
+#include "api/util.hpp"
+#include "core/sampler.hpp"
+
+using namespace clover;
+
+PUBLIC cl_sampler
+clCreateSampler(cl_context ctx, cl_bool norm_mode,
+                cl_addressing_mode addr_mode, cl_filter_mode filter_mode,
+                cl_int *errcode_ret) try {
+   if (!ctx)
+      throw error(CL_INVALID_CONTEXT);
+
+   ret_error(errcode_ret, CL_SUCCESS);
+   return new sampler(*ctx, norm_mode, addr_mode, filter_mode);
+
+} catch (error &e) {
+   ret_error(errcode_ret, e);
+   return NULL;
+}
+
+PUBLIC cl_int
+clRetainSampler(cl_sampler s) {
+   if (!s)
+      return CL_INVALID_SAMPLER;
+   // Returned, not thrown: this function has no handler for clover::error.
+   s->retain();
+   return CL_SUCCESS;
+}
+
+PUBLIC cl_int
+clReleaseSampler(cl_sampler s) {
+   if (!s)
+      return CL_INVALID_SAMPLER;
+   // Returned, not thrown: this function has no handler for clover::error.
+   if (s->release())
+      delete s;
+
+   return CL_SUCCESS;
+}
+
+PUBLIC cl_int
+clGetSamplerInfo(cl_sampler s, cl_sampler_info param,
+                 size_t size, void *buf, size_t *size_ret) {
+   if (!s)
+      return CL_INVALID_SAMPLER;
+   // Returned, not thrown: this function has no handler for clover::error.
+   switch (param) {
+   case CL_SAMPLER_REFERENCE_COUNT:
+      return scalar_property<cl_uint>(buf, size, size_ret, s->ref_count());
+
+   case CL_SAMPLER_CONTEXT:
+      return scalar_property<cl_context>(buf, size, size_ret, &s->ctx);
+
+   case CL_SAMPLER_NORMALIZED_COORDS:
+      return scalar_property<cl_bool>(buf, size, size_ret, s->norm_mode());
+
+   case CL_SAMPLER_ADDRESSING_MODE:
+      return scalar_property<cl_addressing_mode>(buf, size, size_ret,
+                                                 s->addr_mode());
+
+   case CL_SAMPLER_FILTER_MODE:
+      return scalar_property<cl_filter_mode>(buf, size, size_ret,
+                                             s->filter_mode());
+
+   default:
+      return CL_INVALID_VALUE;
+   }
+}
diff --git a/src/gallium/state_trackers/clover/api/transfer.cpp b/src/gallium/state_trackers/clover/api/transfer.cpp
new file mode 100644
index 00000000000..c67b75e8034
--- /dev/null
+++ b/src/gallium/state_trackers/clover/api/transfer.cpp
@@ -0,0 +1,506 @@
+//
+// Copyright 2012 Francisco Jerez
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+// OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+
+#include <cstring>
+
+#include "api/util.hpp"
+#include "core/event.hpp"
+#include "core/resource.hpp"
+
+using namespace clover;
+
+namespace {
+   typedef resource::point point;
+
+   ///
+   /// Common argument checking shared by memory transfer commands.
+   ///
+   void
+   validate_base(cl_command_queue q, cl_uint num_deps, const cl_event *deps) {
+      if (!q)
+         throw error(CL_INVALID_COMMAND_QUEUE);
+      // The count and the array must be both zero/NULL or both set.
+      if (bool(num_deps) != bool(deps) ||
+          any_of(is_zero<cl_event>(), deps, deps + num_deps))
+         throw error(CL_INVALID_EVENT_WAIT_LIST);
+      // Every listed event has to belong to the queue's context.
+      if (any_of([&](const cl_event ev) {
+               return &ev->ctx != &q->ctx;
+            }, deps, deps + num_deps))
+         throw error(CL_INVALID_CONTEXT);
+   }
+
+   ///
+   /// Memory object-specific argument checking shared by most memory
+   /// transfer commands.
+   ///
+   void
+   validate_obj(cl_command_queue q, cl_mem obj) {
+      if (!obj)
+         throw error(CL_INVALID_MEM_OBJECT);
+
+      if (&obj->ctx != &q->ctx)
+         throw error(CL_INVALID_CONTEXT);
+   }
+
+   ///
+   /// Class that encapsulates the task of mapping an object of type
+   /// \a T.  The return value of get() should be implicitly
+   /// convertible to \a void *.
+   ///
+   template<typename T> struct __map;
+
+   template<> struct __map<void *> {
+      static void *
+      get(cl_command_queue q, void *obj, cl_map_flags flags,
+          size_t offset, size_t size) {
+         return (char *)obj + offset;
+      }
+   };
+
+   template<> struct __map<const void *> {
+      static const void *
+      get(cl_command_queue q, const void *obj, cl_map_flags flags,
+          size_t offset, size_t size) {
+         return (const char *)obj + offset;
+      }
+   };
+
+   template<> struct __map<memory_obj *> {
+      static mapping
+      get(cl_command_queue q, memory_obj *obj, cl_map_flags flags,
+          size_t offset, size_t size) {
+         return { *q, obj->resource(q), flags, true, { offset }, { size }};
+      }
+   };
+
+   ///
+   /// Software copy from \a src_obj to \a dst_obj.  They can be
+   /// either pointers or memory objects.
+   ///
+   template<typename T, typename S>
+   std::function<void (event &)>
+   soft_copy_op(cl_command_queue q,
+                T dst_obj, const point &dst_orig, const point &dst_pitch,
+                S src_obj, const point &src_orig, const point &src_pitch,
+                const point &region) {
+      return [=](event &) {   // all arguments are captured by copy
+         auto dst = __map<T>::get(q, dst_obj, CL_MAP_WRITE,
+                                  dst_pitch(dst_orig), dst_pitch(region));
+         auto src = __map<S>::get(q, src_obj, CL_MAP_READ,
+                                  src_pitch(src_orig), src_pitch(region));
+         point p;   // NOTE(review): p[0] is never assigned; relies on point's ctor
+         // One memcpy of src_pitch[0] * region[0] bytes per (p[1], p[2]).
+         for (p[2] = 0; p[2] < region[2]; ++p[2]) {
+            for (p[1] = 0; p[1] < region[1]; ++p[1]) {
+               std::memcpy(static_cast<char *>(dst) + dst_pitch(p),
+                           static_cast<const char *>(src) + src_pitch(p),
+                           src_pitch[0] * region[0]);
+            }
+         }
+      };
+   }
+
+   ///
+   /// Hardware copy from \a src_obj to \a dst_obj.
+   ///
+   template<typename T, typename S>
+   std::function<void (event &)>
+   hard_copy_op(cl_command_queue q, T dst_obj, const point &dst_orig,
+                S src_obj, const point &src_orig, const point &region) {
+      return [=](event &) {
+         dst_obj->resource(q).copy(*q, dst_orig, region,
+                                   src_obj->resource(q), src_orig);
+      };
+   }
+}
+
+PUBLIC cl_int
+clEnqueueReadBuffer(cl_command_queue q, cl_mem obj, cl_bool blocking,
+                    size_t offset, size_t size, void *ptr,
+                    cl_uint num_deps, const cl_event *deps,
+                    cl_event *ev) try {
+   validate_base(q, num_deps, deps);
+   validate_obj(q, obj);
+   // Range check written so that it cannot wrap, unlike "offset + size".
+   if (!ptr || offset > obj->size() || size > obj->size() - offset)
+      throw error(CL_INVALID_VALUE);
+
+   hard_event *hev = new hard_event(
+      *q, CL_COMMAND_READ_BUFFER, { deps, deps + num_deps },
+      soft_copy_op(q,
+                   ptr, { 0 }, { 1 },
+                   obj, { offset }, { 1 },
+                   { size, 1, 1 }));
+
+   ret_object(ev, hev);
+   return CL_SUCCESS;
+
+} catch (error &e) {
+   return e.get();
+}
+
+PUBLIC cl_int
+clEnqueueWriteBuffer(cl_command_queue q, cl_mem obj, cl_bool blocking,
+                     size_t offset, size_t size, const void *ptr,
+                     cl_uint num_deps, const cl_event *deps,
+                     cl_event *ev) try {
+   validate_base(q, num_deps, deps);
+   validate_obj(q, obj);
+   // Range check written so that it cannot wrap, unlike "offset + size".
+   if (!ptr || offset > obj->size() || size > obj->size() - offset)
+      throw error(CL_INVALID_VALUE);
+
+   hard_event *hev = new hard_event(
+      *q, CL_COMMAND_WRITE_BUFFER, { deps, deps + num_deps },
+      soft_copy_op(q,
+                   obj, { offset }, { 1 },
+                   ptr, { 0 }, { 1 },
+                   { size, 1, 1 }));
+
+   ret_object(ev, hev);
+   return CL_SUCCESS;
+
+} catch (error &e) {
+   return e.get();
+}
+
+PUBLIC cl_int
+clEnqueueReadBufferRect(cl_command_queue q, cl_mem obj, cl_bool blocking,
+                        const size_t *obj_origin, const size_t *host_origin,
+                        const size_t *region,
+                        size_t obj_row_pitch, size_t obj_slice_pitch,
+                        size_t host_row_pitch, size_t host_slice_pitch,
+                        void *ptr,
+                        cl_uint num_deps, const cl_event *deps,
+                        cl_event *ev) try {
+   validate_base(q, num_deps, deps);
+   validate_obj(q, obj);
+
+   if (!ptr)
+      throw error(CL_INVALID_VALUE);
+
+   hard_event *hev = new hard_event(
+      *q, CL_COMMAND_READ_BUFFER_RECT, { deps, deps + num_deps },
+      soft_copy_op(q,
+                   ptr, host_origin,
+                   { 1, host_row_pitch, host_slice_pitch },
+                   obj, obj_origin,
+                   { 1, obj_row_pitch, obj_slice_pitch },
+                   region));
+
+   ret_object(ev, hev);
+   return CL_SUCCESS;
+
+} catch (error &e) {
+   return e.get();
+}
+
+PUBLIC cl_int
+clEnqueueWriteBufferRect(cl_command_queue q, cl_mem obj, cl_bool blocking,
+                         const size_t *obj_origin, const size_t *host_origin,
+                         const size_t *region,
+                         size_t obj_row_pitch, size_t obj_slice_pitch,
+                         size_t host_row_pitch, size_t host_slice_pitch,
+                         const void *ptr,
+                         cl_uint num_deps, const cl_event *deps,
+                         cl_event *ev) try {
+   validate_base(q, num_deps, deps);
+   validate_obj(q, obj);
+
+   if (!ptr)
+      throw error(CL_INVALID_VALUE);
+
+   hard_event *hev = new hard_event(
+      *q, CL_COMMAND_WRITE_BUFFER_RECT, { deps, deps + num_deps },
+      soft_copy_op(q,
+                   obj, obj_origin,
+                   { 1, obj_row_pitch, obj_slice_pitch },
+                   ptr, host_origin,
+                   { 1, host_row_pitch, host_slice_pitch },
+                   region));
+
+   ret_object(ev, hev);
+   return CL_SUCCESS;
+
+} catch (error &e) {
+   return e.get();
+}
+
+PUBLIC cl_int
+clEnqueueCopyBuffer(cl_command_queue q, cl_mem src_obj, cl_mem dst_obj,
+                    size_t src_offset, size_t dst_offset, size_t size,
+                    cl_uint num_deps, const cl_event *deps,
+                    cl_event *ev) try {
+   validate_base(q, num_deps, deps);
+   validate_obj(q, src_obj);
+   validate_obj(q, dst_obj);
+
+   hard_event *hev = new hard_event(
+      *q, CL_COMMAND_COPY_BUFFER, { deps, deps + num_deps },
+      hard_copy_op(q, dst_obj, { dst_offset },
+                   src_obj, { src_offset },
+                   { size, 1, 1 }));
+
+   ret_object(ev, hev);
+   return CL_SUCCESS;
+
+} catch (error &e) {
+   return e.get();
+}
+
+PUBLIC cl_int
+clEnqueueCopyBufferRect(cl_command_queue q, cl_mem src_obj, cl_mem dst_obj,
+                        const size_t *src_origin, const size_t *dst_origin,
+                        const size_t *region,
+                        size_t src_row_pitch, size_t src_slice_pitch,
+                        size_t dst_row_pitch, size_t dst_slice_pitch,
+                        cl_uint num_deps, const cl_event *deps,
+                        cl_event *ev) try {
+   validate_base(q, num_deps, deps);
+   validate_obj(q, src_obj);
+   validate_obj(q, dst_obj);
+
+   hard_event *hev = new hard_event(
+      *q, CL_COMMAND_COPY_BUFFER_RECT, { deps, deps + num_deps },
+      soft_copy_op(q,
+                   dst_obj, dst_origin,
+                   { 1, dst_row_pitch, dst_slice_pitch },
+                   src_obj, src_origin,
+                   { 1, src_row_pitch, src_slice_pitch },
+                   region));
+
+   ret_object(ev, hev);
+   return CL_SUCCESS;
+
+} catch (error &e) {
+   return e.get();
+}
+
+PUBLIC cl_int
+clEnqueueReadImage(cl_command_queue q, cl_mem obj, cl_bool blocking,
+                   const size_t *origin, const size_t *region,
+                   size_t row_pitch, size_t slice_pitch, void *ptr,
+                   cl_uint num_deps, const cl_event *deps,
+                   cl_event *ev) try {
+   image *img = dynamic_cast<image *>(obj);
+
+   validate_base(q, num_deps, deps);
+   validate_obj(q, img);
+
+   if (!ptr)
+      throw error(CL_INVALID_VALUE);
+
+   hard_event *hev = new hard_event(
+      *q, CL_COMMAND_READ_IMAGE, { deps, deps + num_deps },
+      soft_copy_op(q,
+                   ptr, {},
+                   { 1, row_pitch, slice_pitch },
+                   obj, origin,
+                   { 1, img->row_pitch(), img->slice_pitch() },
+                   region));
+
+   ret_object(ev, hev);
+   return CL_SUCCESS;
+
+} catch (error &e) {
+   return e.get();
+}
+
+PUBLIC cl_int
+clEnqueueWriteImage(cl_command_queue q, cl_mem obj, cl_bool blocking,
+                    const size_t *origin, const size_t *region,
+                    size_t row_pitch, size_t slice_pitch, const void *ptr,
+                    cl_uint num_deps, const cl_event *deps,
+                    cl_event *ev) try {
+   image *img = dynamic_cast<image *>(obj);
+
+   validate_base(q, num_deps, deps);
+   validate_obj(q, img);
+
+   if (!ptr)
+      throw error(CL_INVALID_VALUE);
+
+   hard_event *hev = new hard_event(
+      *q, CL_COMMAND_WRITE_IMAGE, { deps, deps + num_deps },
+      soft_copy_op(q,
+                   obj, origin,
+                   { 1, img->row_pitch(), img->slice_pitch() },
+                   ptr, {},
+                   { 1, row_pitch, slice_pitch },
+                   region));
+
+   ret_object(ev, hev);
+   return CL_SUCCESS;
+
+} catch (error &e) {
+   return e.get();
+}
+
+PUBLIC cl_int
+clEnqueueCopyImage(cl_command_queue q, cl_mem src_obj, cl_mem dst_obj,
+                   const size_t *src_origin, const size_t *dst_origin,
+                   const size_t *region,
+                   cl_uint num_deps, const cl_event *deps,
+                   cl_event *ev) try {
+   image *src_img = dynamic_cast<image *>(src_obj);
+   image *dst_img = dynamic_cast<image *>(dst_obj);
+
+   validate_base(q, num_deps, deps);
+   validate_obj(q, src_img);
+   validate_obj(q, dst_img);
+
+   hard_event *hev = new hard_event(
+      *q, CL_COMMAND_COPY_IMAGE, { deps, deps + num_deps },
+      hard_copy_op(q, dst_obj, dst_origin, src_obj, src_origin, region));
+
+   ret_object(ev, hev);
+   return CL_SUCCESS;
+
+} catch (error &e) {
+   return e.get();
+}
+
+PUBLIC cl_int
+clEnqueueCopyImageToBuffer(cl_command_queue q, cl_mem src_obj, cl_mem dst_obj,
+                           const size_t *src_origin, const size_t *region,
+                           size_t dst_offset,
+                           cl_uint num_deps, const cl_event *deps,
+                           cl_event *ev) try {
+   image *src_img = dynamic_cast<image *>(src_obj);
+
+   validate_base(q, num_deps, deps);
+   validate_obj(q, src_img);
+   validate_obj(q, dst_obj);
+
+   hard_event *hev = new hard_event(
+      *q, CL_COMMAND_COPY_IMAGE_TO_BUFFER, { deps, deps + num_deps },
+      soft_copy_op(q,
+                   dst_obj, { dst_offset },
+                   { 0, 0, 0 },
+                   src_obj, src_origin,
+                   { 1, src_img->row_pitch(), src_img->slice_pitch() },
+                   region));
+
+   ret_object(ev, hev);
+   return CL_SUCCESS;
+
+} catch (error &e) {
+   return e.get();
+}
+
+PUBLIC cl_int
+clEnqueueCopyBufferToImage(cl_command_queue q, cl_mem src_obj, cl_mem dst_obj,
+                           size_t src_offset,
+                           const size_t *dst_origin, const size_t *region,
+                           cl_uint num_deps, const cl_event *deps,
+                           cl_event *ev) try {
+   image *dst_img = dynamic_cast<image *>(dst_obj);
+   // validate_obj() rejects a NULL dst_img, i.e. a destination that is no image.
+   validate_base(q, num_deps, deps);
+   validate_obj(q, src_obj);
+   validate_obj(q, dst_img);
+
+   hard_event *hev = new hard_event(
+      *q, CL_COMMAND_COPY_BUFFER_TO_IMAGE, { deps, deps + num_deps },
+      soft_copy_op(q,
+                   dst_obj, dst_origin,
+                   { 1, dst_img->row_pitch(), dst_img->slice_pitch() },
+                   src_obj, { src_offset },
+                   { 0, 0, 0 },
+                   region));
+
+   ret_object(ev, hev);
+   return CL_SUCCESS;
+
+} catch (error &e) {
+   return e.get();
+}
+
+PUBLIC void *
+clEnqueueMapBuffer(cl_command_queue q, cl_mem obj, cl_bool blocking,
+                   cl_map_flags flags, size_t offset, size_t size,
+                   cl_uint num_deps, const cl_event *deps,
+                   cl_event *ev, cl_int *errcode_ret) try {
+   validate_base(q, num_deps, deps);
+   validate_obj(q, obj);
+   // Range check written so that it cannot wrap, unlike "offset + size".
+   if (offset > obj->size() || size > obj->size() - offset)
+      throw error(CL_INVALID_VALUE);
+
+   void *map = obj->resource(q).add_map(
+      *q, flags, blocking, { offset }, { size });
+
+   ret_object(ev, new hard_event(*q, CL_COMMAND_MAP_BUFFER,
+                                 { deps, deps + num_deps }));
+   ret_error(errcode_ret, CL_SUCCESS);
+   return map;
+
+} catch (error &e) {
+   ret_error(errcode_ret, e);
+   return NULL;
+}
+
+PUBLIC void *
+clEnqueueMapImage(cl_command_queue q, cl_mem obj, cl_bool blocking,
+                  cl_map_flags flags,
+                  const size_t *origin, const size_t *region,
+                  size_t *row_pitch, size_t *slice_pitch,
+                  cl_uint num_deps, const cl_event *deps,
+                  cl_event *ev, cl_int *errcode_ret) try {
+   image *img = dynamic_cast<image *>(obj);
+
+   validate_base(q, num_deps, deps);
+   validate_obj(q, img);
+
+   void *map = obj->resource(q).add_map(
+      *q, flags, blocking, origin, region);
+
+   ret_object(ev, new hard_event(*q, CL_COMMAND_MAP_IMAGE,
+                                 { deps, deps + num_deps }));
+   ret_error(errcode_ret, CL_SUCCESS);
+   return map;
+
+} catch (error &e) {
+   ret_error(errcode_ret, e);
+   return NULL;
+}
+
+PUBLIC cl_int
+clEnqueueUnmapMemObject(cl_command_queue q, cl_mem obj, void *ptr,
+                        cl_uint num_deps, const cl_event *deps,
+                        cl_event *ev) try {
+   validate_base(q, num_deps, deps);
+   validate_obj(q, obj);
+
+   hard_event *hev = new hard_event(
+      *q, CL_COMMAND_UNMAP_MEM_OBJECT, { deps, deps + num_deps },
+      [=](event &) {
+         obj->resource(q).del_map(ptr);
+      });
+
+   ret_object(ev, hev);
+   return CL_SUCCESS;
+
+} catch (error &e) {
+   return e.get();
+}
diff --git a/src/gallium/state_trackers/clover/api/util.hpp b/src/gallium/state_trackers/clover/api/util.hpp
new file mode 100644
index 00000000000..2f9ec1f6a10
--- /dev/null
+++ b/src/gallium/state_trackers/clover/api/util.hpp
@@ -0,0 +1,166 @@
+//
+// Copyright 2012 Francisco Jerez
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+// OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+
+#ifndef __CL_UTIL_HPP__
+#define __CL_UTIL_HPP__
+
+#include <cstdint>
+#include <cstring>
+#include <algorithm>
+#include <map>
+
+#include "core/base.hpp"
+#include "pipe/p_compiler.h"
+
+namespace clover {
+   ///
+   /// Return a matrix (a container of containers) in \a buf with
+   /// argument and bounds checking.  Intended to be used by
+   /// implementations of \a clGetXXXInfo().
+   ///
+   template<typename T, typename V>
+   cl_int
+   matrix_property(void *buf, size_t size, size_t *size_ret, const V& v) {
+      if (buf && size < sizeof(T *) * v.size())
+         return CL_INVALID_VALUE;
+      // buf is an array of v.size() row pointers, so sizes count pointers.
+      if (size_ret)
+         *size_ret = sizeof(T *) * v.size();
+      // NULL row pointers are skipped; row capacities are not checked here.
+      if (buf)
+         for_each([](typename V::value_type src, T *dst) {
+               if (dst)
+                  std::copy(src.begin(), src.end(), dst);
+            },
+            v.begin(), v.end(), (T **)buf);
+
+      return CL_SUCCESS;
+   }
+
+   ///
+   /// Return a vector in \a buf with argument and bounds checking.
+   /// Intended to be used by implementations of \a clGetXXXInfo().
+   ///
+   template<typename T, typename V>
+   cl_int
+   vector_property(void *buf, size_t size, size_t *size_ret, const V& v) {
+      if (buf && size < sizeof(T) * v.size())
+         return CL_INVALID_VALUE;
+
+      if (size_ret)
+         *size_ret = sizeof(T) * v.size();
+      if (buf)
+         std::copy(v.begin(), v.end(), (T *)buf);
+
+      return CL_SUCCESS;
+   }
+
+   ///
+   /// Return a scalar in \a buf with argument and bounds checking.
+   /// Intended to be used by implementations of \a clGetXXXInfo().
+   ///
+   template<typename T>
+   cl_int
+   scalar_property(void *buf, size_t size, size_t *size_ret, T v) {
+      return vector_property<T>(buf, size, size_ret, std::vector<T>(1, v));
+   }
+
+   ///
+   /// Return a string in \a buf with argument and bounds checking.
+   /// Intended to be used by implementations of \a clGetXXXInfo().
+   ///
+   inline cl_int
+   string_property(void *buf, size_t size, size_t *size_ret,
+                   const std::string &v) {
+      if (buf && size < v.size() + 1)
+         return CL_INVALID_VALUE;
+
+      if (size_ret)
+         *size_ret = v.size() + 1;
+      if (buf)
+         std::strcpy((char *)buf, v.c_str());
+
+      return CL_SUCCESS;
+   }
+
+   ///
+   /// Convert a NULL-terminated property list into an std::map.
+   ///
+   template<typename T>
+   std::map<T, T>
+   property_map(const T *props) {
+      std::map<T, T> m;
+      // A NULL list yields an empty map; a zero key ends the list.
+      while (props && *props) {
+         T key = *props++;
+         T value = *props++;
+         // A key may appear only once.
+         if (m.count(key))
+            throw clover::error(CL_INVALID_PROPERTY);
+
+         m.insert({ key, value });
+      }
+
+      return m;
+   }
+
+   ///
+   /// Convert an std::map into a NULL-terminated property list.
+   ///
+   template<typename T>
+   std::vector<T>
+   property_vector(const std::map<T, T> &m) {
+      std::vector<T> v;
+
+      for (auto &p : m) {
+         v.push_back(p.first);
+         v.push_back(p.second);
+      }
+
+      v.push_back(0);
+      return v;
+   }
+
+   ///
+   /// Return an error code in \a p if non-zero.
+   ///
+   inline void
+   ret_error(cl_int *p, const clover::error &e) {
+      if (p)
+         *p = e.get();
+   }
+
+   ///
+   /// Return a reference-counted object in \a p if non-zero.
+   /// Otherwise release object ownership.
+   ///
+   template<typename T, typename S>
+   void
+   ret_object(T p, S v) {
+      if (p)
+         *p = v;
+      else
+         v->release();   // NOTE(review): result ignored; v is never deleted here
+   }
+}
+
+#endif
diff --git a/src/gallium/state_trackers/clover/core/base.hpp b/src/gallium/state_trackers/clover/core/base.hpp
new file mode 100644
index 00000000000..19053f39235
--- /dev/null
+++ b/src/gallium/state_trackers/clover/core/base.hpp
@@ -0,0 +1,285 @@
+//
+// Copyright 2012 Francisco Jerez
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+// OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+
+#ifndef __CORE_BASE_HPP__
+#define __CORE_BASE_HPP__
+
+#include <stdexcept>
+#include <atomic>
+#include <cassert>
+#include <tuple>
+#include <vector>
+#include <functional>
+
+#include "CL/cl.h"
+
+///
+/// Main namespace of the CL state tracker.
+///
+namespace clover {
+   ///
+   /// Class that represents an error that can be converted to an
+   /// OpenCL status code.
+   ///
+   /// The status code is kept next to the optional message that is
+   /// forwarded to std::runtime_error.
+   ///
+   class error : public std::runtime_error {
+   public:
+      error(cl_int code, std::string what = "") :
+         std::runtime_error(what), code(code) {
+      }
+
+      /// Status code this error was constructed with.
+      cl_int get() const {
+         return code;
+      }
+
+   protected:
+      cl_int code;
+   };
+
+   ///
+   /// Base class for objects that support reference counting.
+   ///
+   /// The count starts out at one.  It is held in an std::atomic, so
+   /// each individual retain() or release() is a single atomic
+   /// read-modify-write.
+   ///
+   class ref_counter {
+   public:
+      ref_counter() : __ref_count(1) {}
+
+      /// Current number of references.
+      unsigned ref_count() {
+         return __ref_count.load();
+      }
+
+      /// Take one more reference.
+      void retain() {
+         __ref_count.fetch_add(1);
+      }
+
+      /// Drop one reference.  Returns true if that was the last one;
+      /// the object itself is not destroyed here.
+      bool release() {
+         // fetch_sub() yields the value before the decrement.
+         return __ref_count.fetch_sub(1) == 1;
+      }
+
+   private:
+      std::atomic<unsigned> __ref_count;
+   };
+
+   ///
+   /// Intrusive smart pointer for objects that implement the
+   /// clover::ref_counter interface.
+   ///
+   /// A ref_ptr holds one reference on the object it points to: it
+   /// calls retain() when it starts pointing at an object and
+   /// release() when it stops, deleting the object if release()
+   /// reports that the last reference is gone.
+   ///
+   template<typename T>
+   class ref_ptr {
+   public:
+      ref_ptr(T *q = NULL) : p(NULL) {
+         reset(q);
+      }
+
+      // A constructor template is never a copy constructor, so
+      // without this overload the implicitly generated one would copy
+      // the raw pointer without retaining it, and the object would be
+      // released once too often.
+      ref_ptr(const ref_ptr &ref) : p(NULL) {
+         reset(ref.p);
+      }
+
+      /// Converting copy from a ref_ptr to a compatible type \a S.
+      template<typename S>
+      ref_ptr(const ref_ptr<S> &ref) : p(NULL) {
+         reset(ref.p);
+      }
+
+      ~ref_ptr() {
+         reset(NULL);
+      }
+
+      /// Point at \a q instead (which may be NULL), dropping the
+      /// reference held on the previous object.
+      void reset(T *q = NULL) {
+         // Retain the new object before releasing the old one, so that
+         // resetting to the object already held cannot destroy it.
+         if (q)
+            q->retain();
+         if (p && p->release())
+            delete p;
+         p = q;
+      }
+
+      ref_ptr &operator=(const ref_ptr &ref) {
+         reset(ref.p);
+         return *this;
+      }
+
+      /// Note that this yields the raw pointer, not a reference.
+      T *operator*() const {
+         return p;
+      }
+
+      T *operator->() const {
+         return p;
+      }
+
+      /// True if an object is held.
+      operator bool() const {
+         return p;
+      }
+
+   private:
+      // The converting constructor reads \a p of a different
+      // specialization, which is otherwise inaccessible.
+      template<typename S> friend class ref_ptr;
+
+      T *p;
+   };
+
+   ///
+   /// Transfer the caller's ownership of a reference-counted object
+   /// to a clover::ref_ptr smart pointer.
+   ///
+   /// The returned pointer takes a reference of its own and the one
+   /// the caller held is released, so the count is the same as before.
+   ///
+   template<typename T>
+   inline ref_ptr<T>
+   transfer(T *p) {
+      // Acquire the new reference first, so that giving up the
+      // caller's one below cannot be the last release.
+      ref_ptr<T> owner(p);
+
+      p->release();
+      return owner;
+   }
+
+   ///
+   /// Implementation detail of for_each(), map(), fold() and any_of().
+   ///
+   /// \a its is a tuple of iterators.  Every recursion level
+   /// dereferences and post-increments the \a N-th one and prepends
+   /// the element to \a args, so that once \a N reaches zero the
+   /// elements are lined up in iterator order.  \a T is the type
+   /// returned by the step and \a S the type of the \a state value
+   /// that is passed along unchanged.
+   ///
+   template<typename T, typename S, int N>
+   struct __iter_helper {
+      template<typename F, typename Its, typename... Args>
+      static T
+      step(F op, S state, Its its, Args... args) {
+         return __iter_helper<T, S, N - 1>::step(
+            op, state, its, *(std::get<N>(its)++), args...);
+      }
+   };
+
+   // End of the recursion: \a op receives \a state followed by one
+   // element from each iterator.
+   template<typename T, typename S>
+   struct __iter_helper<T, S, 0> {
+      template<typename F, typename Its, typename... Args>
+      static T
+      step(F op, S state, Its its, Args... args) {
+         return op(state, *(std::get<0>(its)++), args...);
+      }
+   };
+
+   // Tag type used as \a S when there is no state to pass to \a op.
+   struct __empty {};
+
+   // End of the recursion for the stateless case: \a op receives the
+   // elements only.
+   template<typename T>
+   struct __iter_helper<T, __empty, 0> {
+      template<typename F, typename Its, typename... Args>
+      static T
+      step(F op, __empty state, Its its, Args... args) {
+         return op(*(std::get<0>(its)++), args...);
+      }
+   };
+
+   ///
+   /// Compute the type returned by \a F when it is called with the
+   /// value types of the iterators \a Its, with any top-level const
+   /// removed.
+   ///
+   template<typename F, typename... Its>
+   struct __result_helper {
+      typedef typename std::remove_const<
+         typename std::result_of<
+            F (typename std::iterator_traits<Its>::value_type...)
+            >::type
+         >::type type;
+   };
+
+   ///
+   /// Iterate \a op on the result of zipping all the specified
+   /// iterators together.
+   ///
+   /// Similar to std::for_each, but it accepts functions of an
+   /// arbitrary number of arguments.
+   ///
+   /// Only the first range is bounded, by \a end0; the iterators in
+   /// \a its are advanced in lockstep with \a it0 without any check,
+   /// so they have to cover at least as many elements.  Returns
+   /// \a op.
+   ///
+   template<typename F, typename It0, typename... Its>
+   F
+   for_each(F op, It0 it0, It0 end0, Its... its) {
+      // std::tie hands the helper references to the local iterators,
+      // so the increments it performs are what make this loop advance.
+      while (it0 != end0)
+         __iter_helper<void, __empty, sizeof...(Its)>::step(
+            op, {}, std::tie(it0, its...));
+
+      return op;
+   }
+
+   ///
+   /// Iterate \a op on the result of zipping all the specified
+   /// iterators together, storing return values in a new container.
+   ///
+   /// Similar to std::transform, but it accepts functions of an
+   /// arbitrary number of arguments and it doesn't have to be
+   /// provided with an output iterator.
+   ///
+   /// The container type \a C defaults to an std::vector of the
+   /// result type of \a op.  Only the first range is bounded, by
+   /// \a end0; the iterators in \a its are advanced in lockstep
+   /// without any check.
+   ///
+   template<typename F, typename It0, typename... Its,
+            typename C = std::vector<
+               typename __result_helper<F, It0, Its...>::type>>
+   C
+   map(F op, It0 it0, It0 end0, Its... its) {
+      C c;
+
+      // The helper advances the iterators through the references
+      // captured by std::tie.
+      while (it0 != end0)
+         c.push_back(
+            __iter_helper<typename C::value_type, __empty, sizeof...(Its)>
+            ::step(op, {}, std::tie(it0, its...)));
+
+      return c;
+   }
+
+   ///
+   /// Reduce the result of zipping all the specified iterators
+   /// together, using iterative application of \a op from left to
+   /// right.
+   ///
+   /// Similar to std::accumulate, but it accepts functions of an
+   /// arbitrary number of arguments.
+   ///
+   /// \a a is the initial accumulator; \a op is called with the
+   /// current accumulator followed by one element from each iterator
+   /// and its result becomes the new accumulator.  Only the first
+   /// range is bounded, by \a end0.
+   ///
+   template<typename F, typename T, typename It0, typename... Its>
+   T
+   fold(F op, T a, It0 it0, It0 end0, Its... its) {
+      // The helper advances the iterators through the references
+      // captured by std::tie.
+      while (it0 != end0)
+         a = __iter_helper<T, T, sizeof...(Its)>::step(
+            op, a, std::tie(it0, its...));
+
+      return a;
+   }
+
+   ///
+   /// Iterate \a op on the result of zipping the specified iterators
+   /// together, checking if any of the evaluations returns \a true.
+   ///
+   /// Similar to std::any_of, but it accepts functions of an
+   /// arbitrary number of arguments.
+   ///
+   /// Evaluation stops at the first \a true result.  Only the first
+   /// range is bounded, by \a end0.
+   ///
+   template<typename F, typename It0, typename... Its>
+   bool
+   any_of(F op, It0 it0, It0 end0, Its... its) {
+      // The helper advances the iterators through the references
+      // captured by std::tie.
+      while (it0 != end0)
+         if (__iter_helper<bool, __empty, sizeof...(Its)>::step(
+                op, {}, std::tie(it0, its...)))
+            return true;
+
+      return false;
+   }
+
+   ///
+   /// Return a copy of the first member of the pair \a ent.
+   ///
+   template<typename T, typename S>
+   T
+   keys(const std::pair<T, S> &ent) {
+      return std::get<0>(ent);
+   }
+
+   ///
+   /// Return a predicate that is true for the pairs whose first member
+   /// compares equal to \a x.  The predicate holds its own copy of
+   /// \a x.
+   ///
+   template<typename T, typename S>
+   std::function<bool (const std::pair<T, S> &)>
+   key_equals(const T &x) {
+      typedef std::function<bool (const std::pair<T, S> &)> predicate;
+
+      return predicate([x](const std::pair<T, S> &ent) -> bool {
+            return ent.first == x;
+         });
+   }
+
+   ///
+   /// Return a copy of the second member of the pair \a ent.
+   ///
+   template<typename T, typename S>
+   S
+   values(const std::pair<T, S> &ent) {
+      return std::get<1>(ent);
+   }
+
+   ///
+   /// Return a predicate that is true for the values that compare
+   /// equal to zero.
+   ///
+   template<typename T>
+   std::function<bool (const T &)>
+   is_zero() {
+      typedef std::function<bool (const T &)> predicate;
+
+      return predicate([](const T &x) -> bool {
+            return x == 0;
+         });
+   }
+}
+
+#endif
diff --git a/src/gallium/state_trackers/clover/core/compat.hpp b/src/gallium/state_trackers/clover/core/compat.hpp
new file mode 100644
index 00000000000..c0057af3258
--- /dev/null
+++ b/src/gallium/state_trackers/clover/core/compat.hpp
@@ -0,0 +1,290 @@
+//
+// Copyright 2012 Francisco Jerez
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+// OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+
+#ifndef __CORE_COMPAT_HPP__
+#define __CORE_COMPAT_HPP__
+
+#include <new>
+#include <cstring>
+#include <cstdlib>
+#include <string>
+#include <stdint.h>
+
+
+namespace clover {
+   namespace compat {
+      // XXX - For cases where we can't rely on STL...  I.e. the
+      //       interface between code compiled as C++98 and C++11
+      //       source.  Get rid of this as soon as everything can be
+      //       compiled as C++11.
+
+      ///
+      /// Minimal stand-in for std::vector backed by std::malloc().
+      ///
+      /// There is no capacity separate from the size: reserve() grows
+      /// size() itself and leaves the added elements unconstructed,
+      /// which resize() and push_back() then take care of.  The
+      /// vector never shrinks.
+      ///
+      template<typename T>
+      class vector {
+      protected:
+         /// Allocate raw storage for \a n elements and copy-construct
+         /// the first \a m of them from \a q.  Throws std::bad_alloc
+         /// if the allocation fails.
+         static T *
+         alloc(int n, const T *q, int m) {
+            T *p = reinterpret_cast<T *>(std::malloc(n * sizeof(T)));
+
+            // malloc(0) may legitimately return NULL, so NULL is only
+            // a failure if storage was actually requested.
+            if (!p && n > 0)
+               throw std::bad_alloc();
+
+            for (int i = 0; i < m; ++i)
+               new(&p[i]) T(q[i]);
+
+            return p;
+         }
+
+         /// Destroy the \a n elements at \a p and release the storage.
+         static void
+         free(int n, T *p) {
+            for (int i = 0; i < n; ++i)
+               p[i].~T();
+
+            std::free(p);
+         }
+
+      public:
+         vector() : p(NULL), n(0) {
+         }
+
+         vector(const vector &v) : p(alloc(v.n, v.p, v.n)), n(v.n) {
+         }
+
+         /// Copy the \a n elements starting at \a p.
+         vector(T *p, size_t n) : p(alloc(n, p, n)), n(n) {
+         }
+
+         /// Copy the contents of a container that stores its elements
+         /// contiguously and provides begin() and size().
+         template<typename C>
+         vector(const C &v) :
+            p(alloc(v.size(), &*v.begin(), v.size())), n(v.size()) {
+         }
+
+         ~vector() {
+            free(n, p);
+         }
+
+         vector &
+         operator=(const vector &v) {
+            // Make the copy before releasing the old storage, so that
+            // self-assignment does not read from freed memory.
+            T *q = alloc(v.n, v.p, v.n);
+
+            free(n, p);
+            p = q;
+            n = v.n;
+
+            return *this;
+         }
+
+         /// Grow the vector to \a m elements if it is smaller than
+         /// that.  The added elements are left unconstructed.
+         void
+         reserve(size_t m) {
+            if (n < m) {
+               T *q = alloc(m, p, n);
+               free(n, p);
+
+               p = q;
+               n = m;
+            }
+         }
+
+         /// Grow the vector to \a m elements, initializing the added
+         /// ones as copies of \a x.  Does nothing if \a m is not
+         /// greater than the current size.
+         void
+         resize(size_t m, T x = T()) {
+            size_t n = size();
+
+            reserve(m);
+
+            for (size_t i = n; i < m; ++i)
+               new(&p[i]) T(x);
+         }
+
+         /// Append a copy of \a x.
+         void
+         push_back(const T &x) {
+            size_t n = size();
+            reserve(n + 1);
+            new(&p[n]) T(x);
+         }
+
+         size_t
+         size() const {
+            return n;
+         }
+
+         T *
+         begin() {
+            return p;
+         }
+
+         const T *
+         begin() const {
+            return p;
+         }
+
+         T *
+         end() {
+            return p + n;
+         }
+
+         const T *
+         end() const {
+            return p + n;
+         }
+
+         /// Unchecked element access.
+         T &
+         operator[](int i) {
+            return p[i];
+         }
+
+         const T &
+         operator[](int i) const {
+            return p[i];
+         }
+
+      private:
+         T *p;
+         size_t n;
+      };
+
+      ///
+      /// Non-owning view of \a n contiguous elements starting at \a p.
+      ///
+      /// Nothing is copied or freed: the view is only usable for as
+      /// long as the elements it was constructed from are.
+      ///
+      template<typename T>
+      class vector_ref {
+      public:
+         vector_ref(T *p, size_t n) : p(p), n(n) {
+         }
+
+         /// View the elements of a container that stores them
+         /// contiguously and provides begin() and size().
+         template<typename C>
+         vector_ref(C &v) : p(&*v.begin()), n(v.size()) {
+         }
+
+         size_t size() const { return n; }
+
+         T *begin() { return p; }
+         const T *begin() const { return p; }
+
+         T *end() { return begin() + size(); }
+         const T *end() const { return begin() + size(); }
+
+         /// Unchecked element access.
+         T &operator[](int i) { return *(p + i); }
+         const T &operator[](int i) const { return *(p + i); }
+
+      private:
+         T *p;
+         size_t n;
+      };
+
+      ///
+      /// Minimal input stream that reads raw bytes sequentially out
+      /// of a memory buffer.
+      ///
+      /// Only a reference to the buffer_t object passed to the
+      /// constructor is kept, so that object has to outlive the
+      /// stream.
+      ///
+      class istream {
+      public:
+         typedef vector_ref<const unsigned char> buffer_t;
+
+         /// Thrown by read() when fewer bytes than requested are left.
+         class error {
+         public:
+            virtual ~error() {}
+         };
+
+         istream(const buffer_t &buf) : buf(buf), offset(0) {}
+
+         /// Copy the next \a n bytes of the buffer to \a p and advance
+         /// the read position past them.  Throws istream::error,
+         /// leaving the position where it was, if fewer than \a n
+         /// bytes are left.
+         void
+         read(char *p, size_t n) {
+            // Compare against the bytes that are left instead of
+            // computing "offset + n", which wraps around for a huge
+            // \a n and would let an out-of-bounds copy through.  The
+            // subtraction is safe because offset only ever advances
+            // after passing this check.
+            if (n > buf.size() - offset)
+               throw error();
+
+            std::memcpy(p, buf.begin() + offset, n);
+            offset += n;
+         }
+
+      private:
+         const buffer_t &buf;
+         size_t offset;
+      };
+
+      ///
+      /// Minimal output stream that appends raw bytes to a
+      /// compat::vector, which it references rather than owns.
+      ///
+      /// Writing starts at the size the buffer has when the stream is
+      /// constructed.
+      ///
+      class ostream {
+      public:
+         typedef vector<unsigned char> buffer_t;
+
+         ostream(buffer_t &buf) : buf(buf), offset(buf.size()) {}
+
+         /// Grow the buffer as needed and copy the \a n bytes at \a p
+         /// to the current write position, advancing it past them.
+         void
+         write(const char *p, size_t n) {
+            const size_t end = offset + n;
+
+            buf.resize(end);
+            std::memcpy(buf.begin() + offset, p, n);
+            offset = end;
+         }
+
+      private:
+         buffer_t &buf;
+         size_t offset;
+      };
+
+      ///
+      /// Non-owning view of a character sequence.
+      ///
+      /// No characters are copied: the view refers to the storage of
+      /// whatever it was constructed from.  When constructed from a
+      /// C string the length is taken with strlen(), so the
+      /// terminating NUL is not part of the view.
+      ///
+      class string : public vector_ref<const char> {
+      public:
+         string(const char *p) : vector_ref(p, std::strlen(p)) {
+         }
+
+         template<typename C>
+         string(const C &v) : vector_ref(v) {
+         }
+
+         /// Copy the viewed characters into an std::string.
+         operator std::string() const {
+            return std::string(begin(), end());
+         }
+
+         /// Return a pointer to the first occurrence of \a s in this
+         /// string, or end() if there is none.
+         const char *
+         find(const string &s) const {
+            // "<=" rather than "<": the last candidate position is the
+            // one where the match ends exactly at the end of this
+            // string, i.e. i + s.size() == size().
+            for (size_t i = 0; i + s.size() <= size(); ++i) {
+               if (!std::memcmp(begin() + i, s.begin(), s.size()))
+                  return begin() + i;
+            }
+
+            return end();
+         }
+      };
+
+      ///
+      /// Two views are equal if they have the same number of elements
+      /// and the elements compare equal pairwise.
+      ///
+      template<typename T>
+      bool
+      operator==(const vector_ref<T> &a, const vector_ref<T> &b) {
+         if (a.size() != b.size())
+            return false;
+
+         const T *pa = a.begin();
+         const T *pb = b.begin();
+
+         for (; pa != a.end(); ++pa, ++pb) {
+            if (*pa != *pb)
+               return false;
+         }
+
+         return true;
+      }
+   }
+}
+
+#endif
diff --git a/src/gallium/state_trackers/clover/core/compiler.hpp b/src/gallium/state_trackers/clover/core/compiler.hpp
new file mode 100644
index 00000000000..a3998d5e2fb
--- /dev/null
+++ b/src/gallium/state_trackers/clover/core/compiler.hpp
@@ -0,0 +1,53 @@
+//
+// Copyright 2012 Francisco Jerez
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+// OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+
+#ifndef __CORE_COMPILER_HPP__
+#define __CORE_COMPILER_HPP__
+
+#include "core/compat.hpp"
+#include "core/module.hpp"
+
+namespace clover {
+   ///
+   /// Error carrying a log text.  The text passed to the constructor
+   /// is copied into storage owned by the error object.
+   ///
+   class build_error {
+   public:
+      build_error(const compat::string &log) : log(log) {
+      }
+
+      virtual ~build_error() {
+      }
+
+      /// Return a view of the stored log.  The view refers to storage
+      /// owned by this object and does not outlive it.
+      compat::string what() {
+         return log;
+      }
+
+   private:
+      compat::vector<char> log;
+   };
+
+   module compile_program_llvm(const compat::string &source,
+                               const compat::string &target);
+
+   module compile_program_tgsi(const compat::string &source,
+                               const compat::string &target);
+}
+
+#endif
diff --git a/src/gallium/state_trackers/clover/core/context.cpp b/src/gallium/state_trackers/clover/core/context.cpp
new file mode 100644
index 00000000000..6e09a1acae0
--- /dev/null
+++ b/src/gallium/state_trackers/clover/core/context.cpp
@@ -0,0 +1,37 @@
+//
+// Copyright 2012 Francisco Jerez
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+// OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+
+#include <algorithm>
+
+#include "core/context.hpp"
+
+using namespace clover;
+
+///
+/// Create a context holding copies of the property list \a props and
+/// of the device list \a devs.
+///
+_cl_context::_cl_context(const std::vector<cl_context_properties> &props,
+                         const std::vector<device *> &devs) :
+   devs(devs), __props(props) {
+}
+
+///
+/// Return true if \a dev is one of the devices of this context.
+///
+bool
+_cl_context::has_device(clover::device *dev) const {
+   return std::find(devs.begin(), devs.end(), dev) != devs.end();
+}
diff --git a/src/gallium/state_trackers/clover/core/context.hpp b/src/gallium/state_trackers/clover/core/context.hpp
new file mode 100644
index 00000000000..d783fb6b14b
--- /dev/null
+++ b/src/gallium/state_trackers/clover/core/context.hpp
@@ -0,0 +1,51 @@
+//
+// Copyright 2012 Francisco Jerez
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+// OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+
+#ifndef __CORE_CONTEXT_HPP__
+#define __CORE_CONTEXT_HPP__
+
+#include "core/base.hpp"
+#include "core/device.hpp"
+
+namespace clover {
+   typedef struct _cl_context context;
+}
+
+///
+/// Reference-counted object that ties a list of devices to the
+/// property list it was created with.  Contexts cannot be copied.
+///
+struct _cl_context : public clover::ref_counter {
+public:
+   _cl_context(const std::vector<cl_context_properties> &props,
+               const std::vector<clover::device *> &devs);
+   _cl_context(const _cl_context &ctx) = delete;
+
+   bool has_device(clover::device *dev) const;
+
+   /// Property list passed to the constructor.
+   const std::vector<cl_context_properties> &props() const {
+      return __props;
+   }
+
+   /// Devices passed to the constructor; fixed for the lifetime of
+   /// the context.
+   const std::vector<clover::device *> devs;
+
+private:
+   std::vector<cl_context_properties> __props;
+};
+
+#endif
diff --git a/src/gallium/state_trackers/clover/core/device.cpp b/src/gallium/state_trackers/clover/core/device.cpp
new file mode 100644
index 00000000000..8390f3f4abb
--- /dev/null
+++ b/src/gallium/state_trackers/clover/core/device.cpp
@@ -0,0 +1,179 @@
+//
+// Copyright 2012 Francisco Jerez
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+// OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+
+#include "core/device.hpp"
+#include "pipe/p_screen.h"
+#include "pipe/p_state.h"
+
+using namespace clover;
+
+namespace {
+   ///
+   /// Query the compute capability \a cap of \a pipe and return it as
+   /// a vector of \a T.
+   ///
+   /// The driver hook is called with a NULL output pointer first to
+   /// learn the size of the result in bytes, and then a second time
+   /// with storage of that size.  The result is empty if the driver
+   /// reports a size that does not fit a single \a T.
+   ///
+   template<typename T>
+   std::vector<T>
+   get_compute_param(pipe_screen *pipe, pipe_compute_cap cap) {
+      int sz = pipe->get_compute_param(pipe, cap, NULL);
+
+      // A negative size would turn into a huge element count once
+      // converted to size_t, treat it like an empty result.
+      std::vector<T> v(sz > 0 ? sz / sizeof(T) : 0);
+
+      // Nothing to fetch into an empty vector, and taking the address
+      // of its front() element would be undefined.
+      if (!v.empty())
+         pipe->get_compute_param(pipe, cap, v.data());
+
+      return v;
+   }
+}
+
+///
+/// Create a pipe screen for the loader device \a ldev and check that
+/// it supports compute.
+///
+/// Throws clover::error(CL_INVALID_DEVICE) if no screen could be
+/// created or if it lacks PIPE_CAP_COMPUTE.  \a ldev itself is not
+/// released in that case.
+///
+_cl_device_id::_cl_device_id(pipe_loader_device *ldev) : ldev(ldev) {
+   pipe = pipe_loader_create_screen(ldev, PIPE_SEARCH_DIR);
+   if (!pipe || !pipe->get_param(pipe, PIPE_CAP_COMPUTE)) {
+      // The destructor does not run for an object whose constructor
+      // throws, so the screen has to be destroyed here or it leaks.
+      if (pipe)
+         pipe->destroy(pipe);
+
+      throw error(CL_INVALID_DEVICE);
+   }
+}
+
+///
+/// Take over the screen and the loader device of \a dev.
+///
+_cl_device_id::_cl_device_id(_cl_device_id &&dev) :
+   pipe(dev.pipe), ldev(dev.ldev) {
+   // Leave the source empty so that its destructor releases nothing.
+   dev.pipe = NULL;
+   dev.ldev = NULL;
+}
+
+///
+/// Destroy the screen and release the loader device, skipping
+/// whichever of the two is NULL (e.g. after having been moved from).
+///
+_cl_device_id::~_cl_device_id() {
+   if (pipe)
+      pipe->destroy(pipe);
+   if (ldev)
+      pipe_loader_release(&ldev, 1);
+}
+
+///
+/// Map the kind of loader device to an OpenCL device type: software
+/// devices are reported as CPUs and PCI devices as GPUs.
+///
+cl_device_type
+_cl_device_id::type() const {
+   if (ldev->type == PIPE_LOADER_DEVICE_SOFTWARE)
+      return CL_DEVICE_TYPE_CPU;
+
+   if (ldev->type == PIPE_LOADER_DEVICE_PCI)
+      return CL_DEVICE_TYPE_GPU;
+
+   // Loader device kind this function doesn't know about.
+   assert(0);
+   return 0;
+}
+
+///
+/// Return the PCI vendor ID for PCI devices and zero for software
+/// devices.
+///
+cl_uint
+_cl_device_id::vendor_id() const {
+   if (ldev->type == PIPE_LOADER_DEVICE_SOFTWARE)
+      return 0;
+
+   if (ldev->type == PIPE_LOADER_DEVICE_PCI)
+      return ldev->pci.vendor_id;
+
+   // Loader device kind this function doesn't know about.
+   assert(0);
+   return 0;
+}
+
+/// Reported as the compile-time limit PIPE_MAX_SHADER_RESOURCES; the
+/// driver is not queried.
+size_t
+_cl_device_id::max_images_read() const {
+   return PIPE_MAX_SHADER_RESOURCES;
+}
+
+/// Reported as the compile-time limit PIPE_MAX_SHADER_RESOURCES; the
+/// driver is not queried.
+size_t
+_cl_device_id::max_images_write() const {
+   return PIPE_MAX_SHADER_RESOURCES;
+}
+
+/// Value of the screen's PIPE_CAP_MAX_TEXTURE_2D_LEVELS parameter.
+cl_uint
+_cl_device_id::max_image_levels_2d() const {
+   return pipe->get_param(pipe, PIPE_CAP_MAX_TEXTURE_2D_LEVELS);
+}
+
+/// Value of the screen's PIPE_CAP_MAX_TEXTURE_3D_LEVELS parameter.
+cl_uint
+_cl_device_id::max_image_levels_3d() const {
+   return pipe->get_param(pipe, PIPE_CAP_MAX_TEXTURE_3D_LEVELS);
+}
+
+/// Value of PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS for the compute
+/// shader stage.
+cl_uint
+_cl_device_id::max_samplers() const {
+   return pipe->get_shader_param(pipe, PIPE_SHADER_COMPUTE,
+                                 PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS);
+}
+
+/// First element of the PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE compute
+/// parameter.  Relies on the driver returning at least one element.
+cl_ulong
+_cl_device_id::max_mem_global() const {
+   return get_compute_param<uint64_t>(pipe,
+                                      PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE)[0];
+}
+
+/// First element of the PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE compute
+/// parameter.  Relies on the driver returning at least one element.
+cl_ulong
+_cl_device_id::max_mem_local() const {
+   return get_compute_param<uint64_t>(pipe,
+                                      PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE)[0];
+}
+
+/// First element of the PIPE_COMPUTE_CAP_MAX_INPUT_SIZE compute
+/// parameter.  Relies on the driver returning at least one element.
+cl_ulong
+_cl_device_id::max_mem_input() const {
+   return get_compute_param<uint64_t>(pipe,
+                                      PIPE_COMPUTE_CAP_MAX_INPUT_SIZE)[0];
+}
+
+/// PIPE_SHADER_CAP_MAX_CONSTS for the compute shader stage, scaled by
+/// 16.
+// NOTE(review): the factor presumably converts a count of 16-byte
+// constants into bytes -- confirm against the gallium documentation.
+cl_ulong
+_cl_device_id::max_const_buffer_size() const {
+   return pipe->get_shader_param(pipe, PIPE_SHADER_COMPUTE,
+                                 PIPE_SHADER_CAP_MAX_CONSTS) * 16;
+}
+
+/// Value of PIPE_SHADER_CAP_MAX_CONST_BUFFERS for the compute shader
+/// stage.
+cl_uint
+_cl_device_id::max_const_buffers() const {
+   return pipe->get_shader_param(pipe, PIPE_SHADER_COMPUTE,
+                                 PIPE_SHADER_CAP_MAX_CONST_BUFFERS);
+}
+
+///
+/// Return the elements of the PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE compute
+/// parameter, converted to size_t.
+///
+std::vector<size_t>
+_cl_device_id::max_block_size() const {
+   // The driver reports 64-bit values.  A vector of them only converts
+   // implicitly to a vector of size_t where the two types happen to be
+   // the same, so copy element by element to build everywhere.
+   auto v = get_compute_param<uint64_t>(pipe,
+                                        PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE);
+
+   return std::vector<size_t>(v.begin(), v.end());
+}
+
+/// Name reported by the screen's get_name() hook.
+std::string
+_cl_device_id::device_name() const {
+   return pipe->get_name(pipe);
+}
+
+/// Vendor string reported by the screen's get_vendor() hook.
+std::string
+_cl_device_id::vendor_name() const {
+   return pipe->get_vendor(pipe);
+}
+
+///
+/// Name of the intermediate representation the driver prefers for
+/// compute shaders.  Only TGSI is known, reported as "tgsi".
+///
+std::string
+_cl_device_id::ir_target() const {
+   const auto ir = pipe->get_shader_param(pipe, PIPE_SHADER_COMPUTE,
+                                          PIPE_SHADER_CAP_PREFERRED_IR);
+
+   if (ir == PIPE_SHADER_IR_TGSI)
+      return "tgsi";
+
+   // Representation this function doesn't know about.
+   assert(0);
+   return "";
+}
+
+///
+/// Probe the pipe loader for devices and keep the ones a clover
+/// device can be created for.
+///
+/// The loader is asked for the number of devices first and then for
+/// the devices themselves.  Devices whose construction fails with a
+/// clover::error are released and left out.
+///
+device_registry::device_registry() {
+   int n = pipe_loader_probe(NULL, 0);
+   std::vector<pipe_loader_device *> ldevs(n);
+
+   // data() instead of &front(), which is undefined on an empty
+   // vector, i.e. when no device was found.
+   pipe_loader_probe(ldevs.data(), n);
+
+   for (pipe_loader_device *ldev : ldevs) {
+      try {
+         devs.emplace_back(ldev);
+      } catch (error &) {
+         // No device object was created that would release the
+         // loader device in its destructor, so do it here.
+         pipe_loader_release(&ldev, 1);
+      }
+   }
+}
diff --git a/src/gallium/state_trackers/clover/core/device.hpp b/src/gallium/state_trackers/clover/core/device.hpp
new file mode 100644
index 00000000000..8f284ba5e42
--- /dev/null
+++ b/src/gallium/state_trackers/clover/core/device.hpp
@@ -0,0 +1,107 @@
+//
+// Copyright 2012 Francisco Jerez
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+// OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+
+#ifndef __CORE_DEVICE_HPP__
+#define __CORE_DEVICE_HPP__
+
+#include <set>
+#include <vector>
+
+#include "core/base.hpp"
+#include "core/format.hpp"
+#include "pipe-loader/pipe_loader.h"
+
+namespace clover {
+   typedef struct _cl_device_id device;
+   class root_resource;
+   class hard_event;
+}
+
+///
+/// A compute device: a pipe loader device together with the pipe
+/// screen created for it.  Devices can be moved but not copied.
+///
+struct _cl_device_id {
+public:
+   _cl_device_id(pipe_loader_device *ldev);
+   _cl_device_id(_cl_device_id &&dev);
+   _cl_device_id(const _cl_device_id &dev) = delete;
+   ~_cl_device_id();
+
+   // Device properties.
+   cl_device_type type() const;
+   cl_uint vendor_id() const;
+   size_t max_images_read() const;
+   size_t max_images_write() const;
+   cl_uint max_image_levels_2d() const;
+   cl_uint max_image_levels_3d() const;
+   cl_uint max_samplers() const;
+   cl_ulong max_mem_global() const;
+   cl_ulong max_mem_local() const;
+   cl_ulong max_mem_input() const;
+   cl_ulong max_const_buffer_size() const;
+   cl_uint max_const_buffers() const;
+
+   std::vector<size_t> max_block_size() const;
+   std::string device_name() const;
+   std::string vendor_name() const;
+   std::string ir_target() const;
+
+   // Code that is granted access to the private screen and loader
+   // device below.
+   friend struct _cl_command_queue;
+   friend class clover::root_resource;
+   friend class clover::hard_event;
+   friend std::set<cl_image_format>
+   clover::supported_formats(cl_context, cl_mem_object_type);
+
+private:
+   pipe_screen *pipe;
+   pipe_loader_device *ldev;
+};
+
+namespace clover {
+   ///
+   /// Container of all the compute devices that are available in the
+   /// system.
+   ///
+   /// The accessors forward to the underlying std::vector; front()
+   /// and back() therefore require the registry to be non-empty.
+   ///
+   class device_registry {
+   public:
+      typedef std::vector<device>::iterator iterator;
+
+      device_registry();
+
+      iterator begin() {
+         return devs.begin();
+      }
+
+      iterator end() {
+         return devs.end();
+      }
+
+      device &front() {
+         return devs.front();
+      }
+
+      device &back() {
+         return devs.back();
+      }
+
+   protected:
+      std::vector<device> devs;
+   };
+}
+
+#endif
diff --git a/src/gallium/state_trackers/clover/core/event.cpp b/src/gallium/state_trackers/clover/core/event.cpp
new file mode 100644
index 00000000000..aa287e9a0c9
--- /dev/null
+++ b/src/gallium/state_trackers/clover/core/event.cpp
@@ -0,0 +1,175 @@
+//
+// Copyright 2012 Francisco Jerez
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+// OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+
+#include "core/event.hpp"
+#include "pipe/p_screen.h"
+
+using namespace clover;
+
+///
+/// Create an event in context \a ctx that depends on the events in
+/// \a deps.
+///
+/// The wait count starts at one, and the new event is chained to
+/// each of its dependencies.
+///
+_cl_event::_cl_event(clover::context &ctx,
+                     std::vector<clover::event *> deps,
+                     action action_ok, action action_fail) :
+   ctx(ctx), __status(0), wait_count(1),
+   action_ok(action_ok), action_fail(action_fail) {
+   for (auto it = deps.begin(); it != deps.end(); ++it)
+      (*it)->chain(this);
+}
+
+// Nothing to do beyond destroying the members.
+_cl_event::~_cl_event() {
+}
+
+///
+/// Decrement the wait count.  Once it reaches zero, run \a action_ok
+/// and then trigger every chained event, most recently chained
+/// first, removing each from the chain.
+///
+void
+_cl_event::trigger() {
+   if (!--wait_count) {
+      action_ok(*this);
+
+      while (!__chain.empty()) {
+         __chain.back()->trigger();
+         __chain.pop_back();
+      }
+   }
+}
+
+///
+/// Record \a status as the status of this event, run \a action_fail
+/// and then abort every chained event with the same status, most
+/// recently chained first, removing each from the chain.
+///
+void
+_cl_event::abort(cl_int status) {
+   __status = status;
+   action_fail(*this);
+
+   while (!__chain.empty()) {
+      __chain.back()->abort(status);
+      __chain.pop_back();
+   }
+}
+
+/// True once the wait count has dropped to zero.
+bool
+_cl_event::signalled() const {
+   return !wait_count;
+}
+
+///
+/// Make \a ev depend on this event, unless this event has been
+/// signalled already, in which case nothing happens.
+///
+/// The wait count of \a ev is incremented, \a ev is queued to be
+/// triggered or aborted along with this event, and this event is
+/// added to the dependency list of \a ev.
+///
+void
+_cl_event::chain(clover::event *ev) {
+   if (wait_count) {
+      ev->wait_count++;
+      __chain.push_back(ev);
+      ev->deps.push_back(this);
+   }
+}
+
+///
+/// Create an event for \a command on queue \a q that runs \a action
+/// once it is triggered and does nothing when it is aborted.
+///
+/// The event is handed to \a q.sequence() and then triggered once,
+/// which gives up the initial wait count set by the base class.
+///
+hard_event::hard_event(clover::command_queue &q, cl_command_type command,
+                       std::vector<clover::event *> deps, action action) :
+   _cl_event(q.ctx, deps, action, [](event &ev){}),
+   __queue(q), __command(command), __fence(NULL) {
+   q.sequence(this);
+   trigger();
+}
+
+/// Drop the reference held on the fence, if any, by pointing
+/// \a __fence back at NULL through the screen's fence_reference() hook.
+hard_event::~hard_event() {
+   pipe_screen *screen = queue()->dev.pipe;
+   screen->fence_reference(screen, &__fence, NULL);
+}
+
+///
+/// Execution status of the event: the recorded error status if it is
+/// negative, CL_QUEUED as long as no fence has been set,
+/// CL_SUBMITTED while the fence hasn't been signalled, and
+/// CL_COMPLETE after that.
+///
+cl_int
+hard_event::status() const {
+   if (__status < 0)
+      return __status;
+
+   if (!__fence)
+      return CL_QUEUED;
+
+   pipe_screen *screen = queue()->dev.pipe;
+
+   if (!screen->fence_signalled(screen, __fence))
+      return CL_SUBMITTED;
+
+   return CL_COMPLETE;
+}
+
+/// The command queue this event was created on.
+cl_command_queue
+hard_event::queue() const {
+   return &__queue;
+}
+
+/// The command type passed to the constructor.
+cl_command_type
+hard_event::command() const {
+   return __command;
+}
+
+///
+/// Block until the fence of this event has finished.
+///
+/// The queue is flushed first if the event is still CL_QUEUED.
+/// Throws clover::error(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST)
+/// if there is still no fence afterwards or if waiting on it fails.
+///
+void
+hard_event::wait() const {
+   pipe_screen *screen = queue()->dev.pipe;
+
+   if (status() == CL_QUEUED)
+      queue()->flush();
+
+   if (!__fence ||
+       !screen->fence_finish(screen, __fence, PIPE_TIMEOUT_INFINITE))
+      throw error(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
+}
+
+/// Make \a fence the fence of this event, through the screen's
+/// fence_reference() hook.
+void
+hard_event::fence(pipe_fence_handle *fence) {
+   pipe_screen *screen = queue()->dev.pipe;
+   screen->fence_reference(screen, &__fence, fence);
+}
+
+///
+/// Create an event in context \a ctx that runs \a action both when
+/// it is triggered and when it is aborted.
+///
+/// If \a __trigger is true the event is triggered right away, which
+/// gives up the initial wait count set by the base class.
+///
+soft_event::soft_event(clover::context &ctx,
+                       std::vector<clover::event *> deps,
+                       bool __trigger, action action) :
+   _cl_event(ctx, deps, action, action) {
+   if (__trigger)
+      trigger();
+}
+
+///
+/// Execution status of the event: the recorded error status if it is
+/// negative, CL_SUBMITTED while the event hasn't been signalled or
+/// any of its dependencies is not CL_COMPLETE, and CL_COMPLETE
+/// otherwise.
+///
+cl_int
+soft_event::status() const {
+   if (__status < 0)
+      return __status;
+
+   const bool pending =
+      !signalled() ||
+      any_of([](const ref_ptr<event> &ev) {
+            return ev->status() != CL_COMPLETE;
+         }, deps.begin(), deps.end());
+
+   if (pending)
+      return CL_SUBMITTED;
+
+   return CL_COMPLETE;
+}
+
+/// Always NULL: this kind of event reports no command queue.
+cl_command_queue
+soft_event::queue() const {
+   return NULL;
+}
+
+/// Always CL_COMMAND_USER.
+cl_command_type
+soft_event::command() const {
+   return CL_COMMAND_USER;
+}
+
+///
+/// Wait for every dependency of this event in turn.
+///
+/// Throws clover::error(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST)
+/// if the event is not CL_COMPLETE afterwards.
+///
+void
+soft_event::wait() const {
+   // Iterate by reference: copying an element is not needed to wait
+   // on it, and for smart pointer elements a copy would touch the
+   // reference count of the dependency.
+   for (const auto &ev : deps)
+      ev->wait();
+
+   if (status() != CL_COMPLETE)
+      throw error(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
+}
diff --git a/src/gallium/state_trackers/clover/core/event.hpp b/src/gallium/state_trackers/clover/core/event.hpp
new file mode 100644
index 00000000000..ea4ac4ae43c
--- /dev/null
+++ b/src/gallium/state_trackers/clover/core/event.hpp
@@ -0,0 +1,138 @@
+//
+// Copyright 2012 Francisco Jerez
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+// OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+
+#ifndef __CORE_EVENT_HPP__
+#define __CORE_EVENT_HPP__
+
+#include <functional>
+
+#include "core/base.hpp"
+#include "core/queue.hpp"
+
+namespace clover {
+   typedef struct _cl_event event;
+}
+
+///
+/// Class that represents a task that might be executed asynchronously
+/// at some point in the future.
+///
+/// An event consists of a list of dependencies, a boolean signalled()
+/// flag, and an associated task.  An event is considered signalled as
+/// soon as all its dependencies (if any) are signalled as well, and
+/// the trigger() method is called; at that point the associated task
+/// will be started through the specified \a action_ok.  If the
+/// abort() method is called instead, the specified \a action_fail is
+/// executed and the associated task will never be started.  Dependent
+/// events will be aborted recursively.
+///
+/// The execution status of the associated task can be queried using
+/// the status() method, and it can be waited for completion using the
+/// wait() method.
+///
+struct _cl_event : public clover::ref_counter {
+public:
+   typedef std::function<void (clover::event &)> action;
+
+   _cl_event(clover::context &ctx, std::vector<clover::event *> deps,
+             action action_ok, action action_fail);
+   virtual ~_cl_event();
+
+   void trigger();
+   void abort(cl_int status);
+   bool signalled() const;
+
+   virtual cl_int status() const = 0;
+   virtual cl_command_queue queue() const = 0;
+   virtual cl_command_type command() const = 0;
+   virtual void wait() const = 0;
+
+   clover::context &ctx;
+
+protected:
+   void chain(clover::event *ev);
+
+   cl_int __status;
+   std::vector<clover::ref_ptr<clover::event>> deps;
+
+private:
+   unsigned wait_count;
+   action action_ok;
+   action action_fail;
+   std::vector<clover::ref_ptr<clover::event>> __chain;
+};
+
+namespace clover {
+   ///
+   /// Class that represents a task executed by a command queue.
+   ///
+   /// Similar to a normal clover::event.  In addition it's associated
+   /// with a given command queue \a q and a given OpenCL \a command.
+   /// hard_event instances created for the same queue are implicitly
+   /// ordered with respect to each other, and they are implicitly
+   /// triggered on construction.
+   ///
+   /// A hard_event is considered complete when the associated
+   /// hardware task finishes execution.
+   ///
+   class hard_event : public event {
+   public:
+      hard_event(clover::command_queue &q, cl_command_type command,
+                 std::vector<clover::event *> deps,
+                 action action = [](event &){});
+      ~hard_event();
+
+      virtual cl_int status() const;
+      virtual cl_command_queue queue() const;
+      virtual cl_command_type command() const;
+      virtual void wait() const;
+
+      friend class ::_cl_command_queue;
+
+   private:
+      virtual void fence(pipe_fence_handle *fence);
+
+      clover::command_queue &__queue;
+      cl_command_type __command;
+      pipe_fence_handle *__fence;
+   };
+
+   ///
+   /// Class that represents a software event.
+   ///
+   /// A soft_event is not associated with any specific hardware task
+   /// or command queue.  It's considered complete as soon as all its
+   /// dependencies finish execution.
+   ///
+   class soft_event : public event {
+   public:
+      soft_event(clover::context &ctx, std::vector<clover::event *> deps,
+                 bool trigger, action action = [](event &){});
+
+      virtual cl_int status() const;
+      virtual cl_command_queue queue() const;
+      virtual cl_command_type command() const;
+      virtual void wait() const;
+   };
+}
+
+#endif
diff --git a/src/gallium/state_trackers/clover/core/format.cpp b/src/gallium/state_trackers/clover/core/format.cpp
new file mode 100644
index 00000000000..8f6e14d6567
--- /dev/null
+++ b/src/gallium/state_trackers/clover/core/format.cpp
@@ -0,0 +1,167 @@
+//
+// Copyright 2012 Francisco Jerez
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+// OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+
+#include <algorithm>
+
+#include "core/format.hpp"
+#include "core/memory.hpp"
+#include "pipe/p_screen.h"
+#include "pipe/p_context.h"
+
+namespace clover {
+   static const std::map<cl_image_format, pipe_format> formats {
+      { { CL_BGRA, CL_UNORM_INT8 }, PIPE_FORMAT_B8G8R8A8_UNORM },
+      { { CL_ARGB, CL_UNORM_INT8 }, PIPE_FORMAT_A8R8G8B8_UNORM },
+      { { CL_RGB, CL_UNORM_SHORT_565 }, PIPE_FORMAT_B5G6R5_UNORM },
+      { { CL_LUMINANCE, CL_UNORM_INT8 }, PIPE_FORMAT_L8_UNORM },
+      { { CL_A, CL_UNORM_INT8 }, PIPE_FORMAT_A8_UNORM },
+      { { CL_INTENSITY, CL_UNORM_INT8 }, PIPE_FORMAT_I8_UNORM },
+      { { CL_LUMINANCE, CL_UNORM_INT16 }, PIPE_FORMAT_L16_UNORM },
+      { { CL_R, CL_FLOAT }, PIPE_FORMAT_R32_FLOAT },
+      { { CL_RG, CL_FLOAT }, PIPE_FORMAT_R32G32_FLOAT },
+      { { CL_RGB, CL_FLOAT }, PIPE_FORMAT_R32G32B32_FLOAT },
+      { { CL_RGBA, CL_FLOAT }, PIPE_FORMAT_R32G32B32A32_FLOAT },
+      { { CL_R, CL_UNORM_INT16 }, PIPE_FORMAT_R16_UNORM },
+      { { CL_RG, CL_UNORM_INT16 }, PIPE_FORMAT_R16G16_UNORM },
+      { { CL_RGB, CL_UNORM_INT16 }, PIPE_FORMAT_R16G16B16_UNORM },
+      { { CL_RGBA, CL_UNORM_INT16 }, PIPE_FORMAT_R16G16B16A16_UNORM },
+      { { CL_R, CL_SNORM_INT16 }, PIPE_FORMAT_R16_SNORM },
+      { { CL_RG, CL_SNORM_INT16 }, PIPE_FORMAT_R16G16_SNORM },
+      { { CL_RGB, CL_SNORM_INT16 }, PIPE_FORMAT_R16G16B16_SNORM },
+      { { CL_RGBA, CL_SNORM_INT16 }, PIPE_FORMAT_R16G16B16A16_SNORM },
+      { { CL_R, CL_UNORM_INT8 }, PIPE_FORMAT_R8_UNORM },
+      { { CL_RG, CL_UNORM_INT8 }, PIPE_FORMAT_R8G8_UNORM },
+      { { CL_RGB, CL_UNORM_INT8 }, PIPE_FORMAT_R8G8B8_UNORM },
+      { { CL_RGBA, CL_UNORM_INT8 }, PIPE_FORMAT_R8G8B8A8_UNORM },
+      { { CL_R, CL_SNORM_INT8 }, PIPE_FORMAT_R8_SNORM },
+      { { CL_RG, CL_SNORM_INT8 }, PIPE_FORMAT_R8G8_SNORM },
+      { { CL_RGB, CL_SNORM_INT8 }, PIPE_FORMAT_R8G8B8_SNORM },
+      { { CL_RGBA, CL_SNORM_INT8 }, PIPE_FORMAT_R8G8B8A8_SNORM },
+      { { CL_R, CL_HALF_FLOAT }, PIPE_FORMAT_R16_FLOAT },
+      { { CL_RG, CL_HALF_FLOAT }, PIPE_FORMAT_R16G16_FLOAT },
+      { { CL_RGB, CL_HALF_FLOAT }, PIPE_FORMAT_R16G16B16_FLOAT },
+      { { CL_RGBA, CL_HALF_FLOAT }, PIPE_FORMAT_R16G16B16A16_FLOAT },
+      { { CL_RGBx, CL_UNORM_SHORT_555 }, PIPE_FORMAT_B5G5R5X1_UNORM },
+      { { CL_RGBx, CL_UNORM_INT8 }, PIPE_FORMAT_R8G8B8X8_UNORM },
+      { { CL_A, CL_UNORM_INT16 }, PIPE_FORMAT_A16_UNORM },
+      { { CL_INTENSITY, CL_UNORM_INT16 }, PIPE_FORMAT_I16_UNORM },
+      { { CL_LUMINANCE, CL_SNORM_INT8 }, PIPE_FORMAT_L8_SNORM },
+      { { CL_INTENSITY, CL_SNORM_INT8 }, PIPE_FORMAT_I8_SNORM },
+      { { CL_A, CL_SNORM_INT16 }, PIPE_FORMAT_A16_SNORM },
+      { { CL_LUMINANCE, CL_SNORM_INT16 }, PIPE_FORMAT_L16_SNORM },
+      { { CL_INTENSITY, CL_SNORM_INT16 }, PIPE_FORMAT_I16_SNORM },
+      { { CL_A, CL_HALF_FLOAT }, PIPE_FORMAT_A16_FLOAT },
+      { { CL_LUMINANCE, CL_HALF_FLOAT }, PIPE_FORMAT_L16_FLOAT },
+      { { CL_INTENSITY, CL_HALF_FLOAT }, PIPE_FORMAT_I16_FLOAT },
+      { { CL_A, CL_FLOAT }, PIPE_FORMAT_A32_FLOAT },
+      { { CL_LUMINANCE, CL_FLOAT }, PIPE_FORMAT_L32_FLOAT },
+      { { CL_INTENSITY, CL_FLOAT }, PIPE_FORMAT_I32_FLOAT },
+      { { CL_RA, CL_UNORM_INT8 }, PIPE_FORMAT_R8A8_UNORM },
+      { { CL_R, CL_UNSIGNED_INT8 }, PIPE_FORMAT_R8_UINT },
+      { { CL_RG, CL_UNSIGNED_INT8 }, PIPE_FORMAT_R8G8_UINT },
+      { { CL_RGB, CL_UNSIGNED_INT8 }, PIPE_FORMAT_R8G8B8_UINT },
+      { { CL_RGBA, CL_UNSIGNED_INT8 }, PIPE_FORMAT_R8G8B8A8_UINT },
+      { { CL_R, CL_SIGNED_INT8 }, PIPE_FORMAT_R8_SINT },
+      { { CL_RG, CL_SIGNED_INT8 }, PIPE_FORMAT_R8G8_SINT },
+      { { CL_RGB, CL_SIGNED_INT8 }, PIPE_FORMAT_R8G8B8_SINT },
+      { { CL_RGBA, CL_SIGNED_INT8 }, PIPE_FORMAT_R8G8B8A8_SINT },
+      { { CL_R, CL_UNSIGNED_INT16 }, PIPE_FORMAT_R16_UINT },
+      { { CL_RG, CL_UNSIGNED_INT16 }, PIPE_FORMAT_R16G16_UINT },
+      { { CL_RGB, CL_UNSIGNED_INT16 }, PIPE_FORMAT_R16G16B16_UINT },
+      { { CL_RGBA, CL_UNSIGNED_INT16 }, PIPE_FORMAT_R16G16B16A16_UINT },
+      { { CL_R, CL_SIGNED_INT16 }, PIPE_FORMAT_R16_SINT },
+      { { CL_RG, CL_SIGNED_INT16 }, PIPE_FORMAT_R16G16_SINT },
+      { { CL_RGB, CL_SIGNED_INT16 }, PIPE_FORMAT_R16G16B16_SINT },
+      { { CL_RGBA, CL_SIGNED_INT16 }, PIPE_FORMAT_R16G16B16A16_SINT },
+      { { CL_R, CL_UNSIGNED_INT32 }, PIPE_FORMAT_R32_UINT },
+      { { CL_RG, CL_UNSIGNED_INT32 }, PIPE_FORMAT_R32G32_UINT },
+      { { CL_RGB, CL_UNSIGNED_INT32 }, PIPE_FORMAT_R32G32B32_UINT },
+      { { CL_RGBA, CL_UNSIGNED_INT32 }, PIPE_FORMAT_R32G32B32A32_UINT },
+      { { CL_R, CL_SIGNED_INT32 }, PIPE_FORMAT_R32_SINT },
+      { { CL_RG, CL_SIGNED_INT32 }, PIPE_FORMAT_R32G32_SINT },
+      { { CL_RGB, CL_SIGNED_INT32 }, PIPE_FORMAT_R32G32B32_SINT },
+      { { CL_RGBA, CL_SIGNED_INT32 }, PIPE_FORMAT_R32G32B32A32_SINT },
+      { { CL_A, CL_UNSIGNED_INT8 }, PIPE_FORMAT_A8_UINT },
+      { { CL_INTENSITY, CL_UNSIGNED_INT8 }, PIPE_FORMAT_I8_UINT },
+      { { CL_LUMINANCE, CL_UNSIGNED_INT8 }, PIPE_FORMAT_L8_UINT },
+      { { CL_A, CL_SIGNED_INT8 }, PIPE_FORMAT_A8_SINT },
+      { { CL_INTENSITY, CL_SIGNED_INT8 }, PIPE_FORMAT_I8_SINT },
+      { { CL_LUMINANCE, CL_SIGNED_INT8 }, PIPE_FORMAT_L8_SINT },
+      { { CL_A, CL_UNSIGNED_INT16 }, PIPE_FORMAT_A16_UINT },
+      { { CL_INTENSITY, CL_UNSIGNED_INT16 }, PIPE_FORMAT_I16_UINT },
+      { { CL_LUMINANCE, CL_UNSIGNED_INT16 }, PIPE_FORMAT_L16_UINT },
+      { { CL_A, CL_SIGNED_INT16 }, PIPE_FORMAT_A16_SINT },
+      { { CL_INTENSITY, CL_SIGNED_INT16 }, PIPE_FORMAT_I16_SINT },
+      { { CL_LUMINANCE, CL_SIGNED_INT16 }, PIPE_FORMAT_L16_SINT },
+      { { CL_A, CL_UNSIGNED_INT32 }, PIPE_FORMAT_A32_UINT },
+      { { CL_INTENSITY, CL_UNSIGNED_INT32 }, PIPE_FORMAT_I32_UINT },
+      { { CL_LUMINANCE, CL_UNSIGNED_INT32 }, PIPE_FORMAT_L32_UINT },
+      { { CL_A, CL_SIGNED_INT32 }, PIPE_FORMAT_A32_SINT },
+      { { CL_INTENSITY, CL_SIGNED_INT32 }, PIPE_FORMAT_I32_SINT },
+      { { CL_LUMINANCE, CL_SIGNED_INT32 }, PIPE_FORMAT_L32_SINT }
+   };
+
+   pipe_texture_target
+   translate_target(cl_mem_object_type type) { // CL memory object type -> pipe texture target
+      switch (type) {
+      case CL_MEM_OBJECT_BUFFER:
+         return PIPE_BUFFER;
+      case CL_MEM_OBJECT_IMAGE2D:
+         return PIPE_TEXTURE_2D;
+      case CL_MEM_OBJECT_IMAGE3D:
+         return PIPE_TEXTURE_3D;
+      default: // any other object type is rejected
+         throw error(CL_INVALID_VALUE);
+      }
+   }
+
+   pipe_format
+   translate_format(const cl_image_format &format) { // look the format up in the `formats` table
+      auto it = formats.find(format);
+
+      if (it == formats.end()) // no table entry for this channel order / data type pair
+         throw error(CL_IMAGE_FORMAT_NOT_SUPPORTED);
+
+      return it->second;
+   }
+
+   std::set<cl_image_format>
+   supported_formats(cl_context ctx, cl_mem_object_type type) {
+      std::set<cl_image_format> s;
+      pipe_texture_target target = translate_target(type); // throws CL_INVALID_VALUE on an unknown type
+      unsigned bindings = (PIPE_BIND_SAMPLER_VIEW |
+                           PIPE_BIND_COMPUTE_RESOURCE |
+                           PIPE_BIND_TRANSFER_READ |
+                           PIPE_BIND_TRANSFER_WRITE);
+
+      for (auto f : formats) {
+         if (std::all_of(ctx->devs.begin(), ctx->devs.end(), // keep f only if every device in ctx accepts it
+                         [=](const device *dev) {
+                            return dev->pipe->is_format_supported(
+                               dev->pipe, f.second, target, 1, bindings);
+                         }))
+            s.insert(f.first);
+      }
+
+      return s;
+   }
+}
diff --git a/src/gallium/state_trackers/clover/core/format.hpp b/src/gallium/state_trackers/clover/core/format.hpp
new file mode 100644
index 00000000000..a24cbf37621
--- /dev/null
+++ b/src/gallium/state_trackers/clover/core/format.hpp
@@ -0,0 +1,51 @@
+//
+// Copyright 2012 Francisco Jerez
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+// OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+
+#ifndef __CORE_FORMAT_HPP__
+#define __CORE_FORMAT_HPP__
+
+#include <set>
+
+#include "core/base.hpp"
+#include "pipe/p_defines.h"
+#include "pipe/p_format.h"
+
+namespace clover {
+   pipe_texture_target translate_target(cl_mem_object_type type);
+   pipe_format translate_format(const cl_image_format &format);
+
+   ///
+   /// Return all the image formats supported by a given context for
+   /// the given memory object type.
+   ///
+   std::set<cl_image_format> supported_formats(cl_context ctx,
+                                               cl_mem_object_type type);
+}
+
+static inline bool
+operator<(const cl_image_format &a, const cl_image_format &b) {
+   if (a.image_channel_order != b.image_channel_order) // channel order is the major key
+      return a.image_channel_order < b.image_channel_order;
+   return a.image_channel_data_type < b.image_channel_data_type; // data type breaks ties
+}
+
+#endif
diff --git a/src/gallium/state_trackers/clover/core/geometry.hpp b/src/gallium/state_trackers/clover/core/geometry.hpp
new file mode 100644
index 00000000000..027264e72f0
--- /dev/null
+++ b/src/gallium/state_trackers/clover/core/geometry.hpp
@@ -0,0 +1,72 @@
+//
+// Copyright 2012 Francisco Jerez
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+// OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+
+#ifndef __CORE_GEOMETRY_HPP__
+#define __CORE_GEOMETRY_HPP__
+
+#include <array>
+#include <algorithm>
+
+namespace clover {
+   ///
+   /// N-dimensional coordinate array.
+   ///
+   template<typename T, int N>
+   class point {
+   public:
+      point() : a() { // value-initializes every coordinate
+      }
+
+      point(std::initializer_list<T> v) : a() { // coordinates not listed stay T()
+         // Copy at most N values so an over-long list cannot overrun `a`.
+         std::copy_n(v.begin(), std::min<std::size_t>(v.size(), N), a.begin());
+      }
+
+      point(const T *v) { // reads exactly N elements starting at v
+         std::copy(v, v + N, a.begin());
+      }
+
+      T &operator[](int i) { // unchecked element access
+         return a[i];
+      }
+
+      const T &operator[](int i) const {
+         return a[i];
+      }
+
+      point operator+(const point &p) const { // component-wise sum
+         point q;
+         std::transform(a.begin(), a.end(), p.a.begin(),
+                        q.a.begin(), std::plus<T>());
+         return q;
+      }
+
+      T operator()(const point &p) const { // dot product, accumulated in T rather than int
+         return std::inner_product(p.a.begin(), p.a.end(), a.begin(), T());
+      }
+
+   protected:
+      std::array<T, N> a;
+   };
+}
+
+#endif
diff --git a/src/gallium/state_trackers/clover/core/kernel.cpp b/src/gallium/state_trackers/clover/core/kernel.cpp
new file mode 100644
index 00000000000..6fa8bd63453
--- /dev/null
+++ b/src/gallium/state_trackers/clover/core/kernel.cpp
@@ -0,0 +1,393 @@
+//
+// Copyright 2012 Francisco Jerez
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+// OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+
+#include "core/kernel.hpp"
+#include "core/resource.hpp"
+#include "pipe/p_context.h"
+
+using namespace clover;
+
+_cl_kernel::_cl_kernel(clover::program &prog,
+                       const std::string &name,
+                       const std::vector<clover::module::argument> &args) :
+   prog(prog), __name(name), exec(*this) {
+   for (auto arg : args) { // one argument object per module argument, in declaration order
+      if (arg.type == module::argument::scalar)
+         this->args.emplace_back(new scalar_argument(arg.size));
+      else if (arg.type == module::argument::global)
+         this->args.emplace_back(new global_argument(arg.size));
+      else if (arg.type == module::argument::local)
+         this->args.emplace_back(new local_argument());
+      else if (arg.type == module::argument::constant)
+         this->args.emplace_back(new constant_argument());
+      else if (arg.type == module::argument::image2d_rd || // 2D and 3D read images share one class
+               arg.type == module::argument::image3d_rd)
+         this->args.emplace_back(new image_rd_argument());
+      else if (arg.type == module::argument::image2d_wr || // likewise for write images
+               arg.type == module::argument::image3d_wr)
+         this->args.emplace_back(new image_wr_argument());
+      else if (arg.type == module::argument::sampler)
+         this->args.emplace_back(new sampler_argument());
+      else // unrecognized argument type
+         throw error(CL_INVALID_KERNEL_DEFINITION);
+   }
+}
+
+template<typename T, typename V>
+static inline std::vector<T>
+pad_vector(clover::command_queue &q, const V &v, T x) {
+   std::vector<T> w { v.begin(), v.end() }; // copy of v's elements converted to T
+   w.resize(q.dev.max_block_size().size(), x); // pad with x (or truncate) to the length of max_block_size()
+   return w;
+}
+
+void
+_cl_kernel::launch(clover::command_queue &q,
+                   const std::vector<size_t> &grid_offset, // NOTE(review): never read in this function
+                   const std::vector<size_t> &grid_size,
+                   const std::vector<size_t> &block_size) {
+   void *st = exec.bind(&q); // binds every argument and returns the compute state
+   auto g_handles = map([&](size_t h) { return (uint32_t *)&exec.input[h]; }, // input offsets -> pointers into exec.input
+                        exec.g_handles.begin(), exec.g_handles.end());
+
+   q.pipe->bind_compute_state(q.pipe, st);
+   q.pipe->bind_compute_sampler_states(q.pipe, 0, exec.samplers.size(),
+                                       exec.samplers.data());
+   q.pipe->set_compute_sampler_views(q.pipe, 0, exec.sviews.size(),
+                                     exec.sviews.data());
+   q.pipe->set_compute_resources(q.pipe, 0, exec.resources.size(),
+                                     exec.resources.data());
+   q.pipe->set_global_binding(q.pipe, 0, exec.g_buffers.size(),
+                              exec.g_buffers.data(), g_handles.data());
+
+   q.pipe->launch_grid(q.pipe,
+                       pad_vector<uint>(q, block_size, 1).data(), // missing dimensions default to 1
+                       pad_vector<uint>(q, grid_size, 1).data(),
+                       module(q).sym(__name).offset,
+                       exec.input.data());
+
+   q.pipe->set_global_binding(q.pipe, 0, exec.g_buffers.size(), NULL, NULL); // unbind in reverse order of binding
+   q.pipe->set_compute_resources(q.pipe, 0, exec.resources.size(), NULL);
+   q.pipe->set_compute_sampler_views(q.pipe, 0, exec.sviews.size(), NULL);
+   q.pipe->bind_compute_sampler_states(q.pipe, 0, exec.samplers.size(), NULL);
+   exec.unbind();
+}
+
+size_t
+_cl_kernel::mem_local() const { // sum of storage() over all local_argument arguments
+   size_t sz = 0;
+
+   for (auto &arg : args) {
+      if (dynamic_cast<local_argument *>(arg.get())) // only local arguments contribute
+         sz += arg->storage();
+   }
+
+   return sz;
+}
+
+size_t
+_cl_kernel::mem_private() const {
+   return 0;
+}
+
+size_t
+_cl_kernel::max_block_size() const {
+   return SIZE_MAX;
+}
+
+const std::string &
+_cl_kernel::name() const {
+   return __name;
+}
+
+std::vector<size_t>
+_cl_kernel::block_size() const {
+   return { 0, 0, 0 };
+}
+
+const clover::module &
+_cl_kernel::module(const clover::command_queue &q) const {
+   return prog.binaries().find(&q.dev)->second; // NOTE(review): find() result is used without an end() check
+}
+
+
+_cl_kernel::exec_context::exec_context(clover::kernel &kern) :
+   kern(kern), q(NULL), mem_local(0), st(NULL) {
+}
+
+_cl_kernel::exec_context::~exec_context() {
+   if (st)
+      q->pipe->delete_compute_state(q->pipe, st);
+}
+
+void *
+_cl_kernel::exec_context::bind(clover::command_queue *__q) {
+   std::swap(q, __q); // q is now the new queue, __q the previously bound one
+
+   for (auto &arg : kern.args) // each argument appends its data to this context
+      arg->bind(*this);
+
+   // Create a new compute state if anything changed.
+   if (!st || q != __q ||
+       cs.req_local_mem != mem_local ||
+       cs.req_input_mem != input.size()) {
+      if (st) // delete the stale state through the queue that was bound when it was created
+         __q->pipe->delete_compute_state(__q->pipe, st);
+
+      cs.prog = kern.module(*q).sec(module::section::text).data.begin();
+      cs.req_local_mem = mem_local;
+      cs.req_input_mem = input.size();
+      st = q->pipe->create_compute_state(q->pipe, &cs);
+   }
+
+   return st;
+}
+
+void
+_cl_kernel::exec_context::unbind() {
+   for (auto &arg : kern.args) // let every argument release what its bind() acquired
+      arg->unbind(*this);
+
+   input.clear(); // reset everything the arguments' bind() calls accumulated
+   samplers.clear();
+   sviews.clear();
+   resources.clear();
+   g_buffers.clear();
+   g_handles.clear();
+   mem_local = 0;
+}
+
+_cl_kernel::argument::argument(size_t size) :
+   __size(size), __set(false) {
+}
+
+bool
+_cl_kernel::argument::set() const {
+   return __set;
+}
+
+size_t
+_cl_kernel::argument::storage() const {
+   return 0;
+}
+
+_cl_kernel::scalar_argument::scalar_argument(size_t size) :
+   argument(size) {
+}
+
+void
+_cl_kernel::scalar_argument::set(size_t size, const void *value) {
+   if (size != __size) // must match the size given at construction
+      throw error(CL_INVALID_ARG_SIZE);
+
+   v = { (uint8_t *)value, (uint8_t *)value + size }; // presumably stores a byte copy of the value (v is declared elsewhere)
+   __set = true;
+}
+
+void
+_cl_kernel::scalar_argument::bind(exec_context &ctx) {
+   ctx.input.insert(ctx.input.end(), v.begin(), v.end());
+}
+
+void
+_cl_kernel::scalar_argument::unbind(exec_context &ctx) {
+}
+
+_cl_kernel::global_argument::global_argument(size_t size) :
+   argument(size) {
+}
+
+void
+_cl_kernel::global_argument::set(size_t size, const void *value) {
+   if (size != sizeof(cl_mem)) // the argument value must be exactly one cl_mem handle
+      throw error(CL_INVALID_ARG_SIZE);
+
+   obj = dynamic_cast<clover::buffer *>(*(cl_mem *)value); // NOTE(review): value is dereferenced unchecked; obj is NULL if the cast fails and bind() uses it unchecked
+   __set = true;
+}
+
+void
+_cl_kernel::global_argument::bind(exec_context &ctx) {
+   size_t offset = ctx.input.size();
+   size_t idx = ctx.g_buffers.size();
+
+   ctx.input.resize(offset + __size); // reserve this argument's slot in the input buffer
+
+   ctx.g_buffers.resize(idx + 1);
+   ctx.g_buffers[idx] = obj->resource(ctx.q).pipe;
+
+   ctx.g_handles.resize(idx + 1);
+   ctx.g_handles[idx] = offset; // launch() turns this offset into a pointer into the input buffer
+}
+
+void
+_cl_kernel::global_argument::unbind(exec_context &ctx) {
+}
+
+_cl_kernel::local_argument::local_argument() :
+   argument(sizeof(uint32_t)), __storage(0) { // storage() must read as 0 until set() runs
+}
+
+size_t
+_cl_kernel::local_argument::storage() const {
+   return __storage;
+}
+
+void
+_cl_kernel::local_argument::set(size_t size, const void *value) {
+   if (value) // a local argument carries only a size; any non-NULL value is rejected
+      throw error(CL_INVALID_ARG_VALUE);
+
+   __storage = size; // number of bytes later reported by storage()
+   __set = true;
+}
+
+void
+_cl_kernel::local_argument::bind(exec_context &ctx) {
+   size_t offset = ctx.input.size();
+   size_t ptr = ctx.mem_local; // running total of local memory before this argument
+
+   ctx.input.resize(offset + sizeof(uint32_t));
+   *(uint32_t *)&ctx.input[offset] = ptr; // the kernel input receives that running total as a 32-bit value
+
+   ctx.mem_local += __storage;
+}
+
+void
+_cl_kernel::local_argument::unbind(exec_context &ctx) {
+}
+
+_cl_kernel::constant_argument::constant_argument() :
+   argument(sizeof(uint32_t)) {
+}
+
+void
+_cl_kernel::constant_argument::set(size_t size, const void *value) {
+   if (size != sizeof(cl_mem))
+      throw error(CL_INVALID_ARG_SIZE);
+
+   obj = dynamic_cast<clover::buffer *>(*(cl_mem *)value);
+   __set = true;
+}
+
+void
+_cl_kernel::constant_argument::bind(exec_context &ctx) {
+   size_t offset = ctx.input.size();
+   size_t idx = ctx.resources.size();
+
+   ctx.input.resize(offset + sizeof(uint32_t));
+   *(uint32_t *)&ctx.input[offset] = idx << 24; // resource index is stored in bits 24 and up
+
+   ctx.resources.resize(idx + 1);
+   ctx.resources[idx] = st = obj->resource(ctx.q).bind_surface(*ctx.q, false); // st is kept for unbind()
+}
+
+void
+_cl_kernel::constant_argument::unbind(exec_context &ctx) {
+   obj->resource(ctx.q).unbind_surface(*ctx.q, st);
+}
+
+_cl_kernel::image_rd_argument::image_rd_argument() :
+   argument(sizeof(uint32_t)) {
+}
+
+void
+_cl_kernel::image_rd_argument::set(size_t size, const void *value) {
+   if (size != sizeof(cl_mem))
+      throw error(CL_INVALID_ARG_SIZE);
+
+   obj = dynamic_cast<clover::image *>(*(cl_mem *)value);
+   __set = true;
+}
+
+void
+_cl_kernel::image_rd_argument::bind(exec_context &ctx) {
+   size_t offset = ctx.input.size();
+   size_t idx = ctx.sviews.size();
+
+   ctx.input.resize(offset + sizeof(uint32_t));
+   *(uint32_t *)&ctx.input[offset] = idx; // the kernel input holds the sampler view's index
+
+   ctx.sviews.resize(idx + 1);
+   ctx.sviews[idx] = st = obj->resource(ctx.q).bind_sampler_view(*ctx.q); // st is kept for unbind()
+}
+
+void
+_cl_kernel::image_rd_argument::unbind(exec_context &ctx) {
+   obj->resource(ctx.q).unbind_sampler_view(*ctx.q, st);
+}
+
+_cl_kernel::image_wr_argument::image_wr_argument() :
+   argument(sizeof(uint32_t)) {
+}
+
+void
+_cl_kernel::image_wr_argument::set(size_t size, const void *value) {
+   if (size != sizeof(cl_mem))
+      throw error(CL_INVALID_ARG_SIZE);
+
+   obj = dynamic_cast<clover::image *>(*(cl_mem *)value);
+   __set = true;
+}
+
+void
+_cl_kernel::image_wr_argument::bind(exec_context &ctx) {
+   size_t offset = ctx.input.size();
+   size_t idx = ctx.resources.size();
+
+   ctx.input.resize(offset + sizeof(uint32_t));
+   *(uint32_t *)&ctx.input[offset] = idx; // the kernel input holds the resource's index
+
+   ctx.resources.resize(idx + 1);
+   ctx.resources[idx] = st = obj->resource(ctx.q).bind_surface(*ctx.q, true); // st is kept for unbind()
+}
+
+void
+_cl_kernel::image_wr_argument::unbind(exec_context &ctx) {
+   obj->resource(ctx.q).unbind_surface(*ctx.q, st);
+}
+
+_cl_kernel::sampler_argument::sampler_argument() :
+   argument(0) {
+}
+
+void
+_cl_kernel::sampler_argument::set(size_t size, const void *value) {
+   if (size != sizeof(cl_sampler))
+      throw error(CL_INVALID_ARG_SIZE);
+
+   obj = *(cl_sampler *)value;
+   __set = true;
+}
+
+void
+_cl_kernel::sampler_argument::bind(exec_context &ctx) {
+   size_t idx = ctx.samplers.size();
+
+   ctx.samplers.resize(idx + 1);
+   ctx.samplers[idx] = st = obj->bind(*ctx.q);
+}
+
+void
+_cl_kernel::sampler_argument::unbind(exec_context &ctx) {
+   obj->unbind(*ctx.q, st);
+}
diff --git a/src/gallium/state_trackers/clover/core/kernel.hpp b/src/gallium/state_trackers/clover/core/kernel.hpp
new file mode 100644
index 00000000000..bc21de8094f
--- /dev/null
+++ b/src/gallium/state_trackers/clover/core/kernel.hpp
@@ -0,0 +1,214 @@
+//
+// Copyright 2012 Francisco Jerez
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+// OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+
+#ifndef __CORE_KERNEL_HPP__
+#define __CORE_KERNEL_HPP__
+
+#include <memory>
+
+#include "core/base.hpp"
+#include "core/program.hpp"
+#include "core/memory.hpp"
+#include "core/sampler.hpp"
+#include "pipe/p_state.h"
+
+namespace clover {
+   typedef struct _cl_kernel kernel; // clover::kernel is the CL API's _cl_kernel
+   class argument;
+}
+
+struct _cl_kernel : public clover::ref_counter {
+private:
+   ///
+   /// Class containing all the state required to execute a compute kernel.
+   ///
+   struct exec_context {
+      exec_context(clover::kernel &kern);
+      ~exec_context();
+
+      void *bind(clover::command_queue *q);
+      void unbind();
+
+      clover::kernel &kern;
+      clover::command_queue *q;
+
+      std::vector<uint8_t> input; // raw argument bytes, appended by argument::bind()
+      std::vector<void *> samplers; // one entry per bound sampler argument
+      std::vector<pipe_sampler_view *> sviews; // one entry per bound read image
+      std::vector<pipe_surface *> resources; // one entry per bound write image
+      std::vector<pipe_resource *> g_buffers;
+      std::vector<size_t> g_handles;
+      size_t mem_local;
+
+   private:
+      void *st;
+      pipe_compute_state cs;
+   };
+
+public:
+   class argument {
+   public:
+      argument(size_t size);
+      virtual ~argument() {} ///< Needed: deleted via std::unique_ptr<argument>.
+
+      /// \a true if the argument has been set.
+      bool set() const;
+
+      /// Argument size in the input buffer.
+      size_t size() const;
+
+      /// Storage space required for the referenced object.
+      virtual size_t storage() const;
+
+      /// Set this argument to some object.
+      virtual void set(size_t size, const void *value) = 0;
+
+      /// Allocate the necessary resources to bind the specified
+      /// object to this argument, and update \a ctx accordingly.
+      virtual void bind(exec_context &ctx) = 0;
+
+      /// Free any resources that were allocated in bind().
+      virtual void unbind(exec_context &ctx) = 0;
+
+   protected:
+      size_t __size;
+      bool __set;
+   };
+
+   _cl_kernel(clover::program &prog,
+              const std::string &name,
+              const std::vector<clover::module::argument> &args);
+
+   void launch(clover::command_queue &q,
+               const std::vector<size_t> &grid_offset,
+               const std::vector<size_t> &grid_size,
+               const std::vector<size_t> &block_size);
+
+   size_t mem_local() const;
+   size_t mem_private() const;
+   size_t max_block_size() const;
+
+   const std::string &name() const;
+   std::vector<size_t> block_size() const;
+
+   clover::program &prog;
+   std::vector<std::unique_ptr<argument>> args; // owns one object per kernel argument
+
+private:
+   const clover::module &
+   module(const clover::command_queue &q) const;
+
+   class scalar_argument : public argument {
+   public:
+      scalar_argument(size_t size);
+
+      virtual void set(size_t size, const void *value);
+      virtual void bind(exec_context &ctx);
+      virtual void unbind(exec_context &ctx);
+
+   private:
+      std::vector<uint8_t> v;
+   };
+
+   class global_argument : public argument {
+   public:
+      global_argument(size_t size);
+
+      virtual void set(size_t size, const void *value);
+      virtual void bind(exec_context &ctx);
+      virtual void unbind(exec_context &ctx);
+
+   private:
+      clover::buffer *obj;
+   };
+
+   class local_argument : public argument {
+   public:
+      local_argument();
+
+      virtual size_t storage() const;
+
+      virtual void set(size_t size, const void *value);
+      virtual void bind(exec_context &ctx);
+      virtual void unbind(exec_context &ctx);
+
+   private:
+      size_t __storage;
+   };
+
+   class constant_argument : public argument {
+   public:
+      constant_argument();
+
+      virtual void set(size_t size, const void *value);
+      virtual void bind(exec_context &ctx);
+      virtual void unbind(exec_context &ctx);
+
+   private:
+      clover::buffer *obj;
+      pipe_surface *st;
+   };
+
+   class image_rd_argument : public argument {
+   public:
+      image_rd_argument();
+
+      virtual void set(size_t size, const void *value);
+      virtual void bind(exec_context &ctx);
+      virtual void unbind(exec_context &ctx);
+
+   private:
+      clover::image *obj;
+      pipe_sampler_view *st; // view obtained in bind(), released in unbind()
+   };
+
+   class image_wr_argument : public argument {
+   public:
+      image_wr_argument();
+
+      virtual void set(size_t size, const void *value);
+      virtual void bind(exec_context &ctx);
+      virtual void unbind(exec_context &ctx);
+
+   private:
+      clover::image *obj;
+      pipe_surface *st; // surface obtained in bind(), released in unbind()
+   };
+
+   class sampler_argument : public argument {
+   public:
+      sampler_argument();
+
+      virtual void set(size_t size, const void *value);
+      virtual void bind(exec_context &ctx);
+      virtual void unbind(exec_context &ctx);
+
+   private:
+      clover::sampler *obj;
+      void *st; // state object obtained in bind(), released in unbind()
+   };
+
+   std::string __name;
+   exec_context exec;
+};
+
+#endif
diff --git a/src/gallium/state_trackers/clover/core/memory.cpp b/src/gallium/state_trackers/clover/core/memory.cpp
new file mode 100644
index 00000000000..8d8be2e547d
--- /dev/null
+++ b/src/gallium/state_trackers/clover/core/memory.cpp
@@ -0,0 +1,199 @@
+//
+// Copyright 2012 Francisco Jerez
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+// OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+
+#include "core/memory.hpp"
+#include "core/resource.hpp"
+
+using namespace clover;
+
+_cl_mem::_cl_mem(clover::context &ctx, cl_mem_flags flags,
+                 size_t size, void *host_ptr) :
+   ctx(ctx), __flags(flags), __size(size), __host_ptr(host_ptr),
+   __destroy_notify([]{}) {
+   // With CL_MEM_COPY_HOST_PTR, keep a private copy of the caller's bytes.
+   if ((flags & CL_MEM_COPY_HOST_PTR) != 0)
+      data.assign(static_cast<const char *>(host_ptr), size);
+}
+
+_cl_mem::~_cl_mem() {
+   __destroy_notify(); // run the registered notifier (an empty lambda by default)
+}
+
+void
+_cl_mem::destroy_notify(std::function<void ()> f) {
+   __destroy_notify = f; // replaces the previous notifier; invoked by the destructor
+}
+
+cl_mem_flags
+_cl_mem::flags() const {
+   return __flags; // flags passed to the constructor
+}
+
+size_t
+_cl_mem::size() const {
+   return __size; // size passed to the constructor
+}
+
+void *
+_cl_mem::host_ptr() const {
+   return __host_ptr; // host pointer passed to the constructor
+}
+
+buffer::buffer(clover::context &ctx, cl_mem_flags flags,
+               size_t size, void *host_ptr) :
+   memory_obj(ctx, flags, size, host_ptr) { // everything is forwarded to _cl_mem
+}
+
+cl_mem_object_type
+buffer::type() const {
+   return CL_MEM_OBJECT_BUFFER; // inherited unchanged by root_buffer and sub_buffer
+}
+
+root_buffer::root_buffer(clover::context &ctx, cl_mem_flags flags,
+                         size_t size, void *host_ptr) :
+   buffer(ctx, flags, size, host_ptr) { // no extra state; resources are created lazily
+}
+
+clover::resource &
+root_buffer::resource(cl_command_queue q) {
+   auto it = resources.find(&q->dev);
+   if (it == resources.end()) {
+      // Nothing for this device yet: build the resource from one that already
+      // exists for another device, or else from the pending host data.
+      root_resource *r = resources.empty() ?
+         new root_resource(q->dev, *this, *q, data) :
+         new root_resource(q->dev, *this, *resources.begin()->second);
+      it = resources.insert(
+         std::make_pair(&q->dev, std::unique_ptr<root_resource>(r))).first;
+      data.clear();
+   }
+   return *it->second;
+}
+
+sub_buffer::sub_buffer(clover::root_buffer &parent, cl_mem_flags flags,
+                       size_t offset, size_t size) :
+   buffer(parent.ctx, flags, size, // a parent without host pointer yields NULL, not NULL + offset
+          parent.host_ptr() ? (char *)parent.host_ptr() + offset : nullptr),
+   parent(parent), __offset(offset) {
+}
+
+clover::resource &
+sub_buffer::resource(cl_command_queue q) {
+   auto it = resources.find(&q->dev);
+
+   if (it == resources.end()) {
+      // Lazily wrap the parent's resource for this device at our offset.
+      sub_resource *r = new sub_resource(parent.resource(q), { offset() });
+      it = resources.insert(
+         std::make_pair(&q->dev, std::unique_ptr<sub_resource>(r))).first;
+   }
+   return *it->second;
+}
+
+size_t
+sub_buffer::offset() const {
+   return __offset; // offset into the parent buffer, as passed to the constructor
+}
+
+image::image(clover::context &ctx, cl_mem_flags flags,
+             const cl_image_format *format,
+             size_t width, size_t height, size_t depth,
+             size_t row_pitch, size_t slice_pitch, size_t size,
+             void *host_ptr) :
+   memory_obj(ctx, flags, size, host_ptr),
+   __format(*format), __width(width), __height(height), __depth(depth), // *format is copied; must not be NULL
+   __row_pitch(row_pitch), __slice_pitch(slice_pitch) {
+}
+
+clover::resource &
+image::resource(cl_command_queue q) {
+   auto it = resources.find(&q->dev);
+   if (it == resources.end()) {
+      // Nothing for this device yet: build the resource from one that already
+      // exists for another device, or else from the pending host data.
+      root_resource *r = resources.empty() ?
+         new root_resource(q->dev, *this, *q, data) :
+         new root_resource(q->dev, *this, *resources.begin()->second);
+      it = resources.insert(
+         std::make_pair(&q->dev, std::unique_ptr<root_resource>(r))).first;
+      data.clear();
+   }
+   return *it->second;
+}
+
+cl_image_format
+image::format() const {
+   return __format; // returned by value: the copy taken at construction
+}
+
+size_t
+image::width() const {
+   return __width; // width passed to the constructor
+}
+
+size_t
+image::height() const {
+   return __height; // height passed to the constructor
+}
+
+size_t
+image::depth() const {
+   return __depth; // depth passed to the constructor (image2d passes 0)
+}
+
+size_t
+image::row_pitch() const {
+   return __row_pitch; // row pitch passed to the constructor
+}
+
+size_t
+image::slice_pitch() const {
+   return __slice_pitch; // slice pitch passed to the constructor (image2d passes 0)
+}
+
+image2d::image2d(clover::context &ctx, cl_mem_flags flags,
+                 const cl_image_format *format, size_t width,
+                 size_t height, size_t row_pitch,
+                 void *host_ptr) :
+   image(ctx, flags, format, width, height, 0, // depth is passed as 0
+         row_pitch, 0, height * row_pitch, host_ptr) { // slice pitch 0; size = height * row_pitch
+}
+
+cl_mem_object_type
+image2d::type() const {
+   return CL_MEM_OBJECT_IMAGE2D; // object type reported for 2D images
+}
+
+image3d::image3d(clover::context &ctx, cl_mem_flags flags,
+                 const cl_image_format *format,
+                 size_t width, size_t height, size_t depth,
+                 size_t row_pitch, size_t slice_pitch,
+                 void *host_ptr) :
+   image(ctx, flags, format, width, height, depth,
+         row_pitch, slice_pitch, depth * slice_pitch, // size = depth * slice_pitch
+         host_ptr) {
+}
+
+cl_mem_object_type
+image3d::type() const {
+   return CL_MEM_OBJECT_IMAGE3D; // object type reported for 3D images
+}
diff --git a/src/gallium/state_trackers/clover/core/memory.hpp b/src/gallium/state_trackers/clover/core/memory.hpp
new file mode 100644
index 00000000000..96f70e931bc
--- /dev/null
+++ b/src/gallium/state_trackers/clover/core/memory.hpp
@@ -0,0 +1,157 @@
+//
+// Copyright 2012 Francisco Jerez
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+// OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+
+#ifndef __CORE_MEMORY_HPP__
+#define __CORE_MEMORY_HPP__
+
+#include <functional>
+#include <map>
+#include <memory>
+
+#include "core/base.hpp"
+#include "core/queue.hpp"
+
+namespace clover {
+   typedef struct _cl_mem memory_obj; // clover::memory_obj is the CL API's _cl_mem
+
+   class resource;
+   class sub_resource; // NOTE(review): root_resource is also used in this header; confirm where it is declared
+}
+
+struct _cl_mem : public clover::ref_counter {
+protected:
+   _cl_mem(clover::context &ctx, cl_mem_flags flags,
+           size_t size, void *host_ptr);
+   _cl_mem(const _cl_mem &obj) = delete; // memory objects are not copyable
+
+public:
+   virtual ~_cl_mem(); // invokes the destroy_notify() callback
+
+   virtual cl_mem_object_type type() const = 0;
+   virtual clover::resource &resource(cl_command_queue q) = 0; // resource for q's device
+
+   void destroy_notify(std::function<void ()> f); // f runs when the object is destroyed
+   cl_mem_flags flags() const;
+   size_t size() const;
+   void *host_ptr() const;
+
+   clover::context &ctx;
+
+private:
+   cl_mem_flags __flags;
+   size_t __size;
+   void *__host_ptr;
+   std::function<void ()> __destroy_notify;
+
+protected:
+   std::string data; // host bytes copied at construction when CL_MEM_COPY_HOST_PTR is set
+};
+
+namespace clover {
+   struct buffer : public memory_obj { // common base of root_buffer and sub_buffer
+   protected:
+      buffer(clover::context &ctx, cl_mem_flags flags,
+             size_t size, void *host_ptr);
+
+   public:
+      virtual cl_mem_object_type type() const; // always CL_MEM_OBJECT_BUFFER
+   };
+
+   struct root_buffer : public buffer {
+   public:
+      root_buffer(clover::context &ctx, cl_mem_flags flags,
+                  size_t size, void *host_ptr);
+
+      virtual clover::resource &resource(cl_command_queue q); // created on first use per device
+
+   private:
+      std::map<clover::device *,
+               std::unique_ptr<clover::root_resource>> resources; // keyed by device
+   };
+
+   struct sub_buffer : public buffer {
+   public:
+      sub_buffer(clover::root_buffer &parent, cl_mem_flags flags,
+                 size_t offset, size_t size);
+
+      virtual clover::resource &resource(cl_command_queue q); // wraps parent.resource(q)
+      size_t offset() const;
+
+      clover::root_buffer &parent; // held by reference, not owned
+
+   private:
+      size_t __offset;
+      std::map<clover::device *,
+               std::unique_ptr<clover::sub_resource>> resources; // keyed by device
+   };
+
+   struct image : public memory_obj { // common base of image2d and image3d
+   protected:
+      image(clover::context &ctx, cl_mem_flags flags,
+            const cl_image_format *format,
+            size_t width, size_t height, size_t depth,
+            size_t row_pitch, size_t slice_pitch, size_t size,
+            void *host_ptr);
+
+   public:
+      virtual clover::resource &resource(cl_command_queue q); // created on first use per device
+      cl_image_format format() const;
+      size_t width() const;
+      size_t height() const;
+      size_t depth() const;
+      size_t row_pitch() const;
+      size_t slice_pitch() const;
+
+   private:
+      cl_image_format __format;
+      size_t __width;
+      size_t __height;
+      size_t __depth;
+      size_t __row_pitch;
+      size_t __slice_pitch;
+      std::map<clover::device *,
+               std::unique_ptr<clover::root_resource>> resources; // keyed by device
+   };
+
+   struct image2d : public image {
+   public:
+      image2d(clover::context &ctx, cl_mem_flags flags,
+              const cl_image_format *format, size_t width,
+              size_t height, size_t row_pitch,
+              void *host_ptr);
+
+      virtual cl_mem_object_type type() const; // CL_MEM_OBJECT_IMAGE2D
+   };
+
+   struct image3d : public image {
+   public:
+      image3d(clover::context &ctx, cl_mem_flags flags,
+              const cl_image_format *format,
+              size_t width, size_t height, size_t depth,
+              size_t row_pitch, size_t slice_pitch,
+              void *host_ptr);
+
+      virtual cl_mem_object_type type() const; // CL_MEM_OBJECT_IMAGE3D
+   };
+}
+
+#endif
diff --git a/src/gallium/state_trackers/clover/core/module.cpp b/src/gallium/state_trackers/clover/core/module.cpp
new file mode 100644
index 00000000000..1865771443b
--- /dev/null
+++ b/src/gallium/state_trackers/clover/core/module.cpp
@@ -0,0 +1,172 @@
+//
+// Copyright 2012 Francisco Jerez
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+// OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+
+#include <type_traits>
+#include <algorithm>
+
+#include "core/module.hpp"
+
+using namespace clover;
+
+namespace {
+   template<typename T, typename = void> // 2nd parameter: enable_if hook for specializations
+   struct __serializer;
+
+   /// Serialize \a x to \a os using the matching __serializer<T>.
+   template<typename T>
+   void
+   __proc(compat::ostream &os, const T &x) {
+      __serializer<T>::proc(os, x);
+   }
+
+   /// Deserialize from \a is into \a x using the matching __serializer<T>.
+   template<typename T>
+   void
+   __proc(compat::istream &is, T &x) {
+      __serializer<T>::proc(is, x);
+   }
+
+   template<typename T> // Deserialize a T from \a is and return it by value.
+   T
+   __proc(compat::istream &is) {
+      T x;
+      __serializer<T>::proc(is, x);
+      return x;
+   }
+
+   /// (De)serialize a scalar value as its sizeof(T) in-memory bytes.
+   template<typename T>
+   struct __serializer<T, typename std::enable_if<
+                             std::is_scalar<T>::value>::type> {
+      static void
+      proc(compat::ostream &os, const T &x) {
+         os.write(reinterpret_cast<const char *>(&x), sizeof(x));
+      }
+
+      static void
+      proc(compat::istream &is, T &x) {
+         is.read(reinterpret_cast<char *>(&x), sizeof(x));
+      }
+   };
+
+   /// (De)serialize a vector: a uint32_t element count, then each element.
+   template<typename T>
+   struct __serializer<compat::vector<T>> {
+      static void
+      proc(compat::ostream &os, const compat::vector<T> &v) {
+         __proc<uint32_t>(os, v.size());
+
+         for (size_t i = 0; i < v.size(); i++)
+            __proc<T>(os, v[i]);
+      }
+
+      static void
+      proc(compat::istream &is, compat::vector<T> &v) {
+         v.reserve(__proc<uint32_t>(is));
+         // NOTE(review): assumes compat::vector::reserve() also sets size(); confirm.
+         for (size_t i = 0; i < v.size(); i++)
+            new(&v[i]) T(__proc<T>(is));
+      }
+   };
+
+   /// (De)serialize a module::section: id, type, size, then the data bytes.
+   template<>
+   struct __serializer<module::section> {
+      template<typename S, typename QT>
+      static void proc(S &s, QT &x) {
+         __proc(s, x.id); // previously skipped, leaving id indeterminate on load
+         __proc(s, x.type);
+         __proc(s, x.size);
+         __proc(s, x.data);
+      }
+   };
+
+   /// (De)serialize a module::argument: type, then size.
+   template<>
+   struct __serializer<module::argument> {
+      template<typename S, typename QT> // S: stream type, QT: (const) module::argument
+      static void
+      proc(S &s, QT &x) {
+         __proc(s, x.type);
+         __proc(s, x.size);
+      }
+   };
+
+   /// (De)serialize a module::symbol: name, section, offset, then arguments.
+   template<>
+   struct __serializer<module::symbol> {
+      template<typename S, typename QT>
+      static void proc(S &s, QT &x) {
+         __proc(s, x.name); // without the name, module::sym() can never match
+         __proc(s, x.section);
+         __proc(s, x.offset);
+         __proc(s, x.args);
+      }
+   };
+
+   /// (De)serialize a module: the symbol vector, then the section vector.
+   template<>
+   struct __serializer<module> {
+      template<typename S, typename QT> // S: stream type, QT: (const) module
+      static void
+      proc(S &s, QT &x) {
+         __proc(s, x.syms);
+         __proc(s, x.secs);
+      }
+   };
+};
+
+namespace clover {
+   void
+   module::serialize(compat::ostream &os) const {
+      __proc(os, *this); // writes syms, then secs (see __serializer<module>)
+   }
+
+   module
+   module::deserialize(compat::istream &is) {
+      return __proc<module>(is); // reads back what serialize() wrote
+   }
+
+   const module::symbol &
+   module::sym(compat::string name) const {
+      // Linear scan: the first symbol whose name matches wins.
+      for (auto it = syms.begin(); it != syms.end(); ++it) {
+         if (compat::string(it->name) == name)
+            return *it;
+      }
+
+      // No symbol carries that name.
+      throw noent_error();
+   }
+
+   const module::section &
+   module::sec(typename section::type type) const {
+      // Linear scan: the first section of the requested type wins.
+      for (auto it = secs.begin(); it != secs.end(); ++it) {
+         if (it->type == type)
+            return *it;
+      }
+
+      // No section has that type.
+      throw noent_error();
+   }
+}
diff --git a/src/gallium/state_trackers/clover/core/module.hpp b/src/gallium/state_trackers/clover/core/module.hpp
new file mode 100644
index 00000000000..bc4b203af8e
--- /dev/null
+++ b/src/gallium/state_trackers/clover/core/module.hpp
@@ -0,0 +1,93 @@
+//
+// Copyright 2012 Francisco Jerez
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+// OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+
+#ifndef __CORE_MODULE_HPP__
+#define __CORE_MODULE_HPP__
+
+#include "core/compat.hpp"
+
+namespace clover {
+   struct module { // a list of symbols plus the sections they refer to
+      class noent_error { // thrown by sym() and sec() when nothing matches
+      public:
+         virtual ~noent_error() {}
+      };
+
+      typedef uint32_t resource_id;
+      typedef uint32_t size_t; // NOTE: inside module, size_t is this 32-bit type
+
+      struct section {
+         enum type {
+            text,
+            data_constant,
+            data_global,
+            data_local,
+            data_private
+         };
+
+         resource_id id;
+         type type;
+         size_t size;
+         clover::compat::vector<char> data;
+      };
+
+      struct argument {
+         enum type {
+            scalar,
+            constant,
+            global,
+            local,
+            image2d_rd,
+            image2d_wr,
+            image3d_rd,
+            image3d_wr,
+            sampler
+         };
+
+         type type;
+         size_t size;
+      };
+
+      struct symbol {
+         clover::compat::vector<char> name; // compared against the lookup name in sym()
+         resource_id section;
+         size_t offset;
+         clover::compat::vector<argument> args;
+      };
+
+      void serialize(compat::ostream &os) const;
+      static module deserialize(compat::istream &is);
+
+      /// Look up a symbol by name.  Throws module::noent_error if not
+      /// found.
+      const symbol &sym(compat::string name) const;
+
+      /// Look up a section by type.  Throws module::noent_error if not
+      /// found.
+      const section &sec(typename section::type type) const;
+
+      clover::compat::vector<symbol> syms;
+      clover::compat::vector<section> secs;
+   };
+}
+
+#endif
diff --git a/src/gallium/state_trackers/clover/core/program.cpp b/src/gallium/state_trackers/clover/core/program.cpp
new file mode 100644
index 00000000000..5ac9f93480e
--- /dev/null
+++ b/src/gallium/state_trackers/clover/core/program.cpp
@@ -0,0 +1,85 @@
+//
+// Copyright 2012 Francisco Jerez
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+// OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+
+#include "core/program.hpp"
+#include "core/compiler.hpp"
+
+using namespace clover;
+
+_cl_program::_cl_program(clover::context &ctx,
+                         const std::string &source) :
+   ctx(ctx), __source(source) { // source is copied; nothing is compiled until build()
+}
+
+_cl_program::_cl_program(clover::context &ctx,
+                         const std::vector<clover::device *> &devs,
+                         const std::vector<clover::module> &binaries) :
+   ctx(ctx) { // __source stays empty for a program created from binaries
+   for_each([&](clover::device *dev, const clover::module &bin) {
+         __binaries.insert({ dev, bin });
+      },
+      devs.begin(), devs.end(), binaries.begin()); // presumably walks both in lockstep; confirm for_each()
+}
+
+void
+_cl_program::build(const std::vector<clover::device *> &devs) {
+   // Every build starts from scratch: drop old binaries and logs.
+   __binaries.clear();
+   __logs.clear();
+   for (clover::device *dev : devs) {
+      try {
+         const bool tgsi = dev->ir_target() == "tgsi";
+         __binaries.insert({ dev, tgsi ?
+            compile_program_tgsi(__source, dev->ir_target()) :
+            compile_program_llvm(__source, dev->ir_target()) });
+      } catch (build_error &e) {
+         // Keep the compiler output for build_log(), then report failure.
+         __logs.insert({ dev, e.what() });
+         throw error(CL_BUILD_PROGRAM_FAILURE);
+      }
+   }
+}
+
+const std::string &
+_cl_program::source() const {
+   return __source; // reference to the stored source text
+}
+
+const std::map<clover::device *, clover::module> &
+_cl_program::binaries() const {
+   return __binaries; // one module per device that has a binary
+}
+
+cl_build_status
+_cl_program::build_status(clover::device *dev) const {
+   return __binaries.count(dev) ? CL_BUILD_SUCCESS : CL_BUILD_NONE; // NOTE(review): a failed build also yields CL_BUILD_NONE
+}
+
+std::string
+_cl_program::build_opts(clover::device *dev) const {
+   return {}; // always empty: dev is ignored and no options are stored
+}
+
+std::string
+_cl_program::build_log(clover::device *dev) const {
+   return __logs.count(dev) ? __logs.find(dev)->second : ""; // empty when no log exists for dev
+}
diff --git a/src/gallium/state_trackers/clover/core/program.hpp b/src/gallium/state_trackers/clover/core/program.hpp
new file mode 100644
index 00000000000..f3858f6ce98
--- /dev/null
+++ b/src/gallium/state_trackers/clover/core/program.hpp
@@ -0,0 +1,61 @@
+//
+// Copyright 2012 Francisco Jerez
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+// OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+
+#ifndef __CORE_PROGRAM_HPP__
+#define __CORE_PROGRAM_HPP__
+
+#include <map>
+
+#include "core/base.hpp"
+#include "core/context.hpp"
+#include "core/module.hpp"
+
+namespace clover {
+   typedef struct _cl_program program; // clover::program is the CL API's _cl_program
+}
+
+struct _cl_program : public clover::ref_counter {
+public:
+   _cl_program(clover::context &ctx,
+               const std::string &source); // program created from source text
+   _cl_program(clover::context &ctx,
+               const std::vector<clover::device *> &devs,
+               const std::vector<clover::module> &binaries); // program created from binaries
+
+   void build(const std::vector<clover::device *> &devs); // throws error(CL_BUILD_PROGRAM_FAILURE)
+
+   const std::string &source() const;
+   const std::map<clover::device *, clover::module> &binaries() const;
+
+   cl_build_status build_status(clover::device *dev) const;
+   std::string build_opts(clover::device *dev) const;
+   std::string build_log(clover::device *dev) const;
+
+   clover::context &ctx;
+
+private:
+   std::map<clover::device *, clover::module> __binaries; // filled by build() or the binary ctor
+   std::map<clover::device *, std::string> __logs; // compiler output of failed builds
+   std::string __source;
+};
+
+#endif
diff --git a/src/gallium/state_trackers/clover/core/queue.cpp b/src/gallium/state_trackers/clover/core/queue.cpp
new file mode 100644
index 00000000000..7e476c715e0
--- /dev/null
+++ b/src/gallium/state_trackers/clover/core/queue.cpp
@@ -0,0 +1,69 @@
+//
+// Copyright 2012 Francisco Jerez
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+// OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+
+#include <algorithm>
+
+#include "core/queue.hpp"
+#include "core/event.hpp"
+#include "pipe/p_screen.h"
+#include "pipe/p_context.h"
+
+using namespace clover;
+
+_cl_command_queue::_cl_command_queue(context &ctx, device &dev,
+                                     cl_command_queue_properties props) :
+   ctx(ctx), dev(dev), __props(props) {
+   pipe = dev.pipe->context_create(dev.pipe, NULL); // this queue's own pipe_context
+   if (!pipe)
+      throw error(CL_INVALID_DEVICE); // context_create returned NULL
+}
+
+_cl_command_queue::~_cl_command_queue() {
+   pipe->destroy(pipe); // tear down the context created by the constructor
+}
+
+void
+_cl_command_queue::flush() {
+   pipe_screen *screen = dev.pipe;
+   pipe_fence_handle *fence = NULL;
+
+   if (!queued_events.empty()) {
+      // [first, last) is the leading run of events whose signalled() is true.
+      auto first = queued_events.begin();
+      auto last = std::find_if(queued_events.begin(), queued_events.end(),
+                               [](event_ptr &ev) { return !ev->signalled(); });
+
+      // Flush the context, hand the new fence to that run, then dequeue it.
+      pipe->flush(pipe, &fence);
+      std::for_each(first, last, [&](event_ptr &ev) { ev->fence(fence); });
+      screen->fence_reference(screen, &fence, NULL); // point our local handle back at NULL
+      queued_events.erase(first, last);
+   }
+}
+
+void
+_cl_command_queue::sequence(clover::hard_event *ev) {
+   if (!queued_events.empty())
+      queued_events.back()->chain(ev); // chain ev onto the most recently queued event
+
+   queued_events.push_back(ev); // ev becomes the new tail of the pending list
+}
diff --git a/src/gallium/state_trackers/clover/core/queue.hpp b/src/gallium/state_trackers/clover/core/queue.hpp
new file mode 100644
index 00000000000..6c124eae83f
--- /dev/null
+++ b/src/gallium/state_trackers/clover/core/queue.hpp
@@ -0,0 +1,72 @@
+//
+// Copyright 2012 Francisco Jerez
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+// OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+
+#ifndef __CORE_QUEUE_HPP__
+#define __CORE_QUEUE_HPP__
+
+#include "core/base.hpp"
+#include "core/context.hpp"
+#include "pipe/p_context.h"
+
+namespace clover {
+   typedef struct _cl_command_queue command_queue;
+   class resource;
+   class mapping;
+   class hard_event;
+}
+
+struct _cl_command_queue : public clover::ref_counter {
+public:
+   _cl_command_queue(clover::context &ctx, clover::device &dev,
+                     cl_command_queue_properties props);
+   _cl_command_queue(const _cl_command_queue &q) = delete;
+   ~_cl_command_queue();
+
+   void flush();
+
+   cl_command_queue_properties props() const {
+      return __props;
+   }
+
+   clover::context &ctx;
+   clover::device &dev;
+
+   friend class clover::resource;
+   friend class clover::root_resource;
+   friend class clover::mapping;
+   friend class clover::hard_event;
+   friend struct _cl_sampler;
+   friend struct _cl_kernel;
+
+private:
+   /// Serialize a hardware event with respect to the previous ones,
+   /// and push it to the pending list.
+   void sequence(clover::hard_event *ev);
+
+   cl_command_queue_properties __props;
+   pipe_context *pipe;
+
+   typedef clover::ref_ptr<clover::hard_event> event_ptr;
+   std::vector<event_ptr> queued_events;
+};
+
+#endif
diff --git a/src/gallium/state_trackers/clover/core/resource.cpp b/src/gallium/state_trackers/clover/core/resource.cpp
new file mode 100644
index 00000000000..61085b2cb29
--- /dev/null
+++ b/src/gallium/state_trackers/clover/core/resource.cpp
@@ -0,0 +1,203 @@
+//
+// Copyright 2012 Francisco Jerez
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+// OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+
+#include "core/resource.hpp"
+#include "pipe/p_screen.h"
+#include "util/u_sampler.h"
+#include "util/u_format.h"
+
+using namespace clover;
+
+namespace {
+   class box { // builds a pipe_box from an (origin, size) pair of points
+   public:
+      box(const resource::point &origin, const resource::point &size) :
+         pipe({ (unsigned)origin[0], (unsigned)origin[1],
+                (unsigned)origin[2], (unsigned)size[0],
+                (unsigned)size[1], (unsigned)size[2] }) { // each component is cast to unsigned
+      }
+
+      operator const pipe_box *() { // lets a box be passed where a const pipe_box * is expected
+         return &pipe;
+      }
+
+   protected:
+      pipe_box pipe;
+   };
+}
+
+resource::resource(clover::device &dev, clover::memory_obj &obj) :
+   dev(dev), obj(obj), pipe(NULL), offset{0} {
+}
+
+resource::~resource() {
+}
+
+void
+resource::copy(command_queue &q, const point &origin, const point &region,
+               resource &src_res, const point &src_origin) {
+   point p = offset + origin; // destination position, shifted by this resource's offset
+
+   q.pipe->resource_copy_region(q.pipe, pipe, 0, p[0], p[1], p[2],
+                                src_res.pipe, 0,
+                                box(src_res.offset + src_origin, region)); // source box, shifted by src_res.offset
+}
+
+void *
+resource::add_map(command_queue &q, cl_map_flags flags, bool blocking,
+                  const point &origin, const point &region) {
+   maps.emplace_back(q, *this, flags, blocking, origin, region); // mapping's constructor throws on failure
+   return maps.back(); // implicit mapping -> void * conversion yields the mapped pointer
+}
+
+void
+resource::del_map(void *p) {
+   auto it = std::find(maps.begin(), maps.end(), p); // look the mapping up by its mapped pointer
+   if (it != maps.end())
+      maps.erase(it); // destroying the mapping unmaps and destroys its transfer; unknown p is ignored
+}
+
+unsigned
+resource::map_count() const {
+   return maps.size();
+}
+
+pipe_sampler_view *
+resource::bind_sampler_view(clover::command_queue &q) {
+   pipe_sampler_view info; // populated by u_sampler_view_default_template() below
+
+   u_sampler_view_default_template(&info, pipe, pipe->format);
+   return q.pipe->create_sampler_view(q.pipe, pipe, &info); // created on q's context
+}
+
+void
+resource::unbind_sampler_view(clover::command_queue &q,
+                              pipe_sampler_view *st) {
+   q.pipe->sampler_view_destroy(q.pipe, st);
+}
+
+pipe_surface *
+resource::bind_surface(clover::command_queue &q, bool rw) {
+   pipe_surface info {}; // zero-initialized template for create_surface()
+
+   info.format = pipe->format;
+   info.usage = pipe->bind;
+   info.writable = rw;
+
+   if (pipe->target == PIPE_BUFFER)
+      info.u.buf.last_element = pipe->width0 - 1; // buffers only: last element is width0 - 1
+
+   return q.pipe->create_surface(q.pipe, pipe, &info); // created on q's context
+}
+
+void
+resource::unbind_surface(clover::command_queue &q, pipe_surface *st) {
+   q.pipe->surface_destroy(q.pipe, st);
+}
+
+root_resource::root_resource(clover::device &dev, clover::memory_obj &obj,
+                             clover::command_queue &q,
+                             const std::string &data) :
+   resource(dev, obj) {
+   pipe_resource info {}; // zero-initialized creation template
+
+   if (image *img = dynamic_cast<image *>(&obj)) { // images supply format and extents
+      info.format = translate_format(img->format());
+      info.width0 = img->width();
+      info.height0 = img->height();
+      info.depth0 = img->depth();
+   } else { // any other object: width0 is obj.size(), format stays zero-initialized
+      info.width0 = obj.size();
+      info.height0 = 1;
+      info.depth0 = 1;
+   }
+
+   info.target = translate_target(obj.type());
+   info.bind = (PIPE_BIND_SAMPLER_VIEW |
+                PIPE_BIND_COMPUTE_RESOURCE |
+                PIPE_BIND_GLOBAL |
+                PIPE_BIND_TRANSFER_READ |
+                PIPE_BIND_TRANSFER_WRITE);
+
+   pipe = dev.pipe->resource_create(dev.pipe, &info);
+   if (!pipe)
+      throw error(CL_OUT_OF_RESOURCES);
+
+   if (!data.empty()) { // optional initial contents, written through q's context
+      box rect { { 0, 0, 0 }, { info.width0, info.height0, info.depth0 } };
+      unsigned cpp = util_format_get_blocksize(info.format); // block size reported for info.format
+
+      q.pipe->transfer_inline_write(q.pipe, pipe, 0, PIPE_TRANSFER_WRITE,
+                                    rect, data.data(), cpp * info.width0,
+                                    cpp * info.width0 * info.height0); // NOTE(review): presumably row and slice strides -- confirm
+   }
+}
+
+root_resource::root_resource(clover::device &dev, clover::memory_obj &obj,
+                             clover::root_resource &r) :
+   resource(dev, obj) {
+   assert(0); // XXX -- resource shared among dev and r.dev
+}
+
+root_resource::~root_resource() {
+   dev.pipe->resource_destroy(dev.pipe, pipe);
+}
+
+sub_resource::sub_resource(clover::resource &r, point offset) :
+   resource(r.dev, r.obj) {
+   pipe = r.pipe; // reuse r's pipe_resource as storage
+   this->offset = r.offset + offset; // this-> is required: plain 'offset' names the parameter
+}
+
+mapping::mapping(command_queue &q, resource &r,
+                 cl_map_flags flags, bool blocking,
+                 const resource::point &origin,
+                 const resource::point &region) :
+   pctx(q.pipe) {
+   unsigned usage = ((flags & CL_MAP_WRITE ? PIPE_TRANSFER_WRITE : 0 ) |
+                     (flags & CL_MAP_READ ? PIPE_TRANSFER_READ : 0 ) |
+                     (!blocking ? PIPE_TRANSFER_UNSYNCHRONIZED : 0)); // only non-blocking maps may skip synchronization
+
+   pxfer = pctx->get_transfer(pctx, r.pipe, 0, usage,
+                              box(origin + r.offset, region)); // origin is shifted by r's offset
+   if (!pxfer)
+      throw error(CL_OUT_OF_RESOURCES);
+
+   p = pctx->transfer_map(pctx, pxfer);
+   if (!p) {
+      pctx->transfer_destroy(pctx, pxfer); // do not leak the transfer when the map fails
+      throw error(CL_OUT_OF_RESOURCES);
+   }
+}
+
+mapping::mapping(mapping &&m) :
+   pctx(m.pctx), pxfer(m.pxfer), p(m.p) { // take over m's context, transfer and pointer
+   m.p = NULL;
+   m.pxfer = NULL; // m's destructor now skips the unmap/destroy
+}
+
+mapping::~mapping() {
+   if (pxfer) { // NULL after this object has been moved from
+      pctx->transfer_unmap(pctx, pxfer);
+      pctx->transfer_destroy(pctx, pxfer);
+   }
+}
diff --git a/src/gallium/state_trackers/clover/core/resource.hpp b/src/gallium/state_trackers/clover/core/resource.hpp
new file mode 100644
index 00000000000..947060139ec
--- /dev/null
+++ b/src/gallium/state_trackers/clover/core/resource.hpp
@@ -0,0 +1,129 @@
+//
+// Copyright 2012 Francisco Jerez
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+// OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+
+#ifndef __CORE_RESOURCE_HPP__
+#define __CORE_RESOURCE_HPP__
+
+#include <list>
+
+#include "core/base.hpp"
+#include "core/memory.hpp"
+#include "core/geometry.hpp"
+#include "pipe/p_state.h"
+
+namespace clover {
+   class mapping;
+
+   ///
+   /// Class that represents a device-specific instance of some memory
+   /// object.
+   ///
+   class resource {
+   public:
+      typedef clover::point<size_t, 3> point;
+
+      resource(const resource &r) = delete;
+      virtual ~resource();
+
+      void copy(command_queue &q, const point &origin, const point &region,
+                resource &src_resource, const point &src_origin);
+
+      void *add_map(command_queue &q, cl_map_flags flags, bool blocking,
+                    const point &origin, const point &region);
+      void del_map(void *p);
+      unsigned map_count() const;
+
+      clover::device &dev;
+      clover::memory_obj &obj;
+
+      friend class sub_resource;
+      friend class mapping;
+      friend struct ::_cl_kernel;
+
+   protected:
+      resource(clover::device &dev, clover::memory_obj &obj);
+
+      pipe_sampler_view *bind_sampler_view(clover::command_queue &q);
+      void unbind_sampler_view(clover::command_queue &q,
+                               pipe_sampler_view *st);
+
+      pipe_surface *bind_surface(clover::command_queue &q, bool rw);
+      void unbind_surface(clover::command_queue &q, pipe_surface *st);
+
+      pipe_resource *pipe;
+      point offset;
+
+   private:
+      std::list<mapping> maps;
+   };
+
+   ///
+   /// Resource associated with its own top-level data storage
+   /// allocated in some device.
+   ///
+   class root_resource : public resource {
+   public:
+      root_resource(clover::device &dev, clover::memory_obj &obj,
+                    clover::command_queue &q, const std::string &data);
+      root_resource(clover::device &dev, clover::memory_obj &obj,
+                    root_resource &r);
+      virtual ~root_resource();
+   };
+
+   ///
+   /// Resource that reuses a portion of some other resource as data
+   /// storage.
+   ///
+   class sub_resource : public resource {
+   public:
+      sub_resource(clover::resource &r, point offset);
+   };
+
+   ///
+   /// Class that represents a mapping of some resource into the CPU
+   /// memory space.
+   ///
+   class mapping {
+   public:
+      mapping(command_queue &q, resource &r, cl_map_flags flags,
+              bool blocking, const resource::point &origin,
+              const resource::point &region);
+      mapping(const mapping &m) = delete;
+      mapping(mapping &&m);
+      ~mapping();
+
+      operator void *() {
+         return p;
+      }
+
+      operator char *() {
+         return (char *)p;
+      }
+
+   private:
+      pipe_context *pctx;
+      pipe_transfer *pxfer;
+      void *p;
+   };
+}
+
+#endif
diff --git a/src/gallium/state_trackers/clover/core/sampler.cpp b/src/gallium/state_trackers/clover/core/sampler.cpp
new file mode 100644
index 00000000000..6d683f2b41a
--- /dev/null
+++ b/src/gallium/state_trackers/clover/core/sampler.cpp
@@ -0,0 +1,73 @@
+//
+// Copyright 2012 Francisco Jerez
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+// OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+
+#include "core/sampler.hpp"
+#include "pipe/p_state.h"
+
+using namespace clover;
+
+_cl_sampler::_cl_sampler(clover::context &ctx, bool norm_mode,
+                         cl_addressing_mode addr_mode,
+                         cl_filter_mode filter_mode) :
+   ctx(ctx), __norm_mode(norm_mode),
+   __addr_mode(addr_mode), __filter_mode(filter_mode) {
+}
+
+bool
+_cl_sampler::norm_mode() {
+   return __norm_mode;
+}
+
+cl_addressing_mode
+_cl_sampler::addr_mode() {
+   return __addr_mode;
+}
+
+cl_filter_mode
+_cl_sampler::filter_mode() {
+   return __filter_mode;
+}
+
+void *
+_cl_sampler::bind(clover::command_queue &q) {
+   struct pipe_sampler_state info {}; // zero-initialized; only the fields below are set
+
+   info.normalized_coords = norm_mode();
+
+   info.wrap_s = info.wrap_t = info.wrap_r = // same wrap mode on all three coordinates
+      (addr_mode() == CL_ADDRESS_CLAMP_TO_EDGE ? PIPE_TEX_WRAP_CLAMP_TO_EDGE :
+       addr_mode() == CL_ADDRESS_CLAMP ? PIPE_TEX_WRAP_CLAMP_TO_BORDER :
+       addr_mode() == CL_ADDRESS_REPEAT ? PIPE_TEX_WRAP_REPEAT :
+       addr_mode() == CL_ADDRESS_MIRRORED_REPEAT ? PIPE_TEX_WRAP_MIRROR_REPEAT :
+       PIPE_TEX_WRAP_CLAMP_TO_EDGE); // any other addressing mode falls back to clamp-to-edge
+
+   info.min_img_filter = info.mag_img_filter =
+      (filter_mode() == CL_FILTER_LINEAR ? PIPE_TEX_FILTER_LINEAR :
+       PIPE_TEX_FILTER_NEAREST); // anything but CL_FILTER_LINEAR selects nearest
+
+   return q.pipe->create_sampler_state(q.pipe, &info); // created on q's context
+}
+
+void
+_cl_sampler::unbind(clover::command_queue &q, void *st) {
+   q.pipe->delete_sampler_state(q.pipe, st);
+}
diff --git a/src/gallium/state_trackers/clover/core/sampler.hpp b/src/gallium/state_trackers/clover/core/sampler.hpp
new file mode 100644
index 00000000000..5bb5bccb1a1
--- /dev/null
+++ b/src/gallium/state_trackers/clover/core/sampler.hpp
@@ -0,0 +1,55 @@
+//
+// Copyright 2012 Francisco Jerez
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+// OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+
+#ifndef __CORE_SAMPLER_HPP__
+#define __CORE_SAMPLER_HPP__
+
+#include "core/base.hpp"
+#include "core/queue.hpp"
+
+namespace clover {
+   typedef struct _cl_sampler sampler;
+}
+
+struct _cl_sampler : public clover::ref_counter {
+public:
+   _cl_sampler(clover::context &ctx, bool norm_mode,
+               cl_addressing_mode addr_mode, cl_filter_mode filter_mode);
+
+   bool norm_mode(); // the three accessors return the values given at construction
+   cl_addressing_mode addr_mode();
+   cl_filter_mode filter_mode();
+
+   clover::context &ctx;
+
+   friend struct _cl_kernel; // 'struct', matching how _cl_kernel is befriended elsewhere in clover
+
+private:
+   void *bind(clover::command_queue &q); // creates a pipe sampler state on q's context
+   void unbind(clover::command_queue &q, void *st); // deletes a state returned by bind()
+
+   bool __norm_mode;
+   cl_addressing_mode __addr_mode;
+   cl_filter_mode __filter_mode;
+};
+
+#endif
diff --git a/src/gallium/state_trackers/clover/llvm/invocation.cpp b/src/gallium/state_trackers/clover/llvm/invocation.cpp
new file mode 100644
index 00000000000..89e21bf9289
--- /dev/null
+++ b/src/gallium/state_trackers/clover/llvm/invocation.cpp
@@ -0,0 +1,94 @@
+//
+// Copyright 2012 Francisco Jerez
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+// OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+
+#include "core/compiler.hpp"
+
+#if 0
+#include <clang/Frontend/CompilerInstance.h>
+#include <clang/Frontend/TextDiagnosticPrinter.h>
+#include <clang/CodeGen/CodeGenAction.h>
+#include <llvm/LLVMContext.h>
+#include <llvm/Support/TargetSelect.h>
+#include <llvm/Support/MemoryBuffer.h>
+
+#include <iostream>
+#include <iomanip>
+#include <fstream>
+#include <cstdio>
+#endif
+
+using namespace clover;
+
+#if 0
+namespace {
+   void
+   build_binary(const std::string &source, const std::string &target,
+                const std::string &name) {
+      clang::CompilerInstance c;
+      clang::EmitObjAction act(&llvm::getGlobalContext());
+      std::string log;
+      llvm::raw_string_ostream s_log(log);
+
+      LLVMInitializeTGSITarget();
+      LLVMInitializeTGSITargetInfo();
+      LLVMInitializeTGSITargetMC();
+      LLVMInitializeTGSIAsmPrinter();
+
+      c.getFrontendOpts().Inputs.push_back(
+         std::make_pair(clang::IK_OpenCL, name));
+      c.getHeaderSearchOpts().UseBuiltinIncludes = false;
+      c.getHeaderSearchOpts().UseStandardIncludes = false;
+      c.getLangOpts().NoBuiltin = true;
+      c.getTargetOpts().Triple = target;
+      c.getInvocation().setLangDefaults(clang::IK_OpenCL);
+      c.createDiagnostics(0, NULL, new clang::TextDiagnosticPrinter(
+                             s_log, c.getDiagnosticOpts()));
+
+      c.getPreprocessorOpts().addRemappedFile(
+         name, llvm::MemoryBuffer::getMemBuffer(source));
+
+      if (!c.ExecuteAction(act))
+         throw build_error(log);
+   }
+
+   module
+   load_binary(const char *name) {
+      std::ifstream fs((name));
+      std::vector<unsigned char> str((std::istreambuf_iterator<char>(fs)),
+                                     (std::istreambuf_iterator<char>()));
+      compat::istream cs(str);
+      return module::deserialize(cs);
+   }
+}
+#endif
+
+module
+clover::compile_program_llvm(const compat::string &source,
+                             const compat::string &target) { // neither argument is used while the path below is disabled
+#if 0
+   build_binary(source, target, "cl_input");
+   module m = load_binary("cl_input.o");
+   std::remove("cl_input.o");
+   return m;
+#endif
+   return module(); // clang/LLVM path is compiled out: returns a default-constructed module
+}
diff --git a/src/gallium/state_trackers/clover/tgsi/compiler.cpp b/src/gallium/state_trackers/clover/tgsi/compiler.cpp
new file mode 100644
index 00000000000..eb27db1aa76
--- /dev/null
+++ b/src/gallium/state_trackers/clover/tgsi/compiler.cpp
@@ -0,0 +1,100 @@
+//
+// Copyright 2012 Francisco Jerez
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+// OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+
+#include <sstream>
+
+#include "core/compiler.hpp"
+
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_text.h"
+#include "util/u_memory.h"
+
+using namespace clover;
+
+namespace {
+   void
+   read_header(const std::string &header, module &m) {
+      std::istringstream ls(header);
+      std::string line;
+
+      while (getline(ls, line)) { // one symbol per line: <name> <offset> <arg kind>...
+         std::istringstream ts(line);
+         std::string name, tok;
+         module::size_t offset;
+         compat::vector<module::argument> args;
+
+         if (!(ts >> name))
+            continue; // line has no tokens: skip it
+
+         if (!(ts >> offset))
+            throw build_error("invalid kernel start address");
+
+         while (ts >> tok) { // each remaining token names one argument kind
+            if (tok == "scalar")
+               args.push_back({ module::argument::scalar, 4 });
+            else if (tok == "global")
+               args.push_back({ module::argument::global, 4 });
+            else if (tok == "local")
+               args.push_back({ module::argument::local, 4 });
+            else if (tok == "constant")
+               args.push_back({ module::argument::constant, 4 });
+            else if (tok == "image2d_rd")
+               args.push_back({ module::argument::image2d_rd, 4 });
+            else if (tok == "image2d_wr")
+               args.push_back({ module::argument::image2d_wr, 4 });
+            else if (tok == "image3d_rd")
+               args.push_back({ module::argument::image3d_rd, 4 });
+            else if (tok == "image3d_wr")
+               args.push_back({ module::argument::image3d_wr, 4 });
+            else if (tok == "sampler")
+               args.push_back({ module::argument::sampler, 0 }); // the only kind recorded with 0 instead of 4
+            else
+               throw build_error("invalid kernel argument");
+         }
+
+         m.syms.push_back({ name, 0, offset, args }); // NOTE(review): the 0 presumably refers to section id 0 -- confirm
+      }
+   }
+
+   void
+   read_body(const char *source, module &m) {
+      tgsi_token prog[1024]; // fixed-capacity token buffer on the stack
+
+      if (!tgsi_text_translate(source, prog, Elements(prog)))
+         throw build_error("translate failed");
+
+      unsigned sz = tgsi_num_tokens(prog) * sizeof(tgsi_token); // byte size of the translated tokens
+      m.secs.push_back({ 0, module::section::text, sz, { (char *)prog, sz } }); // text section built from those sz bytes
+   }
+}
+
+module
+clover::compile_program_tgsi(const compat::string &source,
+                             const compat::string &target) { // target is not used in this function
+   const char *body = source.find("COMP\n"); // NOTE(review): result when "COMP\n" is absent depends on compat::string::find -- confirm
+   module m;
+
+   read_header({ source.begin(), body }, m); // everything before "COMP\n" is parsed as the header
+   read_body(body, m); // from "COMP\n" onwards is translated as TGSI text
+
+   return m;
+}
diff --git a/src/gallium/state_trackers/d3d1x/gd3d1x/sm4_to_tgsi.cpp b/src/gallium/state_trackers/d3d1x/gd3d1x/sm4_to_tgsi.cpp
index 392fd3e0921..aaa46f19e8c 100644
--- a/src/gallium/state_trackers/d3d1x/gd3d1x/sm4_to_tgsi.cpp
+++ b/src/gallium/state_trackers/d3d1x/gd3d1x/sm4_to_tgsi.cpp
@@ -446,7 +446,7 @@ struct sm4_to_tgsi_converter
 				break;
 			case SM4_OPCODE_RESINFO:
 				// TODO: return type
-				ureg_RESINFO(ureg, _dst(), _src(1), resources[_idx(SM4_FILE_RESOURCE, 2)]);
+				ureg_SVIEWINFO(ureg, _dst(), _src(1), resources[_idx(SM4_FILE_RESOURCE, 2)]);
 				break;
 			// TODO: sample index, texture offset
 			case SM4_OPCODE_LD: // dst, coord_int, res; mipmap level in last coord_int arg
@@ -750,11 +750,12 @@ next:;
 				}
 				if(resources.size() <= (unsigned)idx)
 					resources.resize(idx + 1);
-				resources[idx] = ureg_DECL_resource(ureg, idx, targets[idx].first,
-								    res_return_type(dcl.rrt.x),
-								    res_return_type(dcl.rrt.y),
-								    res_return_type(dcl.rrt.z),
-								    res_return_type(dcl.rrt.w));
+				resources[idx] = ureg_DECL_sampler_view(
+                                   ureg, idx, targets[idx].first,
+                                   res_return_type(dcl.rrt.x),
+                                   res_return_type(dcl.rrt.y),
+                                   res_return_type(dcl.rrt.z),
+                                   res_return_type(dcl.rrt.w));
 				break;
 			case SM4_OPCODE_DCL_SAMPLER:
 				check(idx >= 0);
diff --git a/src/gallium/state_trackers/egl/fbdev/native_fbdev.c b/src/gallium/state_trackers/egl/fbdev/native_fbdev.c
index b45ab5c4f2e..b17a8ce78c9 100644
--- a/src/gallium/state_trackers/egl/fbdev/native_fbdev.c
+++ b/src/gallium/state_trackers/egl/fbdev/native_fbdev.c
@@ -41,6 +41,7 @@
  *  - no pixmap support
  */
 
+#include <errno.h>
 #include <sys/ioctl.h>
 #include <sys/types.h>
 #include <sys/stat.h>
diff --git a/src/gallium/state_trackers/gbm/gbm_drm.c b/src/gallium/state_trackers/gbm/gbm_drm.c
index d4baf87096a..a327fdaae64 100644
--- a/src/gallium/state_trackers/gbm/gbm_drm.c
+++ b/src/gallium/state_trackers/gbm/gbm_drm.c
@@ -190,10 +190,7 @@ gbm_gallium_drm_destroy(struct gbm_device *gbm)
 {
    struct gbm_gallium_drm_device *gdrm = gbm_gallium_drm_device(gbm);
 
-   gdrm->screen->destroy(gdrm->screen);
-
-   FREE(gdrm->base.driver_name);
-
+   gallium_screen_destroy(gdrm);
    FREE(gdrm);
 }
 
diff --git a/src/gallium/state_trackers/gbm/gbm_gallium_drmint.h b/src/gallium/state_trackers/gbm/gbm_gallium_drmint.h
index 6277b8dba2e..a5d6d834737 100644
--- a/src/gallium/state_trackers/gbm/gbm_gallium_drmint.h
+++ b/src/gallium/state_trackers/gbm/gbm_gallium_drmint.h
@@ -71,4 +71,7 @@ gbm_gallium_drm_device_create(int fd);
 int
 gallium_screen_create(struct gbm_gallium_drm_device *gdrm);
 
+void
+gallium_screen_destroy(struct gbm_gallium_drm_device *gdrm);
+
 #endif
diff --git a/src/gallium/targets/gbm/Makefile b/src/gallium/targets/gbm/Makefile
index cd0c61080d8..423debf176d 100644
--- a/src/gallium/targets/gbm/Makefile
+++ b/src/gallium/targets/gbm/Makefile
@@ -3,192 +3,39 @@
 TOP = ../../../..
 include $(TOP)/configs/current
 
-PIPE_PREFIX := pipe_
-
 GBM_BACKEND = gbm_gallium_drm
-GBM_SOURCES = gbm.c pipe_loader.c
+GBM_SOURCES = gbm.c
 
 GBM_INCLUDES = \
 	       -I$(TOP)/include \
 	       -I$(TOP)/src/gallium/state_trackers/gbm \
 	       -I$(TOP)/src/gbm/main \
 	       -I$(TOP)/src/gallium/auxiliary \
-	       -I$(TOP)/src/gallium/include \
+	       -I$(TOP)/src/gallium/winsys \
+	       -I$(TOP)/src/gallium/include
 
 GBM_LIBS = $(LIBUDEV_LIBS) $(LIBDRM_LIB) -lm \
 	   $(TOP)/src/gallium/state_trackers/gbm/libgbm.a \
-	   $(TOP)/src/gallium/drivers/identity/libidentity.a \
-	   $(TOP)/src/gallium/drivers/galahad/libgalahad.a \
-	   $(TOP)/src/gallium/drivers/trace/libtrace.a \
-	   $(TOP)/src/gallium/drivers/rbug/librbug.a \
-	   $(GALLIUM_AUXILIARIES)
-
+	   $(GALLIUM_PIPE_LOADER_LIBS) $(GALLIUM_AUXILIARIES)
 
 GBM_CFLAGS = \
-	     -DGBM_BACKEND_SEARCH_DIR=\"$(INSTALL_LIB_DIR)/gbm\" \
-	     -DPIPE_PREFIX=\"$(PIPE_PREFIX)\" \
+	     -DPIPE_SEARCH_DIR=\"$(PIPE_INSTALL_DIR)\" \
+             $(GALLIUM_PIPE_LOADER_DEFINES) \
 	     $(LIBUDEV_CFLAGS) \
-	     $(LIBDRM_CFLAGS)  
-
-
-pipe_INCLUDES = \
-	-I$(TOP)/include \
-	-I$(TOP)/src/gallium/auxiliary \
-	-I$(TOP)/src/gallium/drivers \
-	-I$(TOP)/src/gallium/include \
-	-I$(TOP)/src/gallium/winsys
-
-pipe_LIBS = \
-	$(TOP)/src/gallium/drivers/identity/libidentity.a \
-	$(TOP)/src/gallium/drivers/trace/libtrace.a \
-	$(TOP)/src/gallium/drivers/rbug/librbug.a \
-	$(GALLIUM_AUXILIARIES)
-
-# as if we are DRI modules
-pipe_SYS = $(DRI_LIB_DEPS)
-
-pipe_CLFLAGS = \
-	-DGALLIUM_RBUG -DGALLIUM_TRACE -DGALLIUM_GALAHAD \
-	$(LIBDRM_CFLAGS)
-
-pipe_LDFLAGS = -Wl,--no-undefined
-
-# i915 pipe driver
-i915_LIBS = \
-	$(TOP)/src/gallium/winsys/i915/drm/libi915drm.a \
-	$(TOP)/src/gallium/drivers/i915/libi915.a
-i915_SYS = -ldrm_intel
-
-# nouveau pipe driver
-nouveau_LIBS = \
-	$(TOP)/src/gallium/winsys/nouveau/drm/libnouveaudrm.a \
-	$(TOP)/src/gallium/drivers/nv30/libnv30.a \
-	$(TOP)/src/gallium/drivers/nv50/libnv50.a \
-	$(TOP)/src/gallium/drivers/nvc0/libnvc0.a \
-	$(TOP)/src/gallium/drivers/nouveau/libnouveau.a
-nouveau_SYS = -ldrm_nouveau
-
-# r300 pipe driver
-r300_LIBS = \
-	$(TOP)/src/gallium/winsys/radeon/drm/libradeonwinsys.a \
-	$(TOP)/src/gallium/drivers/r300/libr300.a
-r300_SYS += -ldrm_radeon
-
-# r600 pipe driver
-r600_LIBS = \
-	$(TOP)/src/gallium/winsys/radeon/drm/libradeonwinsys.a \
-	$(TOP)/src/gallium/drivers/r600/libr600.a
-r600_SYS += -ldrm_radeon
-
-# radeonsi pipe driver
-radeonsi_LIBS = \
-	$(TOP)/src/gallium/winsys/radeon/drm/libradeonwinsys.a \
-	$(TOP)/src/gallium/drivers/radeonsi/libradeonsi.a
-radeonsi_SYS += -ldrm_radeon
-
-# vmwgfx pipe driver
-vmwgfx_LIBS = \
-	$(TOP)/src/gallium/winsys/svga/drm/libsvgadrm.a \
-	$(TOP)/src/gallium/drivers/svga/libsvga.a
-
-
+	     $(LIBDRM_CFLAGS)
 
-# LLVM
-ifeq ($(MESA_LLVM),1)
-pipe_SYS += $(LLVM_LIBS)
-pipe_LDFLAGS += $(LLVM_LDFLAGS)
-endif
-
-ifneq ($(findstring llvmpipe,$(GALLIUM_DRIVERS_DIRS)),)
-pipe_LIBS += $(TOP)/src/gallium/drivers/llvmpipe/libllvmpipe.a
-endif
-
-# determine the targets/sources
-_pipe_TARGETS_CC =
-_pipe_TARGETS_CXX =
-pipe_SOURCES =
-
-ifneq ($(findstring i915/drm,$(GALLIUM_WINSYS_DIRS)),)
-_pipe_TARGETS_CC += $(PIPE_PREFIX)i915.so
-pipe_SOURCES += pipe_i915.c
-endif
-
-ifneq ($(findstring nouveau/drm,$(GALLIUM_WINSYS_DIRS)),)
-_pipe_TARGETS_CXX += $(PIPE_PREFIX)nouveau.so
-pipe_SOURCES += pipe_nouveau.c
-endif
-
-ifneq ($(findstring radeon/drm,$(GALLIUM_WINSYS_DIRS)),)
-ifneq ($(findstring r300,$(GALLIUM_DRIVERS_DIRS)),)
-_pipe_TARGETS_CC += $(PIPE_PREFIX)r300.so
-pipe_SOURCES += pipe_r300.c
-endif
-endif
-
-ifneq ($(findstring radeon/drm,$(GALLIUM_WINSYS_DIRS)),)
-ifneq ($(findstring r600,$(GALLIUM_DRIVERS_DIRS)),)
-_pipe_TARGETS_CC += $(PIPE_PREFIX)r600.so
-pipe_SOURCES += pipe_r600.c
-endif
-endif
-
-ifneq ($(findstring radeon/drm,$(GALLIUM_WINSYS_DIRS)),)
-ifneq ($(findstring radeonsi,$(GALLIUM_DRIVERS_DIRS)),)
-_pipe_TARGETS_CC += $(PIPE_PREFIX)radeonsi.so
-pipe_SOURCES += pipe_radeonsi.c
-endif
-endif
-
-ifneq ($(findstring svga/drm,$(GALLIUM_WINSYS_DIRS)),)
-_pipe_TARGETS_CC += $(PIPE_PREFIX)vmwgfx.so
-pipe_SOURCES += pipe_vmwgfx.c
-endif
-
-pipe_OBJECTS = $(pipe_SOURCES:.c=.o)
-
-ifeq ($(MESA_LLVM),1)
-pipe_TARGETS_CXX = $(_pipe_TARGETS_CXX) $(_pipe_TARGETS_CC)
-pipe_TARGETS_CC =
-else
-pipe_TARGETS_CXX = $(_pipe_TARGETS_CXX)
-pipe_TARGETS_CC = $(_pipe_TARGETS_CC)
-endif
-
-GBM_EXTRA_TARGETS = $(addprefix $(TOP)/$(LIB_DIR)/gbm/, $(pipe_TARGETS_CC)) $(addprefix $(TOP)/$(LIB_DIR)/gbm/, $(pipe_TARGETS_CXX))
+GBM_EXTRA_TARGETS = pipes
 GBM_EXTRA_INSTALL = install-pipes
 GBM_EXTRA_CLEAN = clean-pipes
-GBM_EXTRA_SOURCES = $(pipe_SOURCES)
 
 include $(TOP)/src/gbm/backends/Makefile.template
 
+PIPE_SRC_DIR = $(TOP)/src/gallium/targets/pipe-loader
+PIPE_INSTALL_DIR = $(INSTALL_LIB_DIR)/gbm
 
-$(GBM_EXTRA_TARGETS): $(TOP)/$(LIB_DIR)/gbm/%: %
-	@$(INSTALL) -d $(dir $@)
-	$(INSTALL) $< $(dir $@)
-
-$(pipe_TARGETS_CC): $(PIPE_PREFIX)%.so: pipe_%.o $(pipe_LIBS) $($*_LIBS)
-	$(MKLIB) -o $@ -noprefix -linker '$(CC)' \
-		-ldflags '-L$(TOP)/$(LIB_DIR) $(pipe_LDFLAGS) $(LDFLAGS)' \
-		$(MKLIB_OPTIONS) $< \
-		-Wl,--start-group $(pipe_LIBS) $($*_LIBS) -Wl,--end-group \
-		$(pipe_SYS) $($*_SYS)
-
-$(pipe_TARGETS_CXX): $(PIPE_PREFIX)%.so: pipe_%.o $(pipe_LIBS) $($*_LIBS)
-	$(MKLIB) -o $@ -noprefix -linker '$(CXX)' \
-		-ldflags '-L$(TOP)/$(LIB_DIR) $(pipe_LDFLAGS) $(LDFLAGS)' \
-		$(MKLIB_OPTIONS) $< \
-		-Wl,--start-group $(pipe_LIBS) $($*_LIBS) -Wl,--end-group \
-		$(pipe_SYS) $($*_SYS)
-
-$(pipe_OBJECTS): %.o: %.c
-	$(CC) -c -o $@ $< $(pipe_INCLUDES) $(pipe_CFLAGS) $(CFLAGS)
-
-install-pipes: $(GBM_EXTRA_TARGETS)
-	$(INSTALL) -d $(DESTDIR)$(INSTALL_LIB_DIR)/gbm
-	for tgt in $(GBM_EXTRA_TARGETS); do \
-		$(MINSTALL) "$$tgt" $(DESTDIR)$(INSTALL_LIB_DIR)/gbm; \
-	done
-
+pipes:
+	@$(MAKE) -C $(PIPE_SRC_DIR)
+install-pipes:
+	@$(MAKE) -C $(PIPE_SRC_DIR) PIPE_INSTALL_DIR=$(PIPE_INSTALL_DIR) install
 clean-pipes:
-	rm -f $(pipe_TARGETS)
-	rm -f $(pipe_OBJECTS)
+	@$(MAKE) -C $(PIPE_SRC_DIR) clean
diff --git a/src/gallium/targets/gbm/gbm.c b/src/gallium/targets/gbm/gbm.c
index e840fc5fa1a..7d2af513db8 100644
--- a/src/gallium/targets/gbm/gbm.c
+++ b/src/gallium/targets/gbm/gbm.c
@@ -25,36 +25,56 @@
  *    Benjamin Franzke <benjaminfranzke@googlemail.com>
  */
 
-#include "util/u_inlines.h"
-
 #include "gbm_gallium_drmint.h"
-#include "pipe_loader.h"
 
-static struct pipe_screen *
-create_drm_screen(const char *name, int fd)
+#include "util/u_memory.h"
+#include "util/u_inlines.h"
+#include "pipe-loader/pipe_loader.h"
+
+static const char *
+get_library_search_path(void)
 {
-   struct pipe_module *pmod = get_pipe_module(name);
- 
-   return (pmod && pmod->drmdd && pmod->drmdd->create_screen) ?
-      pmod->drmdd->create_screen(fd) : NULL;
+   const char *search_path = NULL;
+
+   /* honour GBM_BACKENDS_PATH only when euid == uid (not setuid) */
+   if (geteuid() == getuid())
+      search_path = getenv("GBM_BACKENDS_PATH");
+   if (search_path == NULL) /* variable unset, or override refused */
+      search_path = PIPE_SEARCH_DIR; /* build-time default */
+
+   return search_path;
 }
 
 int
 gallium_screen_create(struct gbm_gallium_drm_device *gdrm)
 {
-   gdrm->base.driver_name = drm_fd_get_screen_name(gdrm->base.base.fd);
-   if (gdrm->base.driver_name == NULL)
+   struct pipe_loader_device *dev;
+   int ret;
+
+   ret = pipe_loader_drm_probe_fd(&dev, gdrm->base.base.fd);
+   if (!ret) /* zero is treated as probe failure */
       return -1;
 
-   gdrm->screen = create_drm_screen(gdrm->base.driver_name, gdrm->base.base.fd);
+   gdrm->screen = pipe_loader_create_screen(dev, get_library_search_path());
    if (gdrm->screen == NULL) {
-      debug_printf("failed to load driver: %s\n", gdrm->base.driver_name);
+      debug_printf("failed to load driver: %s\n", dev->driver_name);
+      pipe_loader_release(&dev, 1); /* after the message: it reads dev */
       return -1;
    };
 
+   gdrm->driver = dev; /* released in gallium_screen_destroy() */
+   gdrm->base.driver_name = strdup(dev->driver_name); /* owned copy */
    return 0;
 }
 
+void
+gallium_screen_destroy(struct gbm_gallium_drm_device *gdrm)
+{
+   FREE(gdrm->base.driver_name); /* copy made in gallium_screen_create() */
+   gdrm->screen->destroy(gdrm->screen); /* before releasing the device */
+   pipe_loader_release((struct pipe_loader_device **)&gdrm->driver, 1);
+}
+
 GBM_EXPORT struct gbm_backend gbm_backend = {
    .backend_name = "gallium_drm",
    .create_device = gbm_gallium_drm_device_create,
diff --git a/src/gallium/targets/gbm/pipe_i915.c b/src/gallium/targets/gbm/pipe_i915.c
deleted file mode 100644
index 85662cb85b5..00000000000
--- a/src/gallium/targets/gbm/pipe_i915.c
+++ /dev/null
@@ -1,27 +0,0 @@
-
-#include "target-helpers/inline_debug_helper.h"
-#include "state_tracker/drm_driver.h"
-#include "i915/drm/i915_drm_public.h"
-#include "i915/i915_public.h"
-
-static struct pipe_screen *
-create_screen(int fd)
-{
-   struct i915_winsys *iws;
-   struct pipe_screen *screen;
-
-   iws = i915_drm_winsys_create(fd);
-   if (!iws)
-      return NULL;
-
-   screen = i915_screen_create(iws);
-   if (!screen)
-      return NULL;
-
-   screen = debug_screen_wrap(screen);
-
-   return screen;
-}
-
-PUBLIC
-DRM_DRIVER_DESCRIPTOR("i915", "i915", create_screen, NULL)
diff --git a/src/gallium/targets/gbm/pipe_loader.c b/src/gallium/targets/gbm/pipe_loader.c
deleted file mode 100644
index 6200541dbf0..00000000000
--- a/src/gallium/targets/gbm/pipe_loader.c
+++ /dev/null
@@ -1,192 +0,0 @@
-/*
- * Copyright © 2011 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- * Authors:
- *    Kristian Høgsberg <krh@bitplanet.net>
- *    Benjamin Franzke <benjaminfranzke@googlemail.com>
- */
-
-#include <stdio.h>
-#include "util/u_string.h"
-#include "util/u_memory.h"
-
-#include <libudev.h>
-
-#include "gbm_gallium_drmint.h"
-#include "pipe_loader.h"
-#define DRIVER_MAP_GALLIUM_ONLY
-#include "pci_ids/pci_id_driver_map.h"
-
-static struct pipe_module pipe_modules[16];
-
-static INLINE char *
-loader_strdup(const char *str)
-{
-   return mem_dup(str, strlen(str) + 1);
-}
-
-char *
-drm_fd_get_screen_name(int fd)
-{
-   struct udev *udev;
-   struct udev_device *device, *parent;
-   const char *pci_id;
-   char *driver = NULL;
-   int vendor_id, chip_id, i, j;
-
-   udev = udev_new();
-   device = _gbm_udev_device_new_from_fd(udev, fd);
-   if (device == NULL)
-      return NULL;
-
-   parent = udev_device_get_parent(device);
-   if (parent == NULL) {
-      fprintf(stderr, "gbm: could not get parent device");
-      goto out;
-   }
-
-   pci_id = udev_device_get_property_value(parent, "PCI_ID");
-   if (pci_id == NULL ||
-       sscanf(pci_id, "%x:%x", &vendor_id, &chip_id) != 2) {
-      fprintf(stderr, "gbm: malformed or no PCI ID");
-      goto out;
-   }
-
-   for (i = 0; driver_map[i].driver; i++) {
-      if (vendor_id != driver_map[i].vendor_id)
-         continue;
-      if (driver_map[i].num_chips_ids == -1) {
-         driver = loader_strdup(driver_map[i].driver);
-         _gbm_log("pci id for %d: %04x:%04x, driver %s",
-                  fd, vendor_id, chip_id, driver);
-         goto out;
-      }
-
-      for (j = 0; j < driver_map[i].num_chips_ids; j++)
-         if (driver_map[i].chip_ids[j] == chip_id) {
-            driver = loader_strdup(driver_map[i].driver);
-            _gbm_log("pci id for %d: %04x:%04x, driver %s",
-                     fd, vendor_id, chip_id, driver);
-            goto out;
-         }
-   }
-
-out:
-   udev_device_unref(device);
-   udev_unref(udev);
-
-   return driver;
-}
-
-static void
-find_pipe_module(struct pipe_module *pmod, const char *name)
-{
-   char *search_paths, *end, *next, *p;
-   char path[PATH_MAX];
-   int ret;
-   
-   search_paths = NULL;
-   if (geteuid() == getuid()) {
-      /* don't allow setuid apps to use GBM_BACKENDS_PATH */
-      search_paths = getenv("GBM_BACKENDS_PATH");
-   }
-   if (search_paths == NULL)
-      search_paths = GBM_BACKEND_SEARCH_DIR;
-
-   end = search_paths + strlen(search_paths);
-   for (p = search_paths; p < end && pmod->lib == NULL; p = next + 1) {
-      int len;
-      next = strchr(p, ':');
-      if (next == NULL)
-         next = end;
-
-      len = next - p;
-
-      if (len) {
-         ret = util_snprintf(path, sizeof(path),
-                             "%.*s/" PIPE_PREFIX "%s" UTIL_DL_EXT, len, p, pmod->name);
-      }
-      else {
-         ret = util_snprintf(path, sizeof(path),
-                             PIPE_PREFIX "%s" UTIL_DL_EXT, pmod->name);
-      }
-      if (ret > 0 && ret < sizeof(path)) {
-         pmod->lib = util_dl_open(path);
-         debug_printf("loaded %s\n", path);
-      }
-
-   }
-}
-
-static boolean
-load_pipe_module(struct pipe_module *pmod, const char *name)
-{
-   pmod->name = loader_strdup(name);
-   if (!pmod->name)
-      return FALSE;
-
-   find_pipe_module(pmod, name);
-
-   if (pmod->lib) {
-      pmod->drmdd = (const struct drm_driver_descriptor *)
-         util_dl_get_proc_address(pmod->lib, "driver_descriptor");
-
-      /* sanity check on the name */
-      if (pmod->drmdd && strcmp(pmod->drmdd->name, pmod->name) != 0)
-         pmod->drmdd = NULL;
-
-      if (!pmod->drmdd) {
-         util_dl_close(pmod->lib);
-         pmod->lib = NULL;
-      }
-   }
-
-   return (pmod->drmdd != NULL);
-}
-
-struct pipe_module *
-get_pipe_module(const char *name)
-{
-   struct pipe_module *pmod = NULL;
-   int i;
-
-   if (!name)
-      return NULL;
-
-   for (i = 0; i < Elements(pipe_modules); i++) {
-      if (!pipe_modules[i].initialized ||
-          strcmp(pipe_modules[i].name, name) == 0) {
-         pmod = &pipe_modules[i];
-         break;
-      }
-   }
-   if (!pmod)
-      return NULL;
-
-   if (!pmod->initialized) {
-      load_pipe_module(pmod, name);
-      pmod->initialized = TRUE;
-   }
-
-   return pmod;
-}
diff --git a/src/gallium/targets/gbm/pipe_loader.h b/src/gallium/targets/gbm/pipe_loader.h
deleted file mode 100644
index 2e4cd9906b7..00000000000
--- a/src/gallium/targets/gbm/pipe_loader.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright © 2011 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- * Authors:
- *    Benjamin Franzke <benjaminfranzke@googlemail.com>
- */
-
-#ifndef _PIPE_LOADER_H_
-#define _PIPE_LOADER_H_
-
-#include "pipe/p_compiler.h"
-#include "util/u_dl.h"
-#include "state_tracker/drm_driver.h"
-
-struct pipe_module {
-   boolean initialized;
-   char *name;
-   struct util_dl_library *lib;
-   const struct drm_driver_descriptor *drmdd;
-};
-
-struct pipe_module *
-get_pipe_module(const char *name);
-
-char *
-drm_fd_get_screen_name(int fd);
-
-#endif
diff --git a/src/gallium/targets/gbm/pipe_nouveau.c b/src/gallium/targets/gbm/pipe_nouveau.c
deleted file mode 100644
index 65425e8d456..00000000000
--- a/src/gallium/targets/gbm/pipe_nouveau.c
+++ /dev/null
@@ -1,21 +0,0 @@
-
-#include "target-helpers/inline_debug_helper.h"
-#include "state_tracker/drm_driver.h"
-#include "nouveau/drm/nouveau_drm_public.h"
-
-static struct pipe_screen *
-create_screen(int fd)
-{
-   struct pipe_screen *screen;
-
-   screen = nouveau_drm_screen_create(fd);
-   if (!screen)
-      return NULL;
-
-   screen = debug_screen_wrap(screen);
-
-   return screen;
-}
-
-PUBLIC
-DRM_DRIVER_DESCRIPTOR("nouveau", "nouveau", create_screen, NULL)
diff --git a/src/gallium/targets/gbm/pipe_r300.c b/src/gallium/targets/gbm/pipe_r300.c
deleted file mode 100644
index 055685996e6..00000000000
--- a/src/gallium/targets/gbm/pipe_r300.c
+++ /dev/null
@@ -1,27 +0,0 @@
-
-#include "target-helpers/inline_debug_helper.h"
-#include "state_tracker/drm_driver.h"
-#include "radeon/drm/radeon_drm_public.h"
-#include "r300/r300_public.h"
-
-static struct pipe_screen *
-create_screen(int fd)
-{
-   struct radeon_winsys *sws;
-   struct pipe_screen *screen;
-
-   sws = radeon_drm_winsys_create(fd);
-   if (!sws)
-      return NULL;
-
-   screen = r300_screen_create(sws);
-   if (!screen)
-      return NULL;
-
-   screen = debug_screen_wrap(screen);
-
-   return screen;
-}
-
-PUBLIC
-DRM_DRIVER_DESCRIPTOR("r300", "radeon", create_screen, NULL)
diff --git a/src/gallium/targets/gbm/pipe_r600.c b/src/gallium/targets/gbm/pipe_r600.c
deleted file mode 100644
index 5d89aca6ec3..00000000000
--- a/src/gallium/targets/gbm/pipe_r600.c
+++ /dev/null
@@ -1,26 +0,0 @@
-#include "state_tracker/drm_driver.h"
-#include "target-helpers/inline_debug_helper.h"
-#include "radeon/drm/radeon_drm_public.h"
-#include "r600/r600_public.h"
-
-static struct pipe_screen *
-create_screen(int fd)
-{
-   struct radeon_winsys *rw;
-   struct pipe_screen *screen;
-
-   rw = radeon_drm_winsys_create(fd);
-   if (!rw)
-      return NULL;
-
-   screen = r600_screen_create(rw);
-   if (!screen)
-      return NULL;
-
-   screen = debug_screen_wrap(screen);
-
-   return screen;
-}
-
-PUBLIC
-DRM_DRIVER_DESCRIPTOR("r600", "radeon", create_screen, NULL)
diff --git a/src/gallium/targets/gbm/pipe_swrast.c b/src/gallium/targets/gbm/pipe_swrast.c
deleted file mode 100644
index 092abf07a52..00000000000
--- a/src/gallium/targets/gbm/pipe_swrast.c
+++ /dev/null
@@ -1,22 +0,0 @@
-
-#include "target-helpers/inline_sw_helper.h"
-#include "target-helpers/inline_debug_helper.h"
-#include "state_tracker/drm_driver.h"
-
-PUBLIC struct pipe_screen *
-swrast_create_screen(struct sw_winsys *ws);
-
-PUBLIC
-DRM_DRIVER_DESCRIPTOR("swrast", NULL, NULL, NULL)
-
-struct pipe_screen *
-swrast_create_screen(struct sw_winsys *ws)
-{
-   struct pipe_screen *screen;
-
-   screen = sw_screen_create(ws);
-   if (screen)
-      screen = debug_screen_wrap(screen);
-
-   return screen;
-}
diff --git a/src/gallium/targets/gbm/pipe_vmwgfx.c b/src/gallium/targets/gbm/pipe_vmwgfx.c
deleted file mode 100644
index bfe665be6eb..00000000000
--- a/src/gallium/targets/gbm/pipe_vmwgfx.c
+++ /dev/null
@@ -1,27 +0,0 @@
-
-#include "target-helpers/inline_debug_helper.h"
-#include "state_tracker/drm_driver.h"
-#include "svga/drm/svga_drm_public.h"
-#include "svga/svga_public.h"
-
-static struct pipe_screen *
-create_screen(int fd)
-{
-   struct svga_winsys_screen *sws;
-   struct pipe_screen *screen;
-
-   sws = svga_drm_winsys_screen_create(fd);
-   if (!sws)
-      return NULL;
-
-   screen = svga_screen_create(sws);
-   if (!screen)
-      return NULL;
-
-   screen = debug_screen_wrap(screen);
-
-   return screen;
-}
-
-PUBLIC
-DRM_DRIVER_DESCRIPTOR("vmwgfx", "vmwgfx", create_screen, NULL)
diff --git a/src/gallium/targets/opencl/Makefile.am b/src/gallium/targets/opencl/Makefile.am
new file mode 100644
index 00000000000..0d233c11b8d
--- /dev/null
+++ b/src/gallium/targets/opencl/Makefile.am
@@ -0,0 +1,36 @@
+AUTOMAKE_OPTIONS = subdir-objects
+
+lib_LTLIBRARIES = libOpenCL.la
+
+libOpenCL_la_LDFLAGS = \
+	-version-number 1:0
+
+libOpenCL_la_LIBADD = \
+	$(top_builddir)/src/gallium/state_trackers/clover/libclover.la \
+	$(top_builddir)/src/gallium/auxiliary/libgallium.a \
+	$(GALLIUM_PIPE_LOADER_LIBS) $(LIBUDEV_LIBS) \
+	-ldl
+
+libOpenCL_la_SOURCES =
+
+# Force usage of a C++ linker
+nodist_EXTRA_libOpenCL_la_SOURCES = dummy.cpp
+
+PIPE_SRC_DIR = $(top_srcdir)/src/gallium/targets/pipe-loader
+
+# Provide compatibility with scripts for the old Mesa build system for
+# a while by putting a link to the driver into /lib of the build tree.
+all-local: libOpenCL.la
+	@$(MAKE) -C $(PIPE_SRC_DIR)
+	$(MKDIR_P) $(top_builddir)/$(LIB_DIR)
+	ln -f .libs/libOpenCL.so* $(top_builddir)/$(LIB_DIR)/
+
+install-exec-local:
+	@$(MAKE) -C $(PIPE_SRC_DIR) PIPE_INSTALL_DIR=$(OPENCL_LIB_INSTALL_DIR) install
+
+clean-local:
+	@$(MAKE) -C $(PIPE_SRC_DIR) clean
+
+# FIXME: Remove when the rest of Gallium is converted to automake.
+TOP=$(top_builddir)
+default: all
diff --git a/src/gallium/targets/pipe-loader/Makefile b/src/gallium/targets/pipe-loader/Makefile
new file mode 100644
index 00000000000..eb3b4fc8aa3
--- /dev/null
+++ b/src/gallium/targets/pipe-loader/Makefile
@@ -0,0 +1,165 @@
+# Makefile for building pipe driver shared libraries.
+#
+# Input variables: PIPE_INSTALL_DIR, PIPE_PREFIX (optional)
+#
+TOP = ../../../..
+include $(TOP)/configs/current
+
+PIPE_PREFIX ?= pipe_
+
+PIPE_CPPFLAGS = \
+	-DGALLIUM_RBUG \
+	-DGALLIUM_TRACE \
+	-DGALLIUM_GALAHAD \
+	-I$(TOP)/include \
+	-I$(TOP)/src/gallium/auxiliary \
+	-I$(TOP)/src/gallium/drivers \
+	-I$(TOP)/src/gallium/include \
+	-I$(TOP)/src/gallium/winsys
+
+PIPE_LIBS = \
+	$(TOP)/src/gallium/drivers/identity/libidentity.a \
+	$(TOP)/src/gallium/drivers/galahad/libgalahad.a \
+	$(TOP)/src/gallium/drivers/trace/libtrace.a \
+	$(TOP)/src/gallium/drivers/rbug/librbug.a \
+	$(GALLIUM_AUXILIARIES)
+
+PIPE_SYS = $(LIBDRM_LIB) -lm -lpthread $(DLOPEN_LIBS)
+
+PIPE_CFLAGS = $(LIBDRM_CFLAGS)
+
+PIPE_LDFLAGS = -Wl,--no-undefined
+
+# i915 pipe driver
+i915_LIBS = \
+	$(TOP)/src/gallium/winsys/i915/drm/libi915drm.a \
+	$(TOP)/src/gallium/drivers/i915/libi915.a
+i915_SYS = -ldrm_intel
+
+# nouveau pipe driver
+nouveau_LIBS = \
+	$(TOP)/src/gallium/winsys/nouveau/drm/libnouveaudrm.a \
+	$(TOP)/src/gallium/drivers/nv30/libnv30.a \
+	$(TOP)/src/gallium/drivers/nv50/libnv50.a \
+	$(TOP)/src/gallium/drivers/nvc0/libnvc0.a \
+	$(TOP)/src/gallium/drivers/nouveau/libnouveau.a
+nouveau_SYS = -ldrm_nouveau
+
+# r300 pipe driver
+r300_LIBS = \
+	$(TOP)/src/gallium/winsys/radeon/drm/libradeonwinsys.a \
+	$(TOP)/src/gallium/drivers/r300/libr300.a
+r300_SYS += -ldrm_radeon
+
+# r600 pipe driver
+r600_LIBS = \
+	$(TOP)/src/gallium/winsys/radeon/drm/libradeonwinsys.a \
+	$(TOP)/src/gallium/drivers/r600/libr600.a
+r600_SYS += -ldrm_radeon
+
+# vmwgfx pipe driver
+vmwgfx_LIBS = \
+	$(TOP)/src/gallium/winsys/svga/drm/libsvgadrm.a \
+	$(TOP)/src/gallium/drivers/svga/libsvga.a
+
+ifneq ($(findstring llvmpipe,$(GALLIUM_DRIVERS_DIRS)),)
+   swrast_LIBS = $(TOP)/src/gallium/drivers/llvmpipe/libllvmpipe.a
+   PIPE_CFLAGS += -DGALLIUM_LLVMPIPE
+else ifneq ($(findstring softpipe,$(GALLIUM_DRIVERS_DIRS)),)
+   swrast_LIBS = $(TOP)/src/gallium/drivers/softpipe/libsoftpipe.a
+   PIPE_CFLAGS += -DGALLIUM_SOFTPIPE
+endif
+
+# LLVM
+ifeq ($(MESA_LLVM),1)
+   PIPE_SYS += $(LLVM_LIBS)
+   PIPE_LDFLAGS += $(LLVM_LDFLAGS)
+endif
+
+# determine the targets/sources
+_PIPE_TARGETS_CC =
+_PIPE_TARGETS_CXX =
+PIPE_SOURCES =
+
+ifneq ($(findstring i915/drm,$(GALLIUM_WINSYS_DIRS)),)
+   _PIPE_TARGETS_CC += $(PIPE_PREFIX)i915.so
+   PIPE_SOURCES += pipe_i915.c
+endif
+
+ifneq ($(findstring nouveau/drm,$(GALLIUM_WINSYS_DIRS)),)
+   _PIPE_TARGETS_CXX += $(PIPE_PREFIX)nouveau.so
+   PIPE_SOURCES += pipe_nouveau.c
+endif
+
+ifneq ($(findstring radeon/drm,$(GALLIUM_WINSYS_DIRS)),)
+ifneq ($(findstring r300,$(GALLIUM_DRIVERS_DIRS)),)
+   _PIPE_TARGETS_CC += $(PIPE_PREFIX)r300.so
+   PIPE_SOURCES += pipe_r300.c
+endif
+endif
+
+ifneq ($(findstring radeon/drm,$(GALLIUM_WINSYS_DIRS)),)
+ifneq ($(findstring r600,$(GALLIUM_DRIVERS_DIRS)),)
+   _PIPE_TARGETS_CC += $(PIPE_PREFIX)r600.so
+   PIPE_SOURCES += pipe_r600.c
+endif
+endif
+
+ifneq ($(findstring svga/drm,$(GALLIUM_WINSYS_DIRS)),)
+   _PIPE_TARGETS_CC += $(PIPE_PREFIX)vmwgfx.so
+   PIPE_SOURCES += pipe_vmwgfx.c
+endif
+
+ifneq ($(filter llvmpipe softpipe,$(GALLIUM_DRIVERS_DIRS)),)
+   _PIPE_TARGETS_CC += $(PIPE_PREFIX)swrast.so
+   PIPE_SOURCES += pipe_swrast.c
+endif
+
+PIPE_OBJECTS := $(PIPE_SOURCES:.c=.o)
+
+ifeq ($(MESA_LLVM),1)
+   PIPE_TARGETS_CXX = $(_PIPE_TARGETS_CXX) $(_PIPE_TARGETS_CC)
+   PIPE_TARGETS_CC =
+else
+   PIPE_TARGETS_CXX = $(_PIPE_TARGETS_CXX)
+   PIPE_TARGETS_CC = $(_PIPE_TARGETS_CC)
+endif
+
+PIPE_TARGETS = $(PIPE_TARGETS_CC) $(PIPE_TARGETS_CXX)
+
+default: depend $(PIPE_TARGETS)
+
+.SECONDEXPANSION:
+
+$(PIPE_TARGETS_CC): $(PIPE_PREFIX)%.so: pipe_%.o $(PIPE_LIBS) $$(%_LIBS)
+	$(MKLIB) -o $@ -noprefix -linker '$(CC)' \
+		-ldflags '-L$(TOP)/$(LIB_DIR) $(PIPE_LDFLAGS) $(LDFLAGS)' \
+		$(MKLIB_OPTIONS) $< \
+		-Wl,--start-group $(PIPE_LIBS) $($*_LIBS) -Wl,--end-group \
+		$(PIPE_SYS) $($*_SYS)
+
+$(PIPE_TARGETS_CXX): $(PIPE_PREFIX)%.so: pipe_%.o $(PIPE_LIBS) $$(%_LIBS)
+	$(MKLIB) -o $@ -noprefix -linker '$(CXX)' \
+		-ldflags '-L$(TOP)/$(LIB_DIR) $(PIPE_LDFLAGS) $(LDFLAGS)' \
+		$(MKLIB_OPTIONS) $< \
+		-Wl,--start-group $(PIPE_LIBS) $($*_LIBS) -Wl,--end-group \
+		$(PIPE_SYS) $($*_SYS)
+
+$(PIPE_OBJECTS): %.o: %.c
+	$(CC) -c -o $@ $< $(PIPE_CPPFLAGS) $(PIPE_CFLAGS) $(CFLAGS)
+
+install: $(PIPE_TARGETS)
+	$(INSTALL) -d $(DESTDIR)$(PIPE_INSTALL_DIR)
+	for tgt in $(PIPE_TARGETS); do \
+		$(MINSTALL) "$$tgt" $(DESTDIR)$(PIPE_INSTALL_DIR); \
+	done
+
+clean:
+	rm -f $(PIPE_TARGETS) $(PIPE_OBJECTS) depend depend.bak
+
+depend: $(PIPE_SOURCES)
+	rm -f depend
+	touch depend
+	$(MKDEP) $(MKDEP_OPTIONS) $(PIPE_CPPFLAGS) $(PIPE_SOURCES) 2>/dev/null
+
+sinclude depend
diff --git a/src/gallium/targets/pipe-loader/pipe_i915.c b/src/gallium/targets/pipe-loader/pipe_i915.c
new file mode 100644
index 00000000000..85662cb85b5
--- /dev/null
+++ b/src/gallium/targets/pipe-loader/pipe_i915.c
@@ -0,0 +1,27 @@
+
+#include "target-helpers/inline_debug_helper.h"
+#include "state_tracker/drm_driver.h"
+#include "i915/drm/i915_drm_public.h"
+#include "i915/i915_public.h"
+
+static struct pipe_screen *
+create_screen(int fd)
+{
+   struct i915_winsys *iws;
+   struct pipe_screen *screen;
+
+   iws = i915_drm_winsys_create(fd); /* winsys built from fd */
+   if (!iws)
+      return NULL;
+
+   screen = i915_screen_create(iws);
+   if (!screen)
+      return NULL; /* NOTE(review): iws not released here -- confirm */
+
+   screen = debug_screen_wrap(screen); /* wrapper result replaces screen */
+
+   return screen;
+}
+
+PUBLIC
+DRM_DRIVER_DESCRIPTOR("i915", "i915", create_screen, NULL)
diff --git a/src/gallium/targets/pipe-loader/pipe_nouveau.c b/src/gallium/targets/pipe-loader/pipe_nouveau.c
new file mode 100644
index 00000000000..65425e8d456
--- /dev/null
+++ b/src/gallium/targets/pipe-loader/pipe_nouveau.c
@@ -0,0 +1,21 @@
+
+#include "target-helpers/inline_debug_helper.h"
+#include "state_tracker/drm_driver.h"
+#include "nouveau/drm/nouveau_drm_public.h"
+
+static struct pipe_screen *
+create_screen(int fd)
+{
+   struct pipe_screen *screen;
+
+   screen = nouveau_drm_screen_create(fd); /* screen built from fd */
+   if (!screen)
+      return NULL; /* creation failed; nothing to wrap */
+
+   screen = debug_screen_wrap(screen); /* wrapper result replaces screen */
+
+   return screen;
+}
+
+PUBLIC
+DRM_DRIVER_DESCRIPTOR("nouveau", "nouveau", create_screen, NULL)
diff --git a/src/gallium/targets/pipe-loader/pipe_r300.c b/src/gallium/targets/pipe-loader/pipe_r300.c
new file mode 100644
index 00000000000..055685996e6
--- /dev/null
+++ b/src/gallium/targets/pipe-loader/pipe_r300.c
@@ -0,0 +1,27 @@
+
+#include "target-helpers/inline_debug_helper.h"
+#include "state_tracker/drm_driver.h"
+#include "radeon/drm/radeon_drm_public.h"
+#include "r300/r300_public.h"
+
+static struct pipe_screen *
+create_screen(int fd)
+{
+   struct radeon_winsys *sws;
+   struct pipe_screen *screen;
+
+   sws = radeon_drm_winsys_create(fd); /* winsys built from fd */
+   if (!sws)
+      return NULL;
+
+   screen = r300_screen_create(sws);
+   if (!screen)
+      return NULL; /* NOTE(review): sws not released here -- confirm */
+
+   screen = debug_screen_wrap(screen); /* wrapper result replaces screen */
+
+   return screen;
+}
+
+PUBLIC
+DRM_DRIVER_DESCRIPTOR("r300", "radeon", create_screen, NULL)
diff --git a/src/gallium/targets/pipe-loader/pipe_r600.c b/src/gallium/targets/pipe-loader/pipe_r600.c
new file mode 100644
index 00000000000..5d89aca6ec3
--- /dev/null
+++ b/src/gallium/targets/pipe-loader/pipe_r600.c
@@ -0,0 +1,26 @@
+#include "state_tracker/drm_driver.h"
+#include "target-helpers/inline_debug_helper.h"
+#include "radeon/drm/radeon_drm_public.h"
+#include "r600/r600_public.h"
+
+static struct pipe_screen *
+create_screen(int fd)
+{
+   struct radeon_winsys *rw;
+   struct pipe_screen *screen;
+
+   rw = radeon_drm_winsys_create(fd); /* winsys built from fd */
+   if (!rw)
+      return NULL;
+
+   screen = r600_screen_create(rw);
+   if (!screen)
+      return NULL; /* NOTE(review): rw not released here -- confirm */
+
+   screen = debug_screen_wrap(screen); /* wrapper result replaces screen */
+
+   return screen;
+}
+
+PUBLIC
+DRM_DRIVER_DESCRIPTOR("r600", "radeon", create_screen, NULL)
diff --git a/src/gallium/targets/pipe-loader/pipe_swrast.c b/src/gallium/targets/pipe-loader/pipe_swrast.c
new file mode 100644
index 00000000000..092abf07a52
--- /dev/null
+++ b/src/gallium/targets/pipe-loader/pipe_swrast.c
@@ -0,0 +1,22 @@
+
+#include "target-helpers/inline_sw_helper.h"
+#include "target-helpers/inline_debug_helper.h"
+#include "state_tracker/drm_driver.h"
+
+PUBLIC struct pipe_screen *
+swrast_create_screen(struct sw_winsys *ws);
+
+PUBLIC
+DRM_DRIVER_DESCRIPTOR("swrast", NULL, NULL, NULL)
+
+struct pipe_screen *
+swrast_create_screen(struct sw_winsys *ws)
+{
+   struct pipe_screen *screen;
+
+   screen = sw_screen_create(ws);
+   if (screen) /* wrap only a screen that was actually created */
+      screen = debug_screen_wrap(screen);
+
+   return screen; /* NULL when sw_screen_create() returned NULL */
+}
diff --git a/src/gallium/targets/pipe-loader/pipe_vmwgfx.c b/src/gallium/targets/pipe-loader/pipe_vmwgfx.c
new file mode 100644
index 00000000000..bfe665be6eb
--- /dev/null
+++ b/src/gallium/targets/pipe-loader/pipe_vmwgfx.c
@@ -0,0 +1,27 @@
+
+#include "target-helpers/inline_debug_helper.h"
+#include "state_tracker/drm_driver.h"
+#include "svga/drm/svga_drm_public.h"
+#include "svga/svga_public.h"
+
+static struct pipe_screen *
+create_screen(int fd)
+{
+   struct svga_winsys_screen *sws;
+   struct pipe_screen *screen;
+
+   sws = svga_drm_winsys_screen_create(fd); /* winsys screen built from fd */
+   if (!sws)
+      return NULL;
+
+   screen = svga_screen_create(sws);
+   if (!screen)
+      return NULL; /* NOTE(review): sws not released here -- confirm */
+
+   screen = debug_screen_wrap(screen); /* wrapper result replaces screen */
+
+   return screen;
+}
+
+PUBLIC
+DRM_DRIVER_DESCRIPTOR("vmwgfx", "vmwgfx", create_screen, NULL)
diff --git a/src/gallium/tests/trivial/Makefile b/src/gallium/tests/trivial/Makefile
index 4ddbb0b73dc..8c032016538 100644
--- a/src/gallium/tests/trivial/Makefile
+++ b/src/gallium/tests/trivial/Makefile
@@ -11,39 +11,39 @@ INCLUDES = \
 	-I$(TOP)/src/gallium/winsys \
 	$(PROG_INCLUDES)
 
-ifeq ($(MESA_LLVM),1)
-LINKS = $(TOP)/src/gallium/drivers/llvmpipe/libllvmpipe.a
-LDFLAGS += $(LLVM_LDFLAGS)
-endif
-
 LINKS += \
-	$(TOP)/src/gallium/drivers/rbug/librbug.a \
-	$(TOP)/src/gallium/drivers/trace/libtrace.a \
-	$(TOP)/src/gallium/drivers/galahad/libgalahad.a \
-	$(TOP)/src/gallium/winsys/sw/null/libws_null.a \
-	$(TOP)/src/gallium/drivers/softpipe/libsoftpipe.a \
+	$(GALLIUM_PIPE_LOADER_LIBS) \
 	$(GALLIUM_AUXILIARIES) \
-	$(PROG_LINKS)
+	$(PROG_LINKS) $(LIBUDEV_LIBS)
 
 SOURCES = \
 	tri.c \
-	quad-tex.c
+	quad-tex.c \
+	compute.c
 
 OBJECTS = $(SOURCES:.c=.o)
 
 PROGS = $(OBJECTS:.o=)
 
-PROG_DEFINES = \
-	-DGALLIUM_SOFTPIPE -DGALLIUM_RBUG -DGALLIUM_TRACE -DGALLIUM_GALAHAD
+PROG_DEFINES = -DPIPE_SEARCH_DIR=\"$(PIPE_SRC_DIR)\" \
+               $(GALLIUM_PIPE_LOADER_DEFINES)
+
+PIPE_SRC_DIR = $(TOP)/src/gallium/targets/pipe-loader
 
 ##### TARGETS #####
 
-default: $(PROGS)
+default: $(PROGS) pipes
+
+install:
 
 clean:
 	-rm -f $(PROGS)
 	-rm -f *.o
 	-rm -f result.bmp
+	@$(MAKE) -C $(PIPE_SRC_DIR) clean
+
+pipes:
+	@$(MAKE) -C $(PIPE_SRC_DIR)
 
 ##### RULES #####
 
diff --git a/src/gallium/tests/trivial/compute.c b/src/gallium/tests/trivial/compute.c
new file mode 100644
index 00000000000..1812090d3a0
--- /dev/null
+++ b/src/gallium/tests/trivial/compute.c
@@ -0,0 +1,1592 @@
+/*
+ * Copyright (C) 2011 Francisco Jerez.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <sys/stat.h>
+#include <inttypes.h>
+#include "pipe/p_state.h"
+#include "pipe/p_context.h"
+#include "pipe/p_screen.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_shader_tokens.h"
+#include "util/u_memory.h"
+#include "util/u_inlines.h"
+#include "util/u_sampler.h"
+#include "util/u_format.h"
+#include "tgsi/tgsi_text.h"
+#include "pipe-loader/pipe_loader.h"
+
+#define MAX_RESOURCES 4
+
+struct context { /* Everything the helpers and tests in this file share. */
+        struct pipe_loader_device *dev;   /* filled in by pipe_loader_probe() */
+        struct pipe_screen *screen;       /* from pipe_loader_create_screen() */
+        struct pipe_context *pipe;        /* from screen->context_create() */
+        void *hwcs;                       /* compute state made by init_prog() */
+        void *hwsmp[MAX_RESOURCES];       /* made by init_sampler_states() */
+        struct pipe_resource *tex[MAX_RESOURCES]; /* init_tex(), indexed by slot */
+        bool tex_rw[MAX_RESOURCES];       /* per slot: rw flag given to init_tex() */
+        struct pipe_sampler_view *view[MAX_RESOURCES]; /* init_sampler_views() */
+        struct pipe_surface *surf[MAX_RESOURCES];      /* init_compute_resources() */
+};
+
+/* Print compute cap `c` of ctx->screen as 64-bit words; `p` is unused. */
+#define DUMP_COMPUTE_PARAM(p, c) do {                                   \
+                uint64_t dcp_v[4] = { 0 };                              \
+                int dcp_i, dcp_n;                                       \
+                dcp_n = ctx->screen->get_compute_param(ctx->screen, c, dcp_v); \
+                printf("%s: {", #c);                                    \
+                /* Signed compare, capped at the scratch array size. */ \
+                for (dcp_i = 0; dcp_i < (int)Elements(dcp_v) &&         \
+                             dcp_i < dcp_n / (int)sizeof(*dcp_v); ++dcp_i) \
+                        printf(" %"PRIu64, dcp_v[dcp_i]);               \
+                printf(" }\n");                                         \
+        } while (0)
+
+/* Bring up the device, screen and context, then print three compute caps. */
+static void init_ctx(struct context *ctx)
+{
+        const int ndev = pipe_loader_probe(&ctx->dev, 1);
+
+        /* Every step is mandatory: a zero/NULL result trips an assert. */
+        assert(ndev);
+        ctx->screen = pipe_loader_create_screen(ctx->dev, PIPE_SEARCH_DIR);
+        assert(ctx->screen);
+        ctx->pipe = ctx->screen->context_create(ctx->screen, NULL);
+        assert(ctx->pipe);
+
+        /* DUMP_COMPUTE_PARAM picks up `ctx` from this scope. */
+        DUMP_COMPUTE_PARAM(p, PIPE_COMPUTE_CAP_GRID_DIMENSION);
+        DUMP_COMPUTE_PARAM(p, PIPE_COMPUTE_CAP_MAX_GRID_SIZE);
+        DUMP_COMPUTE_PARAM(p, PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE);
+}
+
+static void destroy_ctx(struct context *ctx)
+{       /* Tear down what init_ctx() created, then free ctx itself. */
+        ctx->pipe->destroy(ctx->pipe);          /* the context goes first */
+        ctx->screen->destroy(ctx->screen);      /* then the screen it came from */
+        pipe_loader_release(&ctx->dev, 1);      /* one device, as in init_ctx() */
+        FREE(ctx);
+}
+
+/* Pipe the RES macros plus `src` through cpp; caller free()s the result. */
+static char *
+preprocess_prog(struct context *ctx, const char *src, const char *defs)
+{
+        const char header[] =
+                "#define RGLOBAL        RES[32767]\n"
+                "#define RLOCAL         RES[32766]\n"
+                "#define RPRIVATE       RES[32765]\n"
+                "#define RINPUT         RES[32764]\n";
+        char cmd[512], *buf;
+        char tmp[] = "/tmp/test-compute.tgsi-XXXXXX";
+        int fd, ret;
+        struct stat st;
+        FILE *p;
+
+        /* Open a temporary file; `defs` (may be NULL) is passed to cpp as-is */
+        fd = mkstemp(tmp);
+        assert(fd >= 0);
+        snprintf(cmd, sizeof(cmd), "cpp -P -nostdinc -undef %s > %s",
+                 defs ? defs : "", tmp);
+
+        /* Preprocess: cpp reads the pipe and its output is redirected to tmp */
+        p = popen(cmd, "w");
+        assert(p);      /* popen() returns NULL when it cannot start the command */
+        fwrite(header, strlen(header), 1, p);
+        fwrite(src, strlen(src), 1, p);
+        ret = pclose(p);
+        assert(!ret);
+
+        /* Read back through the descriptor mkstemp() returned */
+        ret = fstat(fd, &st);
+        assert(!ret);
+        buf = malloc(st.st_size + 1);
+        assert(buf);    /* do not hand a failed allocation to read() */
+        ret = read(fd, buf, st.st_size);
+        assert(ret == st.st_size);
+        buf[ret] = 0;
+
+        /* Clean up */
+        close(fd);
+        unlink(tmp);
+        return buf;
+}
+
+/* Preprocess and translate `src`, then create and bind the compute state. */
+static void init_prog(struct context *ctx, unsigned local_sz,
+                      unsigned private_sz, unsigned input_sz,
+                      const char *src, const char *defs)
+{
+        struct pipe_context *pipe = ctx->pipe;
+        struct tgsi_token tokens[1024];
+        struct pipe_compute_state state = {
+                .prog = tokens,
+                .req_local_mem = local_sz,
+                .req_private_mem = private_sz,
+                .req_input_mem = input_sz
+        };
+        char *text = preprocess_prog(ctx, src, defs);
+        int ok = tgsi_text_translate(text, tokens, Elements(tokens));
+
+        assert(ok);
+        free(text);     /* the preprocessed text is no longer needed */
+
+        ctx->hwcs = pipe->create_compute_state(pipe, &state);
+        assert(ctx->hwcs);
+
+        pipe->bind_compute_state(pipe, ctx->hwcs);
+}
+
+static void destroy_prog(struct context *ctx)
+{
+        /* Delete the compute state that init_prog() stored in ctx->hwcs. */
+        ctx->pipe->delete_compute_state(ctx->pipe, ctx->hwcs);
+        /* Clear the stale handle. */
+        ctx->hwcs = NULL;
+}
+
+/* Create the resource for `slot` and let init() fill every block of it. */
+static void init_tex(struct context *ctx, int slot,
+                     enum pipe_texture_target target, bool rw,
+                     enum pipe_format format, int w, int h,
+                     void (*init)(void *, int, int, int))
+{
+        struct pipe_context *pipe = ctx->pipe;
+        struct pipe_resource templ = {
+                .target = target,
+                .format = format,
+                .width0 = w,
+                .height0 = h,
+                .depth0 = 1,
+                .array_size = 1,
+                .bind = (PIPE_BIND_SAMPLER_VIEW |
+                         PIPE_BIND_COMPUTE_RESOURCE |
+                         PIPE_BIND_GLOBAL)
+        };
+        struct pipe_box box = { .width = w, .height = h, .depth = 1 };
+        int sx = util_format_get_blocksize(format);     /* step between blocks */
+        int sy = util_format_get_stride(format, w);     /* step between rows */
+        int cols, rows, col, row;
+        struct pipe_transfer *xfer;
+        char *base;
+
+        /* Buffers are walked as a single row of w / sx blocks. */
+        if (target == PIPE_BUFFER) {
+                cols = w / sx;
+                rows = 1;
+        } else {
+                cols = util_format_get_nblocksx(format, w);
+                rows = util_format_get_nblocksy(format, h);
+        }
+
+        ctx->tex[slot] = ctx->screen->resource_create(ctx->screen, &templ);
+        assert(ctx->tex[slot]);
+
+        xfer = pipe->get_transfer(pipe, ctx->tex[slot], 0,
+                                  PIPE_TRANSFER_WRITE, &box);
+        assert(xfer);
+        base = pipe->transfer_map(pipe, xfer);
+        assert(base);
+
+        for (row = 0; row < rows; ++row)
+                for (col = 0; col < cols; ++col)
+                        init(base + row * sy + col * sx, slot, col, row);
+
+        pipe->transfer_unmap(pipe, xfer);
+        pipe->transfer_destroy(pipe, xfer);
+        ctx->tex_rw[slot] = rw;
+}
+
+static bool default_check(void *x, void *y, int sz) {
+        return memcmp(x, y, sz) == 0;   /* true iff the first sz bytes match */
+}
+
+static void check_tex(struct context *ctx, int slot,
+                      void (*expect)(void *, int, int, int),
+                      bool (*check)(void *, void *, int))
+{       /* Map slot's resource and compare every block against expect(). */
+        struct pipe_context *pipe = ctx->pipe;
+        struct pipe_resource *tex = ctx->tex[slot];
+        int dx = util_format_get_blocksize(tex->format);
+        int dy = util_format_get_stride(tex->format, tex->width0);
+        int nx = (tex->target == PIPE_BUFFER ? (tex->width0 / dx) :
+                  util_format_get_nblocksx(tex->format, tex->width0));
+        int ny = (tex->target == PIPE_BUFFER ? 1 :
+                  util_format_get_nblocksy(tex->format, tex->height0));
+        struct pipe_transfer *xfer;
+        char *map;
+        int x, y, i;
+        int err = 0;
+
+        if (!check)     /* NULL selects the memcmp-based default_check() */
+                check = default_check;
+
+        xfer = pipe->get_transfer(pipe, tex, 0, PIPE_TRANSFER_READ,
+                                  &(struct pipe_box) { .width = tex->width0,
+                                        .height = tex->height0,
+                                        .depth = 1 });
+        assert(xfer);
+
+        map = pipe->transfer_map(pipe, xfer);
+        assert(map);
+
+        for (y = 0; y < ny; ++y) {
+                for (x = 0; x < nx; ++x) {
+                        uint32_t exp[4];
+                        uint32_t *res = (uint32_t *)(map + y * dy + x * dx);
+
+                        expect(exp, slot, x, y);
+                        /* err counts every mismatch; only the first 20 are printed */
+                        if (check(res, exp, dx) || (++err) > 20)
+                                continue;
+
+                        if (dx < 4) {   /* sub-word block: pack its bytes into one value */
+                                uint32_t u = 0, v = 0;
+
+                                for (i = 0; i < dx; i++) {
+                                        u |= ((uint8_t *)exp)[i] << (8 * i);
+                                        v |= ((uint8_t *)res)[i] << (8 * i);
+                                }
+                                printf("(%d, %d): got 0x%x, expected 0x%x\n",
+                                       x, y, v, u);
+                        } else {        /* print each 32-bit word as hex and as float */
+                                for (i = 0; i < dx / 4; i++) {
+                                        printf("(%d, %d)[%d]: got 0x%x/%f,"
+                                               " expected 0x%x/%f\n", x, y, i,
+                                               res[i], ((float *)res)[i],
+                                               exp[i], ((float *)exp)[i]);
+                                }
+                        }
+                }
+        }
+
+        pipe->transfer_unmap(pipe, xfer);
+        pipe->transfer_destroy(pipe, xfer);
+
+        if (err)        /* x and y are whatever the loops above left them at */
+                printf("(%d, %d): \x1b[31mFAIL\x1b[0m (%d)\n", x, y, err);
+        else
+                printf("(%d, %d): \x1b[32mOK\x1b[0m\n", x, y);
+}
+
+static void destroy_tex(struct context *ctx)
+{
+        struct pipe_resource **slot;
+
+        /* Re-point every populated slot at NULL via pipe_resource_reference(). */
+        for (slot = ctx->tex; slot < ctx->tex + MAX_RESOURCES; ++slot)
+                if (*slot)
+                        pipe_resource_reference(slot, NULL);
+}
+
+/* Create a default view for each listed slot and bind them from index 0. */
+static void init_sampler_views(struct context *ctx, const int *slots)
+{
+        struct pipe_context *pipe = ctx->pipe;
+        struct pipe_sampler_view templ;
+        int n;
+
+        for (n = 0; slots[n] >= 0; ++n) {       /* a negative entry ends the list */
+                struct pipe_resource *res = ctx->tex[slots[n]];
+
+                u_sampler_view_default_template(&templ, res, res->format);
+                ctx->view[n] = pipe->create_sampler_view(pipe, res, &templ);
+                assert(ctx->view[n]);
+        }
+
+        pipe->set_compute_sampler_views(pipe, 0, n, ctx->view);
+}
+
+/* Unbind all compute sampler views, then destroy the ones that exist. */
+static void destroy_sampler_views(struct context *ctx)
+{
+        struct pipe_context *pipe = ctx->pipe;
+        int i = 0;
+
+        pipe->set_compute_sampler_views(pipe, 0, MAX_RESOURCES, NULL);
+        for (; i < MAX_RESOURCES; ++i) {
+                if (!ctx->view[i])
+                        continue;
+                pipe->sampler_view_destroy(pipe, ctx->view[i]);
+                ctx->view[i] = NULL;
+        }
+}
+
+/* Create a surface for each listed slot and bind them from index 0. */
+static void init_compute_resources(struct context *ctx, const int *slots)
+{
+        struct pipe_context *pipe = ctx->pipe;
+        int n;
+
+        for (n = 0; slots[n] >= 0; ++n) {       /* a negative entry ends the list */
+                struct pipe_resource *res = ctx->tex[slots[n]];
+                struct pipe_surface templ = {
+                        .format = res->format,
+                        .usage = res->bind,
+                        .writable = ctx->tex_rw[slots[n]]
+                };
+
+                if (res->target == PIPE_BUFFER) /* buffers also get a last element */
+                        templ.u.buf.last_element = res->width0 - 1;
+                ctx->surf[n] = pipe->create_surface(pipe, res, &templ);
+                assert(ctx->surf[n]);
+        }
+
+        pipe->set_compute_resources(pipe, 0, n, ctx->surf);
+}
+
+/* Unbind all compute resources, then destroy the surfaces that exist. */
+static void destroy_compute_resources(struct context *ctx)
+{
+        struct pipe_context *pipe = ctx->pipe;
+        struct pipe_surface **s = ctx->surf;
+
+        pipe->set_compute_resources(pipe, 0, MAX_RESOURCES, NULL);
+        for (; s != ctx->surf + MAX_RESOURCES; ++s) {
+                if (*s == NULL)
+                        continue;
+                pipe->surface_destroy(pipe, *s);
+                *s = NULL;
+        }
+}
+
+/* Create `n` sampler states from one template and bind them from index 0. */
+static void init_sampler_states(struct context *ctx, int n)
+{
+        struct pipe_context *pipe = ctx->pipe;
+        struct pipe_sampler_state templ = { .normalized_coords = 1 };
+        int count = 0;
+
+        while (count < n) {
+                ctx->hwsmp[count] = pipe->create_sampler_state(pipe, &templ);
+                assert(ctx->hwsmp[count]);
+                ++count;
+        }
+
+        pipe->bind_compute_sampler_states(pipe, 0, count, ctx->hwsmp);
+}
+
+/* Unbind all compute sampler states, then delete the ones that exist. */
+static void destroy_sampler_states(struct context *ctx)
+{
+        struct pipe_context *pipe = ctx->pipe;
+        int i;
+
+        pipe->bind_compute_sampler_states(pipe, 0, MAX_RESOURCES, NULL);
+        for (i = 0; i != MAX_RESOURCES; i++) {
+                void *state = ctx->hwsmp[i];
+                if (state)
+                        pipe->delete_sampler_state(pipe, state);
+                ctx->hwsmp[i] = NULL;
+        }
+}
+
+/* Bind the listed slots' resources (a negative entry ends the list). */
+static void init_globals(struct context *ctx, const int *slots,
+                         uint32_t **handles)
+{
+        struct pipe_resource *bound[MAX_RESOURCES];
+        int n;
+
+        for (n = 0; slots[n] >= 0; ++n)
+                bound[n] = ctx->tex[slots[n]];
+
+        ctx->pipe->set_global_binding(ctx->pipe, 0, n, bound, handles);
+}
+
+static void destroy_globals(struct context *ctx)
+{
+        /* Pass NULL arrays for all MAX_RESOURCES global binding slots. */
+        ctx->pipe->set_global_binding(ctx->pipe, 0, MAX_RESOURCES,
+                                      NULL, NULL);
+}
+
+static void launch_grid(struct context *ctx, const uint *block_layout,
+                        const uint *grid_layout, uint32_t pc,
+                        const void *input)
+{
+        /* Thin forwarder to the context's own launch_grid() hook. */
+        ctx->pipe->launch_grid(ctx->pipe, block_layout, grid_layout,
+                               pc, input);
+}
+
+static void test_system_values(struct context *ctx)
+{       /* Each thread stores SV[0..3] to the buffer; check_tex() verifies them. */
+        const char *src = "COMP\n"
+                "DCL RES[0], BUFFER, RAW, WR\n"
+                "DCL SV[0], BLOCK_ID[0]\n"
+                "DCL SV[1], BLOCK_SIZE[0]\n"
+                "DCL SV[2], GRID_SIZE[0]\n"
+                "DCL SV[3], THREAD_ID[0]\n"
+                "DCL TEMP[0], LOCAL\n"
+                "DCL TEMP[1], LOCAL\n"
+                "IMM UINT32 { 64, 0, 0, 0 }\n"
+                "IMM UINT32 { 16, 0, 0, 0 }\n"
+                "IMM UINT32 { 0, 0, 0, 0 }\n"
+                "\n"
+                "BGNSUB\n"      /* newline added: one instruction per text line */
+                "  UMUL TEMP[0], SV[0], SV[1]\n"
+                "  UADD TEMP[0], TEMP[0], SV[3]\n"
+                "  UMUL TEMP[1], SV[1], SV[2]\n"
+                "  UMUL TEMP[0].w, TEMP[0], TEMP[1].zzzz\n"
+                "  UMUL TEMP[0].zw, TEMP[0], TEMP[1].yyyy\n"
+                "  UMUL TEMP[0].yzw, TEMP[0], TEMP[1].xxxx\n"
+                "  UADD TEMP[0].xy, TEMP[0].xyxy, TEMP[0].zwzw\n"
+                "  UADD TEMP[0].x, TEMP[0].xxxx, TEMP[0].yyyy\n"
+                "  UMUL TEMP[0].x, TEMP[0], IMM[0]\n"
+                "  STORE RES[0].xyzw, TEMP[0], SV[0]\n"
+                "  UADD TEMP[0].x, TEMP[0], IMM[1]\n"
+                "  STORE RES[0].xyzw, TEMP[0], SV[1]\n"
+                "  UADD TEMP[0].x, TEMP[0], IMM[1]\n"
+                "  STORE RES[0].xyzw, TEMP[0], SV[2]\n"
+                "  UADD TEMP[0].x, TEMP[0], IMM[1]\n"
+                "  STORE RES[0].xyzw, TEMP[0], SV[3]\n"
+                "  RET\n"
+                "ENDSUB\n";
+        void init(void *p, int s, int x, int y) {
+                *(uint32_t *)p = 0xdeadbeef;    /* poison value for every word */
+        }
+        void expect(void *p, int s, int x, int y) {
+                /* 16 words per thread: word x is component c of SV[sv] for thread id */
+                int id = x / 16, sv = (x % 16) / 4, c = x % 4;
+                int tid[] = { id % 20, (id % 240) / 20, id / 240, 0 };
+                int bsz[] = { 4, 3, 5, 1};      /* block layout passed to launch_grid() */
+                int gsz[] = { 5, 4, 1, 1};      /* grid layout passed to launch_grid() */
+
+                switch (sv) {
+                case 0:
+                        *(uint32_t *)p = tid[c] / bsz[c];
+                        break;
+                case 1:
+                        *(uint32_t *)p = bsz[c];
+                        break;
+                case 2:
+                        *(uint32_t *)p = gsz[c];
+                        break;
+                case 3:
+                        *(uint32_t *)p = tid[c] % bsz[c];
+                        break;
+                }
+        }
+
+        printf("- %s\n", __func__);
+
+        init_prog(ctx, 0, 0, 0, src, NULL);
+        init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT,
+                 76800, 0, init);
+        init_compute_resources(ctx, (int []) { 0, -1 });
+        launch_grid(ctx, (uint []){4, 3, 5}, (uint []){5, 4, 1}, 0, NULL);
+        check_tex(ctx, 0, expect, NULL);
+        destroy_compute_resources(ctx);
+        destroy_tex(ctx);
+        destroy_prog(ctx);
+}
+
+static void test_resource_access(struct context *ctx)
+{       /* Shader loads from the buffer (slot 0) and stores into the 2D texture (slot 1). */
+        const char *src = "COMP\n"
+                "DCL RES[0], BUFFER, RAW, WR\n"
+                "DCL RES[1], 2D, RAW, WR\n"
+                "DCL SV[0], BLOCK_ID[0]\n"
+                "DCL TEMP[0], LOCAL\n"
+                "DCL TEMP[1], LOCAL\n"
+                "IMM UINT32 { 15, 0, 0, 0 }\n"
+                "IMM UINT32 { 16, 1, 0, 0 }\n"
+                "\n"
+                "    BGNSUB\n"
+                "       UADD TEMP[0].x, SV[0].xxxx, SV[0].yyyy\n"
+                "       AND TEMP[0].x, TEMP[0], IMM[0]\n"
+                "       UMUL TEMP[0].x, TEMP[0], IMM[1]\n"
+                "       LOAD TEMP[0].xyzw, RES[0], TEMP[0]\n"
+                "       UMUL TEMP[1], SV[0], IMM[1]\n"
+                "       STORE RES[1].xyzw, TEMP[1], TEMP[0]\n"
+                "       RET\n"
+                "    ENDSUB\n";
+        void init0(void *p, int s, int x, int y) {      /* source buffer contents */
+                *(float *)p = 8.0 - (float)x;
+        }
+        void init1(void *p, int s, int x, int y) {      /* poison for the destination */
+                *(uint32_t *)p = 0xdeadbeef;
+        }
+        void expect(void *p, int s, int x, int y) {     /* init0() value at index (x + 4*y) & 0x3f */
+                *(float *)p = 8.0 - (float)((x + 4*y) & 0x3f);
+        }
+
+        printf("- %s\n", __func__);
+
+        init_prog(ctx, 0, 0, 0, src, NULL);
+        init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT,
+                 256, 0, init0);
+        init_tex(ctx, 1, PIPE_TEXTURE_2D, true, PIPE_FORMAT_R32_FLOAT,
+                 60, 12, init1);
+        init_compute_resources(ctx, (int []) { 0, 1, -1 });
+        launch_grid(ctx, (uint []){1, 1, 1}, (uint []){15, 12, 1}, 0, NULL);
+        check_tex(ctx, 1, expect, NULL);        /* only the texture is verified */
+        destroy_compute_resources(ctx);
+        destroy_tex(ctx);
+        destroy_prog(ctx);
+}
+
+static void test_function_calls(struct context *ctx)
+{       /* Shader with three subroutines that call each other through CAL. */
+        const char *src = "COMP\n"
+                "DCL RES[0], 2D, RAW, WR\n"
+                "DCL SV[0], BLOCK_ID[0]\n"
+                "DCL SV[1], BLOCK_SIZE[0]\n"
+                "DCL SV[2], GRID_SIZE[0]\n"
+                "DCL SV[3], THREAD_ID[0]\n"
+                "DCL TEMP[0]\n"
+                "DCL TEMP[1]\n"
+                "DCL TEMP[2], LOCAL\n"
+                "IMM UINT32 { 0, 11, 22, 33 }\n"
+                "IMM FLT32 { 11, 33, 55, 99 }\n"
+                "IMM UINT32 { 4, 1, 0, 0 }\n"
+                "IMM UINT32 { 12, 0, 0, 0 }\n"
+                "\n"
+                "00: BGNSUB\n"
+                "01:  UMUL TEMP[0].x, TEMP[0], TEMP[0]\n"
+                "02:  UADD TEMP[1].x, TEMP[1], IMM[2].yyyy\n"
+                "03:  USLT TEMP[0].x, TEMP[0], IMM[0]\n"
+                "04:  RET\n"
+                "05: ENDSUB\n"
+                "06: BGNSUB\n"
+                "07:  UMUL TEMP[0].x, TEMP[0], TEMP[0]\n"
+                "08:  UADD TEMP[1].x, TEMP[1], IMM[2].yyyy\n"
+                "09:  USLT TEMP[0].x, TEMP[0], IMM[0].yyyy\n"
+                "10:  IF TEMP[0].xxxx\n"
+                "11:   CAL :0\n"
+                "12:  ENDIF\n"
+                "13:  RET\n"
+                "14: ENDSUB\n"
+                "15: BGNSUB\n"
+                "16:  UMUL TEMP[2], SV[0], SV[1]\n"
+                "17:  UADD TEMP[2], TEMP[2], SV[3]\n"
+                "18:  UMUL TEMP[2], TEMP[2], IMM[2]\n"
+                "00:  MOV TEMP[1].x, IMM[2].wwww\n"     /* the "00" label text repeats here */
+                "19:  LOAD TEMP[0].x, RES[0].xxxx, TEMP[2]\n"
+                "20:  CAL :6\n"
+                "21:  STORE RES[0].x, TEMP[2], TEMP[1].xxxx\n"
+                "22:  RET\n"
+                "23: ENDSUB\n";
+        void init(void *p, int s, int x, int y) {       /* linear index over the 15-wide texture */
+                *(uint32_t *)p = 15 * y + x;
+        }
+        void expect(void *p, int s, int x, int y) {     /* 2 for the first four indices, else 1 */
+                *(uint32_t *)p = (15 * y + x) < 4 ? 2 : 1 ;
+        }
+
+        printf("- %s\n", __func__);
+
+        init_prog(ctx, 0, 0, 0, src, NULL);
+        init_tex(ctx, 0, PIPE_TEXTURE_2D, true, PIPE_FORMAT_R32_FLOAT,
+                 15, 12, init);
+        init_compute_resources(ctx, (int []) { 0, -1 });
+        /* NOTE(review): pc = 15 presumably selects the BGNSUB labelled 15 -- confirm */
+        launch_grid(ctx, (uint []){3, 3, 3}, (uint []){5, 4, 1}, 15, NULL);
+        check_tex(ctx, 0, expect, NULL);
+        destroy_compute_resources(ctx);
+        destroy_tex(ctx);
+        destroy_prog(ctx);
+}
+
+static void test_input_global(struct context *ctx)
+{       /* Shader reads RINPUT and reads/writes RGLOBAL; four buffers are bound as globals. */
+        const char *src = "COMP\n"
+                "DCL SV[0], THREAD_ID[0]\n"
+                "DCL TEMP[0], LOCAL\n"
+                "DCL TEMP[1], LOCAL\n"
+                "IMM UINT32 { 8, 0, 0, 0 }\n"
+                "\n"
+                "    BGNSUB\n"
+                "       UMUL TEMP[0], SV[0], IMM[0]\n"
+                "       LOAD TEMP[1].xy, RINPUT, TEMP[0]\n"
+                "       LOAD TEMP[0].x, RGLOBAL, TEMP[1].yyyy\n"
+                "       UADD TEMP[1].x, TEMP[0], -TEMP[1]\n"
+                "       STORE RGLOBAL.x, TEMP[1].yyyy, TEMP[1]\n"
+                "       RET\n"
+                "    ENDSUB\n";
+        void init(void *p, int s, int x, int y) {       /* poison value for every word */
+                *(uint32_t *)p = 0xdeadbeef;
+        }
+        void expect(void *p, int s, int x, int y) {     /* only word 0 of each slot changes */
+                *(uint32_t *)p = 0xdeadbeef - (x == 0 ? 0x10001 + 2 * s : 0);
+        }
+        uint32_t input[8] = { 0x10001, 0x10002, 0x10003, 0x10004,
+                              0x10005, 0x10006, 0x10007, 0x10008 };
+
+        printf("- %s\n", __func__);
+
+        init_prog(ctx, 0, 0, 32, src, NULL);    /* 32 == sizeof(input) */
+        init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT, 32, 0, init);
+        init_tex(ctx, 1, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT, 32, 0, init);
+        init_tex(ctx, 2, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT, 32, 0, init);
+        init_tex(ctx, 3, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT, 32, 0, init);
+        /* NOTE(review): the odd input[] words are presumably overwritten by set_global_binding() -- confirm */
+        init_globals(ctx, (int []){ 0, 1, 2, 3, -1 },
+                     (uint32_t *[]){ &input[1], &input[3],
+                                     &input[5], &input[7] });
+        launch_grid(ctx, (uint []){4, 1, 1}, (uint []){1, 1, 1}, 0, input);
+        check_tex(ctx, 0, expect, NULL);
+        check_tex(ctx, 1, expect, NULL);
+        check_tex(ctx, 2, expect, NULL);
+        check_tex(ctx, 3, expect, NULL);
+        destroy_globals(ctx);
+        destroy_tex(ctx);
+        destroy_prog(ctx);
+}
+
+static void test_private(struct context *ctx)
+{       /* Each thread fills RPRIVATE in one loop and copies it to RES[0] in a second. */
+        const char *src = "COMP\n"
+                "DCL RES[0], BUFFER, RAW, WR\n"
+                "DCL SV[0], BLOCK_ID[0]\n"
+                "DCL SV[1], BLOCK_SIZE[0]\n"
+                "DCL SV[2], THREAD_ID[0]\n"
+                "DCL TEMP[0], LOCAL\n"
+                "DCL TEMP[1], LOCAL\n"
+                "DCL TEMP[2], LOCAL\n"
+                "IMM UINT32 { 128, 0, 0, 0 }\n"
+                "IMM UINT32 { 4, 0, 0, 0 }\n"
+                "\n"
+                "    BGNSUB\n"
+                "       UMUL TEMP[0].x, SV[0], SV[1]\n"
+                "       UADD TEMP[0].x, TEMP[0], SV[2]\n"
+                "       MOV TEMP[1].x, IMM[0].wwww\n"
+                "       BGNLOOP\n"
+                "               USEQ TEMP[2].x, TEMP[1], IMM[0]\n"
+                "               IF TEMP[2]\n"
+                "                       BRK\n"
+                "               ENDIF\n"
+                "               UDIV TEMP[2].x, TEMP[1], IMM[1]\n"
+                "               UADD TEMP[2].x, TEMP[2], TEMP[0]\n"
+                "               STORE RPRIVATE.x, TEMP[1], TEMP[2]\n"
+                "               UADD TEMP[1].x, TEMP[1], IMM[1]\n"
+                "       ENDLOOP\n"
+                "       MOV TEMP[1].x, IMM[0].wwww\n"
+                "       UMUL TEMP[0].x, TEMP[0], IMM[0]\n"
+                "       BGNLOOP\n"
+                "               USEQ TEMP[2].x, TEMP[1], IMM[0]\n"
+                "               IF TEMP[2]\n"
+                "                       BRK\n"
+                "               ENDIF\n"
+                "               LOAD TEMP[2].x, RPRIVATE, TEMP[1]\n"
+                "               STORE RES[0].x, TEMP[0], TEMP[2]\n"
+                "               UADD TEMP[0].x, TEMP[0], IMM[1]\n"
+                "               UADD TEMP[1].x, TEMP[1], IMM[1]\n"
+                "       ENDLOOP\n"
+                "       RET\n"
+                "    ENDSUB\n";
+        void init(void *p, int s, int x, int y) {       /* poison value for every word */
+                *(uint32_t *)p = 0xdeadbeef;
+        }
+        void expect(void *p, int s, int x, int y) {     /* 32 words per thread */
+                *(uint32_t *)p = (x / 32) + x % 32;
+        }
+
+        printf("- %s\n", __func__);
+
+        init_prog(ctx, 0, 128, 0, src, NULL);   /* private_sz = 128 */
+        init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT,
+                 32768, 0, init);
+        init_compute_resources(ctx, (int []) { 0, -1 });
+        launch_grid(ctx, (uint []){16, 1, 1}, (uint []){16, 1, 1}, 0, NULL);
+        check_tex(ctx, 0, expect, NULL);
+        destroy_compute_resources(ctx);
+        destroy_tex(ctx);
+        destroy_prog(ctx);
+}
+
+static void test_local(struct context *ctx)
+{       /* Threads exchange values through RLOCAL, polling in loops after each MFENCE. */
+        const char *src = "COMP\n"
+                "DCL RES[0], BUFFER, RAW, WR\n"
+                "DCL SV[0], BLOCK_ID[0]\n"
+                "DCL SV[1], BLOCK_SIZE[0]\n"
+                "DCL SV[2], THREAD_ID[0]\n"
+                "DCL TEMP[0], LOCAL\n"
+                "DCL TEMP[1], LOCAL\n"
+                "DCL TEMP[2], LOCAL\n"
+                "IMM UINT32 { 1, 0, 0, 0 }\n"
+                "IMM UINT32 { 2, 0, 0, 0 }\n"
+                "IMM UINT32 { 4, 0, 0, 0 }\n"
+                "IMM UINT32 { 32, 0, 0, 0 }\n"
+                "IMM UINT32 { 128, 0, 0, 0 }\n"
+                "\n"
+                "    BGNSUB\n"
+                "       UMUL TEMP[0].x, SV[2], IMM[2]\n"
+                "       STORE RLOCAL.x, TEMP[0], IMM[0].wwww\n"
+                "       MFENCE RLOCAL\n"
+                "       USLT TEMP[1].x, SV[2], IMM[3]\n"
+                "       IF TEMP[1]\n"
+                "               UADD TEMP[1].x, TEMP[0], IMM[4]\n"
+                "               BGNLOOP\n"
+                "                       LOAD TEMP[2].x, RLOCAL, TEMP[1]\n"
+                "                       USEQ TEMP[2].x, TEMP[2], IMM[0]\n"
+                "                       IF TEMP[2]\n"
+                "                               BRK\n"
+                "                       ENDIF\n"
+                "               ENDLOOP\n"
+                "               STORE RLOCAL.x, TEMP[0], IMM[0]\n"
+                "               MFENCE RLOCAL\n"
+                "               BGNLOOP\n"
+                "                       LOAD TEMP[2].x, RLOCAL, TEMP[1]\n"
+                "                       USEQ TEMP[2].x, TEMP[2], IMM[1]\n"
+                "                       IF TEMP[2]\n"
+                "                               BRK\n"
+                "                       ENDIF\n"
+                "               ENDLOOP\n"
+                "       ELSE\n"
+                "               UADD TEMP[1].x, TEMP[0], -IMM[4]\n"
+                "               BGNLOOP\n"
+                "                       LOAD TEMP[2].x, RLOCAL, TEMP[1]\n"
+                "                       USEQ TEMP[2].x, TEMP[2], IMM[0].wwww\n"
+                "                       IF TEMP[2]\n"
+                "                               BRK\n"
+                "                       ENDIF\n"
+                "               ENDLOOP\n"
+                "               STORE RLOCAL.x, TEMP[0], IMM[0]\n"
+                "               MFENCE RLOCAL\n"
+                "               BGNLOOP\n"
+                "                       LOAD TEMP[2].x, RLOCAL, TEMP[1]\n"
+                "                       USEQ TEMP[2].x, TEMP[2], IMM[0]\n"
+                "                       IF TEMP[2]\n"
+                "                               BRK\n"
+                "                       ENDIF\n"
+                "               ENDLOOP\n"
+                "               STORE RLOCAL.x, TEMP[0], IMM[1]\n"
+                "               MFENCE RLOCAL\n"
+                "       ENDIF\n"
+                "       UMUL TEMP[1].x, SV[0], SV[1]\n"
+                "       UMUL TEMP[1].x, TEMP[1], IMM[2]\n"
+                "       UADD TEMP[1].x, TEMP[1], TEMP[0]\n"
+                "       LOAD TEMP[0].x, RLOCAL, TEMP[0]\n"
+                "       STORE RES[0].x, TEMP[1], TEMP[0]\n"
+                "       RET\n"
+                "    ENDSUB\n";
+        void init(void *p, int s, int x, int y) {       /* poison value for every word */
+                *(uint32_t *)p = 0xdeadbeef;
+        }
+        void expect(void *p, int s, int x, int y) {     /* 2 where bit 5 of x is set, else 1 */
+                *(uint32_t *)p = x & 0x20 ? 2 : 1;
+        }
+
+        printf("- %s\n", __func__);
+
+        init_prog(ctx, 256, 0, 0, src, NULL);   /* local_sz = 256 */
+        init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT,
+                 4096, 0, init);
+        init_compute_resources(ctx, (int []) { 0, -1 });
+        launch_grid(ctx, (uint []){64, 1, 1}, (uint []){16, 1, 1}, 0, NULL);
+        check_tex(ctx, 0, expect, NULL);
+        destroy_compute_resources(ctx);
+        destroy_tex(ctx);
+        destroy_prog(ctx);
+}
+
+static void test_sample(struct context *ctx)
+{       /* Kernel SAMPLEs SVIEW[0] at a per-block coordinate and STOREs the result to RES[0]. */
+        const char *src = "COMP\n"
+                "DCL SVIEW[0], 2D, FLOAT\n"
+                "DCL RES[0], 2D, RAW, WR\n"
+                "DCL SAMP[0]\n"
+                "DCL SV[0], BLOCK_ID[0]\n"
+                "DCL TEMP[0], LOCAL\n"
+                "DCL TEMP[1], LOCAL\n"
+                "IMM UINT32 { 16, 1, 0, 0 }\n"
+                "IMM FLT32 { 128, 32, 0, 0 }\n"
+                "\n"
+                "    BGNSUB\n"
+                "       I2F TEMP[1], SV[0]\n"  /* block id converted to float */
+                "       DIV TEMP[1], TEMP[1], IMM[1]\n"  /* divided by (128, 32), the size given to texture 0 below */
+                "       SAMPLE TEMP[1], TEMP[1], SVIEW[0], SAMP[0]\n"
+                "       UMUL TEMP[0], SV[0], IMM[0]\n"  /* store address: block id scaled by (16, 1) */
+                "       STORE RES[0].xyzw, TEMP[0], TEMP[1]\n"
+                "       RET\n"
+                "    ENDSUB\n";
+        void init(void *p, int s, int x, int y) {  /* fill callback handed to init_tex() */
+                *(float *)p = s ? 1 : x * y;  /* s == 0: x * y; any other s: 1 */
+        }
+        void expect(void *p, int s, int x, int y) {  /* expected-value callback handed to check_tex() */
+                switch (x % 4) {  /* output is 4x wider than the input: x / 4 is the source column */
+                case 0:
+                        *(float *)p = x / 4 * y;  /* the value init() wrote for s == 0 */
+                        break;
+                case 1:
+                case 2:
+                        *(float *)p = 0;
+                        break;
+                case 3:
+                        *(float *)p = 1;
+                        break;
+                }
+        }
+
+        printf("- %s\n", __func__);
+
+        init_prog(ctx, 0, 0, 0, src, NULL);
+        init_tex(ctx, 0, PIPE_TEXTURE_2D, true, PIPE_FORMAT_R32_FLOAT,
+                 128, 32, init);  /* texture 0: 128x32 */
+        init_tex(ctx, 1, PIPE_TEXTURE_2D, true, PIPE_FORMAT_R32_FLOAT,
+                 512, 32, init);  /* texture 1: 512x32 */
+        init_compute_resources(ctx, (int []) { 1, -1 });  /* presumably binds texture 1 as RES[0]; list ends at -1 - confirm */
+        init_sampler_views(ctx, (int []) { 0, -1 });  /* presumably binds texture 0 as SVIEW[0] - confirm */
+        init_sampler_states(ctx, 2);
+        launch_grid(ctx, (uint []){1, 1, 1}, (uint []){128, 32, 1}, 0, NULL);  /* NOTE(review): presumably block size, then grid size - confirm */
+        check_tex(ctx, 1, expect, NULL);  /* compares texture 1 against expect() */
+        destroy_sampler_states(ctx);
+        destroy_sampler_views(ctx);
+        destroy_compute_resources(ctx);
+        destroy_tex(ctx);
+        destroy_prog(ctx);
+}
+
+static void test_many_kern(struct context *ctx)
+{       /* One program holding four subroutines; each is launched separately and stores one word. */
+        const char *src = "COMP\n"
+                "DCL RES[0], BUFFER, RAW, WR\n"
+                "DCL TEMP[0], LOCAL\n"
+                "IMM UINT32 { 0, 1, 2, 3 }\n"  /* the value each subroutine stores */
+                "IMM UINT32 { 4, 0, 0, 0 }\n"  /* multiplier turning that value into a byte offset */
+                "\n"
+                "    BGNSUB\n"
+                "       UMUL TEMP[0].x, IMM[0].xxxx, IMM[1].xxxx\n"  /* offset 0 * 4 */
+                "       STORE RES[0].x, TEMP[0], IMM[0].xxxx\n"  /* stores 0 */
+                "       RET\n"
+                "    ENDSUB\n"
+                "    BGNSUB\n"
+                "       UMUL TEMP[0].x, IMM[0].yyyy, IMM[1].xxxx\n"  /* offset 1 * 4 */
+                "       STORE RES[0].x, TEMP[0], IMM[0].yyyy\n"  /* stores 1 */
+                "       RET\n"
+                "    ENDSUB\n"
+                "    BGNSUB\n"
+                "       UMUL TEMP[0].x, IMM[0].zzzz, IMM[1].xxxx\n"  /* offset 2 * 4 */
+                "       STORE RES[0].x, TEMP[0], IMM[0].zzzz\n"  /* stores 2 */
+                "       RET\n"
+                "    ENDSUB\n"
+                "    BGNSUB\n"
+                "       UMUL TEMP[0].x, IMM[0].wwww, IMM[1].xxxx\n"  /* offset 3 * 4 */
+                "       STORE RES[0].x, TEMP[0], IMM[0].wwww\n"  /* stores 3 */
+                "       RET\n"
+                "    ENDSUB\n";
+        void init(void *p, int s, int x, int y) {
+                *(uint32_t *)p = 0xdeadbeef;  /* poison value, replaced by every successful store */
+        }
+        void expect(void *p, int s, int x, int y) {
+                *(uint32_t *)p = x;  /* word x must hold x */
+        }
+
+        printf("- %s\n", __func__);
+
+        init_prog(ctx, 0, 0, 0, src, NULL);
+        init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT,
+                 16, 0, init);  /* 16 bytes: four 32-bit words */
+        init_compute_resources(ctx, (int []) { 0, -1 });
+        launch_grid(ctx, (uint []){1, 1, 1}, (uint []){1, 1, 1}, 0, NULL);  /* 4th argument steps by 5, the instruction count of one BGNSUB..ENDSUB body; */
+        launch_grid(ctx, (uint []){1, 1, 1}, (uint []){1, 1, 1}, 5, NULL);  /* presumably the entry point of each subroutine - confirm in launch_grid() */
+        launch_grid(ctx, (uint []){1, 1, 1}, (uint []){1, 1, 1}, 10, NULL);
+        launch_grid(ctx, (uint []){1, 1, 1}, (uint []){1, 1, 1}, 15, NULL);
+        check_tex(ctx, 0, expect, NULL);
+        destroy_compute_resources(ctx);
+        destroy_tex(ctx);
+        destroy_prog(ctx);
+}
+
+static void test_constant(struct context *ctx)
+{       /* Kernel copies one word per block from RES[0] (declared without WR) to RES[1]. */
+        const char *src = "COMP\n"
+                "DCL RES[0], BUFFER, RAW\n"
+                "DCL RES[1], BUFFER, RAW, WR\n"
+                "DCL SV[0], BLOCK_ID[0]\n"
+                "DCL TEMP[0], LOCAL\n"
+                "DCL TEMP[1], LOCAL\n"
+                "IMM UINT32 { 4, 0, 0, 0 }\n"
+                "\n"
+                "    BGNSUB\n"
+                "       UMUL TEMP[0].x, SV[0], IMM[0]\n"  /* byte offset = block id * 4 */
+                "       LOAD TEMP[1].x, RES[0], TEMP[0]\n"
+                "       STORE RES[1].x, TEMP[0], TEMP[1]\n"  /* same offset in the output buffer */
+                "       RET\n"
+                "    ENDSUB\n";
+        void init(void *p, int s, int x, int y) {
+                *(float *)p = s ? 0xdeadbeef : 8.0 - (float)x;  /* s == 0: 8 - x; otherwise a poison value */
+        }
+        void expect(void *p, int s, int x, int y) {
+                *(float *)p = 8.0 - (float)x;  /* what init() wrote for s == 0 */
+        }
+
+        printf("- %s\n", __func__);
+
+        init_prog(ctx, 0, 0, 0, src, NULL);
+        init_tex(ctx, 0, PIPE_BUFFER, false, PIPE_FORMAT_R32_FLOAT,
+                 256, 0, init);  /* NOTE(review): 4th argument is false only here; presumably a writable flag - confirm */
+        init_tex(ctx, 1, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT,
+                 256, 0, init);
+        init_compute_resources(ctx, (int []) { 0, 1, -1 });
+        launch_grid(ctx, (uint []){1, 1, 1}, (uint []){64, 1, 1}, 0, NULL);  /* 64 * 4 bytes = the 256-byte buffers above */
+        check_tex(ctx, 1, expect, NULL);
+        destroy_compute_resources(ctx);
+        destroy_tex(ctx);
+        destroy_prog(ctx);
+}
+
+static void test_resource_indirect(struct context *ctx)
+{       /* Kernel picks RES[2] or RES[3] at run time from a selector word it first loads from RES[1]. */
+        const char *src = "COMP\n"
+                "DCL RES[0], BUFFER, RAW, WR\n"
+                "DCL RES[1..3], BUFFER, RAW\n"
+                "DCL SV[0], BLOCK_ID[0]\n"
+                "DCL TEMP[0], LOCAL\n"
+                "DCL TEMP[1], LOCAL\n"
+                "IMM UINT32 { 4, 0, 0, 0 }\n"
+                "\n"
+                "    BGNSUB\n"
+                "       UMUL TEMP[0].x, SV[0], IMM[0]\n"  /* byte offset = block id * 4 */
+                "       LOAD TEMP[1].x, RES[1], TEMP[0]\n"  /* selector word */
+                "       LOAD TEMP[1].x, RES[TEMP[1].x+2], TEMP[0]\n"  /* indirectly addressed resource: selector + 2 */
+                "       STORE RES[0].x, TEMP[0], TEMP[1]\n"
+                "       RET\n"
+                "    ENDSUB\n";
+        void init(void *p, int s, int x, int y) {
+                *(uint32_t *)p = s == 0 ? 0xdeadbeef :  /* s == 0: poison */
+                   s == 1 ? x % 2 :  /* s == 1: selector, 0 or 1 */
+                   s == 2 ? 2 * x :  /* s == 2: even values */
+                   2 * x + 1;  /* any other s: odd values */
+        }
+        void expect(void *p, int s, int x, int y) {
+           *(uint32_t *)p = 2 * x + (x % 2 ? 1 : 0);  /* even x reads the 2 * x data, odd x the 2 * x + 1 data */
+        }
+
+        printf("- %s\n", __func__);
+
+        init_prog(ctx, 0, 0, 0, src, NULL);
+        init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT,
+                 256, 0, init);
+        init_tex(ctx, 1, PIPE_BUFFER, false, PIPE_FORMAT_R32_FLOAT,
+                 256, 0, init);
+        init_tex(ctx, 2, PIPE_BUFFER, false, PIPE_FORMAT_R32_FLOAT,
+                 256, 0, init);
+        init_tex(ctx, 3, PIPE_BUFFER, false, PIPE_FORMAT_R32_FLOAT,
+                 256, 0, init);
+        init_compute_resources(ctx, (int []) { 0, 1, 2, 3, -1 });
+        launch_grid(ctx, (uint []){1, 1, 1}, (uint []){64, 1, 1}, 0, NULL);  /* 64 * 4 bytes = the 256-byte buffers above */
+        check_tex(ctx, 0, expect, NULL);
+        destroy_compute_resources(ctx);
+        destroy_tex(ctx);
+        destroy_prog(ctx);
+}
+
+enum pipe_format surface_fmts[] = {  /* formats iterated by test_surface_ld() and test_surface_st() */
+        PIPE_FORMAT_B8G8R8A8_UNORM,
+        PIPE_FORMAT_B8G8R8X8_UNORM,
+        PIPE_FORMAT_A8R8G8B8_UNORM,
+        PIPE_FORMAT_X8R8G8B8_UNORM,
+        PIPE_FORMAT_X8R8G8B8_UNORM,  /* NOTE(review): duplicate of the previous entry, so this format is tested twice */
+        PIPE_FORMAT_L8_UNORM,
+        PIPE_FORMAT_A8_UNORM,
+        PIPE_FORMAT_I8_UNORM,
+        PIPE_FORMAT_L8A8_UNORM,
+        PIPE_FORMAT_R32_FLOAT,
+        PIPE_FORMAT_R32G32_FLOAT,
+        PIPE_FORMAT_R32G32B32A32_FLOAT,
+        PIPE_FORMAT_R32_UNORM,
+        PIPE_FORMAT_R32G32_UNORM,
+        PIPE_FORMAT_R32G32B32A32_UNORM,
+        PIPE_FORMAT_R32_SNORM,
+        PIPE_FORMAT_R32G32_SNORM,
+        PIPE_FORMAT_R32G32B32A32_SNORM,
+        PIPE_FORMAT_R8_UINT,
+        PIPE_FORMAT_R8G8_UINT,
+        PIPE_FORMAT_R8G8B8A8_UINT,
+        PIPE_FORMAT_R8_SINT,
+        PIPE_FORMAT_R8G8_SINT,
+        PIPE_FORMAT_R8G8B8A8_SINT,
+        PIPE_FORMAT_R32_UINT,
+        PIPE_FORMAT_R32G32_UINT,
+        PIPE_FORMAT_R32G32B32A32_UINT,
+        PIPE_FORMAT_R32_SINT,
+        PIPE_FORMAT_R32G32_SINT,
+        PIPE_FORMAT_R32G32B32A32_SINT
+};
+
+static void test_surface_ld(struct context *ctx)
+{       /* For every entry of surface_fmts: LOAD from a surface of that format, STORE the raw result to RES[1]. */
+        const char *src = "COMP\n"
+                "DCL RES[0], 2D\n"
+                "DCL RES[1], 2D, RAW, WR\n"
+                "DCL SV[0], BLOCK_ID[0]\n"
+                "DCL TEMP[0], LOCAL\n"
+                "DCL TEMP[1], LOCAL\n"
+                "IMM UINT32 { 16, 1, 0, 0 }\n"
+                "\n"
+                "    BGNSUB\n"
+                "       LOAD TEMP[1], RES[0], SV[0]\n"  /* addressed by block id */
+                "       UMUL TEMP[0], SV[0], IMM[0]\n"  /* store address: block id scaled by (16, 1) */
+                "       STORE RES[1].xyzw, TEMP[0], TEMP[1]\n"
+                "       RET\n"
+                "    ENDSUB\n";
+        int i = 0;  /* index into surface_fmts; read by the nested functions below, advanced by the loop */
+        void init0f(void *p, int s, int x, int y) {  /* source pixel for non-integer formats */
+                float v[] = { 1.0, -.75, .50, -.25 };
+                util_format_write_4f(surface_fmts[i], v, 0,
+                                     p, 0, 0, 0, 1, 1);  /* writes v to p in format surface_fmts[i] */
+        }
+        void init0i(void *p, int s, int x, int y) {  /* source pixel for pure-integer formats */
+                int v[] = { 0xffffffff, 0xffff, 0xff, 0xf };
+                util_format_write_4i(surface_fmts[i], v, 0,
+                                     p, 0, 0, 0, 1, 1);
+        }
+        void init1(void *p, int s, int x, int y) {
+                *(uint32_t *)p = 0xdeadbeef;  /* poison for the destination */
+        }
+        void expectf(void *p, int s, int x, int y) {
+                float v[4], w[4];
+                init0f(v, s, x / 4, y);  /* rebuild the source pixel into v */
+                util_format_read_4f(surface_fmts[i], w, 0,
+                                    v, 0, 0, 0, 1, 1);  /* read it back as four floats */
+                *(float *)p = w[x % 4];  /* output word x holds component x % 4 */
+        }
+        void expecti(void *p, int s, int x, int y) {
+                int32_t v[4], w[4];
+                init0i(v, s, x / 4, y);  /* rebuild the source pixel into v */
+                util_format_read_4i(surface_fmts[i], w, 0,
+                                    v, 0, 0, 0, 1, 1);  /* read it back as four integers */
+                *(uint32_t *)p = w[x % 4];
+        }
+
+        printf("- %s\n", __func__);
+
+        init_prog(ctx, 0, 0, 0, src, NULL);  /* one program shared by all formats */
+
+        for (i = 0; i < Elements(surface_fmts); i++) {
+                bool is_int = util_format_is_pure_integer(surface_fmts[i]);  /* selects the integer or float callbacks */
+
+                printf("   - %s\n", util_format_name(surface_fmts[i]));
+
+                init_tex(ctx, 0, PIPE_TEXTURE_2D, true, surface_fmts[i],
+                         128, 32, (is_int ? init0i : init0f));  /* source in the format under test */
+                init_tex(ctx, 1, PIPE_TEXTURE_2D, true, PIPE_FORMAT_R32_FLOAT,
+                         512, 32, init1);  /* destination: four words per source pixel */
+                init_compute_resources(ctx, (int []) { 0, 1, -1 });
+                init_sampler_states(ctx, 2);
+                launch_grid(ctx, (uint []){1, 1, 1}, (uint []){128, 32, 1}, 0,
+                            NULL);
+                check_tex(ctx, 1, (is_int ? expecti : expectf), NULL);
+                destroy_sampler_states(ctx);
+                destroy_compute_resources(ctx);
+                destroy_tex(ctx);  /* textures are recreated on every iteration */
+        }
+
+        destroy_prog(ctx);
+}
+
+static void test_surface_st(struct context *ctx)
+{       /* For every entry of surface_fmts: LOAD four raw words from RES[0], STORE them to a surface of that format. */
+        const char *src = "COMP\n"
+                "DCL RES[0], 2D, RAW\n"
+                "DCL RES[1], 2D, WR\n"
+                "DCL SV[0], BLOCK_ID[0]\n"
+                "DCL TEMP[0], LOCAL\n"
+                "DCL TEMP[1], LOCAL\n"
+                "IMM UINT32 { 16, 1, 0, 0 }\n"
+                "\n"
+                "    BGNSUB\n"
+                "       UMUL TEMP[0], SV[0], IMM[0]\n"  /* load address: block id scaled by (16, 1) */
+                "       LOAD TEMP[1], RES[0], TEMP[0]\n"
+                "       STORE RES[1], SV[0], TEMP[1]\n"  /* addressed by block id */
+                "       RET\n"
+                "    ENDSUB\n";
+        int i = 0;  /* index into surface_fmts; read by the nested functions below, advanced by the loop */
+        void init0f(void *p, int s, int x, int y) {  /* source word for non-integer formats */
+                float v[] = { 1.0, -.75, 0.5, -.25 };
+                *(float *)p = v[x % 4];  /* pattern repeats every four words */
+        }
+        void init0i(void *p, int s, int x, int y) {  /* source word for pure-integer formats */
+                int v[] = { 0xffffffff, 0xffff, 0xff, 0xf };
+                *(int32_t *)p = v[x % 4];
+        }
+        void init1(void *p, int s, int x, int y) {
+                memset(p, 1, util_format_get_blocksize(surface_fmts[i]));  /* every destination byte starts as 0x01 */
+        }
+        void expectf(void *p, int s, int x, int y) {  /* expected pixel, float path */
+                float vf[4];
+                int j;
+
+                for (j = 0; j < 4; j++)
+                        init0f(&vf[j], s, 4 * x + j, y);  /* the four source words feeding pixel x */
+                util_format_write_4f(surface_fmts[i], vf, 0,
+                                     p, 0, 0, 0, 1, 1);
+        }
+        void expects(void *p, int s, int x, int y) {  /* expected pixel, signed-integer path */
+                int32_t v[4];
+                int j;
+
+                for (j = 0; j < 4; j++)
+                        init0i(&v[j], s, 4 * x + j, y);
+                util_format_write_4i(surface_fmts[i], v, 0,
+                                     p, 0, 0, 0, 1, 1);
+        }
+        void expectu(void *p, int s, int x, int y) {  /* expected pixel, unsigned-integer path */
+                uint32_t v[4];
+                int j;
+
+                for (j = 0; j < 4; j++)
+                        init0i(&v[j], s, 4 * x + j, y);
+                util_format_write_4ui(surface_fmts[i], v, 0,
+                                      p, 0, 0, 0, 1, 1);
+        }
+        bool check(void *x, void *y, int sz) {  /* comparison callback handed to check_tex() */
+                int j;
+
+                if (util_format_is_float(surface_fmts[i])) {
+                        return fabs(*(float *)x - *(float *)y) < 3.92156863e-3;  /* only the first float is compared, whatever sz is */
+
+                } else if ((sz % 4) == 0) {
+                        for (j = 0; j < sz / 4; j++)
+                                if (abs(((uint32_t *)x)[j] -  /* NOTE(review): operands are uint32_t, so the difference */
+                                        ((uint32_t *)y)[j]) > 1)  /* wraps before abs() converts it to int */
+                                        return false;
+                        return true;  /* every 32-bit word within 1 of its expected value */
+                } else {
+                        return !memcmp(x, y, sz);  /* sizes that are not a multiple of 4 must match exactly */
+                }
+        }
+
+        printf("- %s\n", __func__);
+
+        init_prog(ctx, 0, 0, 0, src, NULL);  /* one program shared by all formats */
+
+        for (i = 0; i < Elements(surface_fmts); i++) {
+                bool is_signed = (util_format_description(surface_fmts[i])
+                                  ->channel[0].type == UTIL_FORMAT_TYPE_SIGNED);  /* decided from channel 0 only */
+                bool is_int = util_format_is_pure_integer(surface_fmts[i]);
+
+                printf("   - %s\n", util_format_name(surface_fmts[i]));
+
+                init_tex(ctx, 0, PIPE_TEXTURE_2D, true, PIPE_FORMAT_R32_FLOAT,
+                         512, 32, (is_int ? init0i : init0f));  /* source: four words per destination pixel */
+                init_tex(ctx, 1, PIPE_TEXTURE_2D, true, surface_fmts[i],
+                         128, 32, init1);  /* destination in the format under test */
+                init_compute_resources(ctx, (int []) { 0, 1, -1 });
+                init_sampler_states(ctx, 2);
+                launch_grid(ctx, (uint []){1, 1, 1}, (uint []){128, 32, 1}, 0,
+                            NULL);
+                check_tex(ctx, 1, (is_int && is_signed ? expects :
+                                   is_int && !is_signed ? expectu :
+                                   expectf), check);  /* custom comparator instead of NULL */
+                destroy_sampler_states(ctx);
+                destroy_compute_resources(ctx);
+                destroy_tex(ctx);  /* textures are recreated on every iteration */
+        }
+
+        destroy_prog(ctx);
+}
+
+static void test_barrier(struct context *ctx)
+{       /* Threads write a shared counter value to RLOCAL between BARRIERs and verify all of them agree. */
+        const char *src = "COMP\n"
+                "DCL RES[0], BUFFER, RAW, WR\n"
+                "DCL SV[0], BLOCK_ID[0]\n"
+                "DCL SV[1], BLOCK_SIZE[0]\n"
+                "DCL SV[2], THREAD_ID[0]\n"
+                "DCL TEMP[0], LOCAL\n"
+                "DCL TEMP[1], LOCAL\n"
+                "DCL TEMP[2], LOCAL\n"
+                "DCL TEMP[3], LOCAL\n"
+                "IMM UINT32 { 1, 0, 0, 0 }\n"
+                "IMM UINT32 { 4, 0, 0, 0 }\n"
+                "IMM UINT32 { 32, 0, 0, 0 }\n"
+                "\n"
+                "    BGNSUB\n"
+                "       UMUL TEMP[0].x, SV[2], IMM[1]\n"  /* own slot: thread id * 4 */
+                "       MOV TEMP[1].x, IMM[0].wwww\n"  /* round counter = 0 (IMM[0].w) */
+                "       BGNLOOP\n"
+                "               BARRIER\n"
+                "               STORE RLOCAL.x, TEMP[0], TEMP[1]\n"  /* publish the round counter in the own slot */
+                "               BARRIER\n"
+                "               MOV TEMP[2].x, IMM[0].wwww\n"  /* slot index = 0 */
+                "               BGNLOOP\n"
+                "                       UMUL TEMP[3].x, TEMP[2], IMM[1]\n"
+                "                       LOAD TEMP[3].x, RLOCAL, TEMP[3]\n"
+                "                       USNE TEMP[3].x, TEMP[3], TEMP[1]\n"  /* does another slot disagree with this round? */
+                "                       IF TEMP[3]\n"
+                "                               END\n"  /* mismatch: stop without writing RES[0] */
+                "                       ENDIF\n"
+                "                       UADD TEMP[2].x, TEMP[2], IMM[0]\n"
+                "                       USEQ TEMP[3].x, TEMP[2], SV[1]\n"  /* all BLOCK_SIZE slots visited */
+                "                       IF TEMP[3]\n"
+                "                               BRK\n"
+                "                       ENDIF\n"
+                "               ENDLOOP\n"
+                "               UADD TEMP[1].x, TEMP[1], IMM[0]\n"
+                "               USEQ TEMP[2].x, TEMP[1], IMM[2]\n"  /* stop after 32 rounds */
+                "               IF TEMP[2]\n"
+                "                       BRK\n"
+                "               ENDIF\n"
+                "       ENDLOOP\n"
+                "       UMUL TEMP[1].x, SV[0], SV[1]\n"  /* block id * block size ... */
+                "       UMUL TEMP[1].x, TEMP[1], IMM[1]\n"  /* ... * 4 bytes ... */
+                "       UADD TEMP[1].x, TEMP[1], TEMP[0]\n"  /* ... + own slot offset */
+                "       LOAD TEMP[0].x, RLOCAL, TEMP[0]\n"  /* last value stored in the own slot */
+                "       STORE RES[0].x, TEMP[1], TEMP[0]\n"
+                "       RET\n"
+                "    ENDSUB\n";
+        void init(void *p, int s, int x, int y) {
+                *(uint32_t *)p = 0xdeadbeef;  /* poison; survives if a thread took the END path */
+        }
+        void expect(void *p, int s, int x, int y) {
+                *(uint32_t *)p = 31;  /* counter value stored in the last of the 32 rounds */
+        }
+
+        printf("- %s\n", __func__);
+
+        init_prog(ctx, 256, 0, 0, src, NULL);  /* NOTE(review): 256 is presumably the RLOCAL size in bytes - confirm */
+        init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT,
+                 4096, 0, init);  /* 4096 bytes = 64 * 16 words of 4 bytes */
+        init_compute_resources(ctx, (int []) { 0, -1 });
+        launch_grid(ctx, (uint []){64, 1, 1}, (uint []){16, 1, 1}, 0, NULL);
+        check_tex(ctx, 0, expect, NULL);
+        destroy_compute_resources(ctx);
+        destroy_tex(ctx);
+        destroy_prog(ctx);
+}
+
+static void test_atom_ops(struct context *ctx, bool global)
+{       /* Threads 0..9 each apply one kind of atomic op twice to their own word; global picks RES[0] or RLOCAL. */
+        const char *src = "COMP\n"
+                "#ifdef TARGET_GLOBAL\n"  /* NOTE(review): directives are part of the program text; presumably expanded by init_prog() - confirm */
+                "#define target RES[0]\n"
+                "#else\n"
+                "#define target RLOCAL\n"
+                "#endif\n"
+                ""
+                "DCL RES[0], BUFFER, RAW, WR\n"
+                "#define threadid SV[0]\n"
+                "DCL threadid, THREAD_ID[0]\n"
+                ""
+                "#define offset TEMP[0]\n"
+                "DCL offset, LOCAL\n"
+                "#define tmp TEMP[1]\n"
+                "DCL tmp, LOCAL\n"
+                ""
+                "#define k0 IMM[0]\n"
+                "IMM UINT32 { 0, 0, 0, 0 }\n"
+                "#define k1 IMM[1]\n"
+                "IMM UINT32 { 1, 0, 0, 0 }\n"
+                "#define k2 IMM[2]\n"
+                "IMM UINT32 { 2, 0, 0, 0 }\n"
+                "#define k3 IMM[3]\n"
+                "IMM UINT32 { 3, 0, 0, 0 }\n"
+                "#define k4 IMM[4]\n"
+                "IMM UINT32 { 4, 0, 0, 0 }\n"
+                "#define k5 IMM[5]\n"
+                "IMM UINT32 { 5, 0, 0, 0 }\n"
+                "#define k6 IMM[6]\n"
+                "IMM UINT32 { 6, 0, 0, 0 }\n"
+                "#define k7 IMM[7]\n"
+                "IMM UINT32 { 7, 0, 0, 0 }\n"
+                "#define k8 IMM[8]\n"
+                "IMM UINT32 { 8, 0, 0, 0 }\n"
+                "#define k9 IMM[9]\n"
+                "IMM UINT32 { 9, 0, 0, 0 }\n"
+                "#define korig IMM[10].xxxx\n"
+                "#define karg IMM[10].yyyy\n"
+                "IMM UINT32 { 3735928559, 286331153, 0, 0 }\n"  /* 0xdeadbeef (korig), 0x11111111 (karg) */
+                "\n"
+                "    BGNSUB\n"
+                "       UMUL offset.x, threadid, k4\n"  /* own word: thread id * 4 */
+                "       STORE target.x, offset, korig\n"  /* every thread seeds its word with korig */
+                "       USEQ tmp.x, threadid, k0\n"
+                "       IF tmp\n"
+                "               ATOMUADD tmp.x, target, offset, karg\n"
+                "               ATOMUADD tmp.x, target, offset, tmp\n"  /* second op of each pair uses the first op's result as operand */
+                "       ENDIF\n"
+                "       USEQ tmp.x, threadid, k1\n"
+                "       IF tmp\n"
+                "               ATOMXCHG tmp.x, target, offset, karg\n"
+                "               ATOMXCHG tmp.x, target, offset, tmp\n"
+                "       ENDIF\n"
+                "       USEQ tmp.x, threadid, k2\n"
+                "       IF tmp\n"
+                "               ATOMCAS tmp.x, target, offset, korig, karg\n"
+                "               ATOMCAS tmp.x, target, offset, tmp, k0\n"
+                "       ENDIF\n"
+                "       USEQ tmp.x, threadid, k3\n"
+                "       IF tmp\n"
+                "               ATOMAND tmp.x, target, offset, karg\n"
+                "               ATOMAND tmp.x, target, offset, tmp\n"
+                "       ENDIF\n"
+                "       USEQ tmp.x, threadid, k4\n"
+                "       IF tmp\n"
+                "               ATOMOR tmp.x, target, offset, karg\n"
+                "               ATOMOR tmp.x, target, offset, tmp\n"
+                "       ENDIF\n"
+                "       USEQ tmp.x, threadid, k5\n"
+                "       IF tmp\n"
+                "               ATOMXOR tmp.x, target, offset, karg\n"
+                "               ATOMXOR tmp.x, target, offset, tmp\n"
+                "       ENDIF\n"
+                "       USEQ tmp.x, threadid, k6\n"
+                "       IF tmp\n"
+                "               ATOMUMIN tmp.x, target, offset, karg\n"
+                "               ATOMUMIN tmp.x, target, offset, tmp\n"
+                "       ENDIF\n"
+                "       USEQ tmp.x, threadid, k7\n"
+                "       IF tmp\n"
+                "               ATOMUMAX tmp.x, target, offset, karg\n"
+                "               ATOMUMAX tmp.x, target, offset, tmp\n"
+                "       ENDIF\n"
+                "       USEQ tmp.x, threadid, k8\n"
+                "       IF tmp\n"
+                "               ATOMIMIN tmp.x, target, offset, karg\n"
+                "               ATOMIMIN tmp.x, target, offset, tmp\n"
+                "       ENDIF\n"
+                "       USEQ tmp.x, threadid, k9\n"
+                "       IF tmp\n"
+                "               ATOMIMAX tmp.x, target, offset, karg\n"
+                "               ATOMIMAX tmp.x, target, offset, tmp\n"
+                "       ENDIF\n"
+                "#ifdef TARGET_LOCAL\n"  /* local variant: copy the RLOCAL word out so check_tex() can see it */
+                "       LOAD tmp.x, RLOCAL, offset\n"
+                "       STORE RES[0].x, offset, tmp\n"
+                "#endif\n"
+                "       RET\n"
+                "    ENDSUB\n";
+
+        void init(void *p, int s, int x, int y) {
+                *(uint32_t *)p = 0xbad;  /* poison; the kernel overwrites it with korig first */
+        }
+        void expect(void *p, int s, int x, int y) {  /* case numbers line up with the thread ids k0..k9 tested in the kernel */
+                switch (x) {
+                case 0:  /* ATOMUADD pair */
+                        *(uint32_t *)p = 0xce6c8eef;
+                        break;
+                case 1:  /* ATOMXCHG pair */
+                        *(uint32_t *)p = 0xdeadbeef;
+                        break;
+                case 2:  /* ATOMCAS pair */
+                        *(uint32_t *)p = 0x11111111;
+                        break;
+                case 3:  /* ATOMAND pair */
+                        *(uint32_t *)p = 0x10011001;
+                        break;
+                case 4:  /* ATOMOR pair */
+                        *(uint32_t *)p = 0xdfbdbfff;
+                        break;
+                case 5:  /* ATOMXOR pair */
+                        *(uint32_t *)p = 0x11111111;
+                        break;
+                case 6:  /* ATOMUMIN pair */
+                        *(uint32_t *)p = 0x11111111;
+                        break;
+                case 7:  /* ATOMUMAX pair */
+                        *(uint32_t *)p = 0xdeadbeef;
+                        break;
+                case 8:  /* ATOMIMIN pair */
+                        *(uint32_t *)p = 0xdeadbeef;
+                        break;
+                case 9:  /* ATOMIMAX pair */
+                        *(uint32_t *)p = 0x11111111;
+                        break;
+                }
+        }
+
+        printf("- %s (%s)\n", __func__, global ? "global" : "local");
+
+        init_prog(ctx, 40, 0, 0, src,
+                  (global ? "-DTARGET_GLOBAL" : "-DTARGET_LOCAL"));  /* selects the #ifdef branches in src */
+        init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT,
+                 40, 0, init);  /* 40 bytes = 10 words, one per thread */
+        init_compute_resources(ctx, (int []) { 0, -1 });
+        launch_grid(ctx, (uint []){10, 1, 1}, (uint []){1, 1, 1}, 0, NULL);
+        check_tex(ctx, 0, expect, NULL);
+        destroy_compute_resources(ctx);
+        destroy_tex(ctx);
+        destroy_prog(ctx);
+}
+
+static void test_atom_race(struct context *ctx, bool global)
+{       /* Threads with id < 32 ATOMUADD their own word in a loop while the others poll a word 128 bytes below their own. */
+        const char *src = "COMP\n"
+                "#ifdef TARGET_GLOBAL\n"  /* NOTE(review): directives are part of the program text; presumably expanded by init_prog() - confirm */
+                "#define target RES[0]\n"
+                "#else\n"
+                "#define target RLOCAL\n"
+                "#endif\n"
+                ""
+                "DCL RES[0], BUFFER, RAW, WR\n"
+                ""
+                "#define blockid SV[0]\n"
+                "DCL blockid, BLOCK_ID[0]\n"
+                "#define blocksz SV[1]\n"
+                "DCL blocksz, BLOCK_SIZE[0]\n"
+                "#define threadid SV[2]\n"
+                "DCL threadid, THREAD_ID[0]\n"
+                ""
+                "#define offset TEMP[0]\n"
+                "DCL offset, LOCAL\n"
+                "#define arg TEMP[1]\n"
+                "DCL arg, LOCAL\n"
+                "#define count TEMP[2]\n"
+                "DCL count, LOCAL\n"
+                "#define vlocal TEMP[3]\n"
+                "DCL vlocal, LOCAL\n"
+                "#define vshared TEMP[4]\n"
+                "DCL vshared, LOCAL\n"
+                "#define last TEMP[5]\n"
+                "DCL last, LOCAL\n"
+                "#define tmp0 TEMP[6]\n"
+                "DCL tmp0, LOCAL\n"
+                "#define tmp1 TEMP[7]\n"
+                "DCL tmp1, LOCAL\n"
+                ""
+                "#define k0 IMM[0]\n"
+                "IMM UINT32 { 0, 0, 0, 0 }\n"
+                "#define k1 IMM[1]\n"
+                "IMM UINT32 { 1, 0, 0, 0 }\n"
+                "#define k4 IMM[2]\n"
+                "IMM UINT32 { 4, 0, 0, 0 }\n"
+                "#define k32 IMM[3]\n"
+                "IMM UINT32 { 32, 0, 0, 0 }\n"
+                "#define k128 IMM[4]\n"
+                "IMM UINT32 { 128, 0, 0, 0 }\n"
+                "#define kdeadcafe IMM[5]\n"
+                "IMM UINT32 { 3735931646, 0, 0, 0 }\n"  /* 0xdeadcafe */
+                "#define kallowed_set IMM[6]\n"
+                "IMM UINT32 { 559035650, 0, 0, 0 }\n"  /* .x = 2^32 - 0xdeadcafe (i.e. -0xdeadcafe), .y = 0 */
+                "#define k11111111 IMM[7]\n"
+                "IMM UINT32 { 286331153, 0, 0, 0 }\n"  /* 0x11111111 */
+                "\n"
+                "    BGNSUB\n"
+                "       MOV offset.x, threadid\n"
+                "#ifdef TARGET_GLOBAL\n"  /* global variant: make the index unique across blocks */
+                "       UMUL tmp0.x, blockid, blocksz\n"
+                "       UADD offset.x, offset, tmp0\n"
+                "#endif\n"
+                "       UMUL offset.x, offset, k4\n"  /* word index -> byte offset */
+                "       USLT tmp0.x, threadid, k32\n"  /* thread ids 0..31 take the IF branch below */
+                "       STORE target.x, offset, k0\n"  /* every word starts at 0 */
+                "       BARRIER\n"
+                "       IF tmp0\n"
+                "               MOV vlocal.x, k0\n"  /* private copy of the value the word should hold */
+                "               MOV arg.x, kdeadcafe\n"
+                "               BGNLOOP\n"
+                "                       INEG arg.x, arg\n"  /* sign flips every iteration */
+                "                       ATOMUADD vshared.x, target, offset, arg\n"
+                "                       SFENCE target\n"
+                "                       USNE tmp0.x, vshared, vlocal\n"  /* leave once the result disagrees with the private copy */
+                "                       IF tmp0\n"
+                "                               BRK\n"
+                "                       ENDIF\n"
+                "                       UADD vlocal.x, vlocal, arg\n"
+                "               ENDLOOP\n"
+                "               UADD vlocal.x, vshared, arg\n"
+                "               LOAD vshared.x, target, offset\n"
+                "               USEQ tmp0.x, vshared, vlocal\n"
+                "               STORE target.x, offset, tmp0\n"  /* the word becomes the USEQ result */
+                "       ELSE\n"
+                "               UADD offset.x, offset, -k128\n"  /* 128 bytes = 32 words back */
+                "               MOV count.x, k0\n"
+                "               MOV last.x, k0\n"
+                "               BGNLOOP\n"
+                "                       LOAD vshared.x, target, offset\n"
+                "                       USEQ tmp0.x, vshared, kallowed_set.xxxx\n"
+                "                       USEQ tmp1.x, vshared, kallowed_set.yyyy\n"
+                "                       OR tmp0.x, tmp0, tmp1\n"  /* value is one of the two allowed ones */
+                "                       IF tmp0\n"
+                "                               USEQ tmp0.x, vshared, last\n"
+                "                               IF tmp0\n"
+                "                                       CONT\n"  /* unchanged since the last look: poll again */
+                "                               ENDIF\n"
+                "                               MOV last.x, vshared\n"
+                "                       ELSE\n"
+                "                               END\n"  /* any other value: stop this thread here */
+                "                       ENDIF\n"
+                "                       UADD count.x, count, k1\n"
+                "                       USEQ tmp0.x, count, k128\n"  /* stop after 128 observed changes */
+                "                       IF tmp0\n"
+                "                               BRK\n"
+                "                       ENDIF\n"
+                "               ENDLOOP\n"
+                "               ATOMXCHG tmp0.x, target, offset, k11111111\n"  /* overwrite the polled word */
+                "               UADD offset.x, offset, k128\n"  /* back to the own word */
+                "               ATOMXCHG tmp0.x, target, offset, k11111111\n"
+                "               SFENCE target\n"
+                "       ENDIF\n"
+                "#ifdef TARGET_LOCAL\n"  /* local variant: copy the RLOCAL word out to RES[0] */
+                "       LOAD tmp0.x, RLOCAL, offset\n"
+                "       UMUL tmp1.x, blockid, blocksz\n"
+                "       UMUL tmp1.x, tmp1, k4\n"
+                "       UADD offset.x, offset, tmp1\n"
+                "       STORE RES[0].x, offset, tmp0\n"
+                "#endif\n"
+                "       RET\n"
+                "    ENDSUB\n";
+
+        void init(void *p, int s, int x, int y) {
+                *(uint32_t *)p = 0xdeadbeef;  /* poison; survives if a thread took the END path */
+        }
+        void expect(void *p, int s, int x, int y) {
+                *(uint32_t *)p = x & 0x20 ? 0x11111111 : 0xffffffff;  /* split on bit 5 of x: 0x11111111 if set, else 0xffffffff */
+        }
+
+        printf("- %s (%s)\n", __func__, global ? "global" : "local");
+
+        init_prog(ctx, 256, 0, 0, src,
+                  (global ? "-DTARGET_GLOBAL" : "-DTARGET_LOCAL"));  /* selects the #ifdef branches in src */
+        init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT,
+                 4096, 0, init);  /* 4096 bytes = 64 * 16 words of 4 bytes */
+        init_compute_resources(ctx, (int []) { 0, -1 });
+        launch_grid(ctx, (uint []){64, 1, 1}, (uint []){16, 1, 1}, 0, NULL);
+        check_tex(ctx, 0, expect, NULL);
+        destroy_compute_resources(ctx);
+        destroy_tex(ctx);
+        destroy_prog(ctx);
+}
+
+int main(int argc, char *argv[])
+{       /* Runs every compute test in order on one shared context; returns 1 if the context cannot be allocated. */
+        struct context *ctx = CALLOC_STRUCT(context);
+        if (!ctx)       /* allocation failed: do not hand NULL to init_ctx() */
+                return 1;
+        init_ctx(ctx);
+        test_system_values(ctx);
+        test_resource_access(ctx);
+        test_function_calls(ctx);
+        test_input_global(ctx);
+        test_private(ctx);
+        test_local(ctx);
+        test_sample(ctx);
+        test_many_kern(ctx);
+        test_constant(ctx);
+        test_resource_indirect(ctx);
+        test_surface_ld(ctx);
+        test_surface_st(ctx);
+        test_barrier(ctx);
+        test_atom_ops(ctx, true);       /* atomics on the global resource first ... */
+        test_atom_race(ctx, true);
+        test_atom_ops(ctx, false);      /* ... then the same tests on local memory */
+        test_atom_race(ctx, false);
+        destroy_ctx(ctx);
+        return 0;
+}
diff --git a/src/gallium/tests/trivial/quad-tex.c b/src/gallium/tests/trivial/quad-tex.c
index cc19e8d5eec..7caac29299f 100644
--- a/src/gallium/tests/trivial/quad-tex.c
+++ b/src/gallium/tests/trivial/quad-tex.c
@@ -57,16 +57,12 @@
 #include "util/u_memory.h"
 /* util_make_[fragment|vertex]_passthrough_shader */
 #include "util/u_simple_shaders.h"
-
-/* sw_screen_create: to get a software pipe driver */
-#include "target-helpers/inline_sw_helper.h"
-/* debug_screen_wrap: to wrap with debug pipe drivers */
-#include "target-helpers/inline_debug_helper.h"
-/* null software winsys */
-#include "sw/null/null_sw_winsys.h"
+/* to get a hardware pipe driver */
+#include "pipe-loader/pipe_loader.h"
 
 struct program
 {
+	struct pipe_loader_device *dev;
 	struct pipe_screen *screen;
 	struct pipe_context *pipe;
 	struct cso_context *cso;
@@ -93,10 +89,15 @@ struct program
 static void init_prog(struct program *p)
 {
 	struct pipe_surface surf_tmpl;
-	/* create the software rasterizer */
-	p->screen = sw_screen_create(null_sw_create());
-	/* wrap the screen with any debugger */
-	p->screen = debug_screen_wrap(p->screen);
+	int ret;
+
+	/* find a hardware device */
+	ret = pipe_loader_probe(&p->dev, 1);
+	assert(ret);
+
+	/* init a pipe screen */
+	p->screen = pipe_loader_create_screen(p->dev, PIPE_SEARCH_DIR);
+	assert(p->screen);
 
 	/* create the pipe driver context and cso context */
 	p->pipe = p->screen->context_create(p->screen, NULL);
@@ -298,6 +299,7 @@ static void close_prog(struct program *p)
 	cso_destroy_context(p->cso);
 	p->pipe->destroy(p->pipe);
 	p->screen->destroy(p->screen);
+	pipe_loader_release(&p->dev, 1);
 
 	FREE(p);
 }
diff --git a/src/gallium/tests/trivial/tri.c b/src/gallium/tests/trivial/tri.c
index 9190f7824e9..f3e1e944154 100644
--- a/src/gallium/tests/trivial/tri.c
+++ b/src/gallium/tests/trivial/tri.c
@@ -55,16 +55,12 @@
 #include "util/u_memory.h"
 /* util_make_[fragment|vertex]_passthrough_shader */
 #include "util/u_simple_shaders.h"
-
-/* sw_screen_create: to get a software pipe driver */
-#include "target-helpers/inline_sw_helper.h"
-/* debug_screen_wrap: to wrap with debug pipe drivers */
-#include "target-helpers/inline_debug_helper.h"
-/* null software winsys */
-#include "sw/null/null_sw_winsys.h"
+/* to get a hardware pipe driver */
+#include "pipe-loader/pipe_loader.h"
 
 struct program
 {
+	struct pipe_loader_device *dev;
 	struct pipe_screen *screen;
 	struct pipe_context *pipe;
 	struct cso_context *cso;
@@ -88,10 +84,15 @@ struct program
 static void init_prog(struct program *p)
 {
 	struct pipe_surface surf_tmpl;
-	/* create the software rasterizer */
-	p->screen = sw_screen_create(null_sw_create());
-	/* wrap the screen with any debugger */
-	p->screen = debug_screen_wrap(p->screen);
+	int ret;
+
+	/* find a hardware device */
+	ret = pipe_loader_probe(&p->dev, 1);
+	assert(ret);
+
+	/* init a pipe screen */
+	p->screen = pipe_loader_create_screen(p->dev, PIPE_SEARCH_DIR);
+	assert(p->screen);
 
 	/* create the pipe driver context and cso context */
 	p->pipe = p->screen->context_create(p->screen, NULL);
@@ -234,6 +235,7 @@ static void close_prog(struct program *p)
 	cso_destroy_context(p->cso);
 	p->pipe->destroy(p->pipe);
 	p->screen->destroy(p->screen);
+	pipe_loader_release(&p->dev, 1);
 
 	FREE(p);
 }
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
index 4d343b8489b..fc57d676876 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
@@ -81,6 +81,10 @@
 #define RADEON_INFO_IB_VM_MAX_SIZE  0x0f
 #endif
 
+#ifndef RADEON_INFO_MAX_PIPES
+#define RADEON_INFO_MAX_PIPES 0x10
+#endif
+
 
 /* Enable/disable feature access for one command stream.
  * If enable == TRUE, return TRUE on success.
@@ -299,6 +303,12 @@ static boolean do_winsys_init(struct radeon_drm_winsys *ws)
 	ws->info.r600_has_streamout = ws->info.drm_minor >= 13;
     }
 
+    /* Get max pipes, this is only needed for compute shaders.  All evergreen+
+     * chips have at least 2 pipes, so we use 2 as a default. */
+    ws->info.r600_max_pipes = 2;
+    radeon_get_drm_value(ws->fd, RADEON_INFO_MAX_PIPES, NULL,
+                         &ws->info.r600_max_pipes);
+
     return TRUE;
 }
 
diff --git a/src/gallium/winsys/radeon/drm/radeon_winsys.h b/src/gallium/winsys/radeon/drm/radeon_winsys.h
index 99768248644..6f85b3e11d2 100644
--- a/src/gallium/winsys/radeon/drm/radeon_winsys.h
+++ b/src/gallium/winsys/radeon/drm/radeon_winsys.h
@@ -98,6 +98,7 @@ struct radeon_info {
     uint32_t r600_va_start;
     uint32_t r600_ib_vm_max_size;
     boolean r600_has_streamout;
+    uint32_t r600_max_pipes;
 };
 
 enum radeon_feature_id {
diff --git a/src/gbm/backends/dri/gbm_dri.c b/src/gbm/backends/dri/gbm_dri.c
index 4df6e8fcf06..e5ddfb6ce69 100644
--- a/src/gbm/backends/dri/gbm_dri.c
+++ b/src/gbm/backends/dri/gbm_dri.c
@@ -291,6 +291,18 @@ gbm_dri_is_format_supported(struct gbm_device *gbm,
    return 1;
 }
 
+static int
+gbm_dri_bo_write(struct gbm_bo *_bo, const void *buf, size_t count)
+{
+   struct gbm_dri_device *dri = gbm_dri_device(_bo->gbm);
+   struct gbm_dri_bo *bo = gbm_dri_bo(_bo);
+
+   if (dri->image->base.version < 4)
+      return -1;
+
+   return dri->image->write(bo->image, buf, count);
+}
+
 static void
 gbm_dri_bo_destroy(struct gbm_bo *_bo)
 {
@@ -390,6 +402,9 @@ gbm_dri_bo_create(struct gbm_device *gbm,
    int dri_format;
    unsigned dri_use = 0;
 
+   if (dri->image->base.version < 4 && (usage & GBM_BO_USE_WRITE))
+      return NULL;
+
    bo = calloc(1, sizeof *bo);
    if (bo == NULL)
       return NULL;
@@ -421,6 +436,8 @@ gbm_dri_bo_create(struct gbm_device *gbm,
       dri_use |= __DRI_IMAGE_USE_SCANOUT;
    if (usage & GBM_BO_USE_CURSOR_64X64)
       dri_use |= __DRI_IMAGE_USE_CURSOR;
+   if (usage & GBM_BO_USE_WRITE)
+      dri_use |= __DRI_IMAGE_USE_WRITE;
 
    bo->image =
       dri->image->createImage(dri->screen,
@@ -491,6 +508,7 @@ dri_device_create(int fd)
    dri->base.base.bo_create = gbm_dri_bo_create;
    dri->base.base.bo_create_from_egl_image = gbm_dri_bo_create_from_egl_image;
    dri->base.base.is_format_supported = gbm_dri_is_format_supported;
+   dri->base.base.bo_write = gbm_dri_bo_write;
    dri->base.base.bo_destroy = gbm_dri_bo_destroy;
    dri->base.base.destroy = dri_destroy;
    dri->base.base.surface_create = gbm_dri_surface_create;
diff --git a/src/gbm/backends/dri/gbm_driint.h b/src/gbm/backends/dri/gbm_driint.h
index 3b7db65ce9f..f4043683f11 100644
--- a/src/gbm/backends/dri/gbm_driint.h
+++ b/src/gbm/backends/dri/gbm_driint.h
@@ -77,9 +77,6 @@ struct gbm_dri_bo {
 struct gbm_dri_surface {
    struct gbm_surface base;
 
-   __DRIbuffer *(*get_front_buffer)(struct gbm_dri_surface *, void *);
-   void (*release_buffer)(struct gbm_dri_surface *, __DRIbuffer *, void *);
-   int (*has_free_buffers)(void *);
    void *dri_private;
 };
 
diff --git a/src/gbm/main/gbm.c b/src/gbm/main/gbm.c
index 79ba65051f2..3994f86aafc 100644
--- a/src/gbm/main/gbm.c
+++ b/src/gbm/main/gbm.c
@@ -231,6 +231,65 @@ gbm_bo_get_handle(struct gbm_bo *bo)
    return bo->handle;
 }
 
+/** Write data into the buffer object
+ *
+ * If the buffer object was created with the GBM_BO_USE_WRITE flag,
+ * this function can be used to write data into the buffer object.  The
+ * data is copied directly into the object and it's the responsibility
+ * of the caller to make sure the data represents valid pixel data,
+ * according to the width, height, stride and format of the buffer object.
+ *
+ * \param bo The buffer object
+ * \param buf The data to write
+ * \param count The number of bytes to write
+ * \return Returns -1 on error, 0 otherwise
+ */
+GBM_EXPORT int
+gbm_bo_write(struct gbm_bo *bo, const void *buf, size_t count)
+{
+   return bo->gbm->bo_write(bo, buf, count);
+}
+
+/** Get the gbm device used to create the buffer object
+ *
+ * \param bo The buffer object
+ * \return Returns the gbm device with which the buffer object was created
+ */
+GBM_EXPORT struct gbm_device *
+gbm_bo_get_device(struct gbm_bo *bo)
+{
+	return bo->gbm;
+}
+
+/** Set the user data associated with a buffer object
+ *
+ * \param bo The buffer object
+ * \param data The data to associate with the buffer object
+ * \param destroy_user_data A callback (which may be %NULL) that will be
+ * called prior to the buffer destruction
+ */
+GBM_EXPORT void
+gbm_bo_set_user_data(struct gbm_bo *bo, void *data,
+		     void (*destroy_user_data)(struct gbm_bo *, void *))
+{
+   bo->user_data = data;
+   bo->destroy_user_data = destroy_user_data;
+}
+
+/** Get the user data associated with a buffer object
+ *
+ * \param bo The buffer object
+ * \return Returns the user data associated with the buffer object or %NULL
+ * if no data was associated with it
+ *
+ * \sa gbm_bo_set_user_data()
+ */
+GBM_EXPORT void *
+gbm_bo_get_user_data(struct gbm_bo *bo)
+{
+   return bo->user_data;
+}
+
 /**
  * Destroys the given buffer object and frees all resources associated with
  * it.
@@ -240,6 +299,9 @@ gbm_bo_get_handle(struct gbm_bo *bo)
 GBM_EXPORT void
 gbm_bo_destroy(struct gbm_bo *bo)
 {
+   if (bo->destroy_user_data)
+      bo->destroy_user_data(bo, bo->user_data);
+
    bo->gbm->bo_destroy(bo);
 }
 
@@ -357,10 +419,11 @@ gbm_surface_destroy(struct gbm_surface *surf)
  *
  * \param surf The surface
  *
- * \return A newly allocated buffer object that should be released
- * with gbm_surface_release_buffer() when no longer needed.  This bo
- * should not be destroyed using gbm_bo_destroy().  If an error occurs
- * this function returns %NULL.
+ * \return A buffer object that should be released with
+ * gbm_surface_release_buffer() when no longer needed.  The implementation
+ * is free to reuse buffers released with gbm_surface_release_buffer() so
+ * this bo should not be destroyed using gbm_bo_destroy().  If an error
+ * occurs this function returns %NULL.
  */
 GBM_EXPORT struct gbm_bo *
 gbm_surface_lock_front_buffer(struct gbm_surface *surf)
@@ -371,10 +434,11 @@ gbm_surface_lock_front_buffer(struct gbm_surface *surf)
 /**
  * Release a locked buffer obtained with gbm_surface_lock_front_buffer()
  *
- * The bo is destroyed after a call to this function and returns the
- * underlying buffer to the gbm surface.  Releasing a bo will
- * typically make gbm_surface_has_free_buffer() return 1 and thus
- * allow rendering the next frame, but not always.
+ * Returns the underlying buffer to the gbm surface.  Releasing a bo
+ * will typically make gbm_surface_has_free_buffer() return 1 and thus
+ * allow rendering the next frame, but not always. The implementation
+ * may choose to destroy the bo immediately or reuse it, in which case
+ * the user data associated with it is unchanged.
  *
  * \param surf The surface
  * \param bo The buffer object
diff --git a/src/gbm/main/gbm.h b/src/gbm/main/gbm.h
index 6748752d8f1..af5dc5aee8c 100644
--- a/src/gbm/main/gbm.h
+++ b/src/gbm/main/gbm.h
@@ -201,6 +201,12 @@ enum gbm_bo_flags {
     * as the storage for a color buffer
     */
    GBM_BO_USE_RENDERING    = (1 << 2),
+   /**
+    * Buffer can be used for gbm_bo_write.  This is guaranteed to work
+    * with GBM_BO_USE_CURSOR_64X64, but may not work for other
+    * combinations.
+    */
+   GBM_BO_USE_WRITE    = (1 << 3),
 };
 
 int
@@ -242,9 +248,22 @@ gbm_bo_get_pitch(struct gbm_bo *bo);
 uint32_t
 gbm_bo_get_format(struct gbm_bo *bo);
 
+struct gbm_device *
+gbm_bo_get_device(struct gbm_bo *bo);
+
 union gbm_bo_handle
 gbm_bo_get_handle(struct gbm_bo *bo);
 
+int
+gbm_bo_write(struct gbm_bo *bo, const void *buf, size_t count);
+
+void
+gbm_bo_set_user_data(struct gbm_bo *bo, void *data,
+		     void (*destroy_user_data)(struct gbm_bo *, void *));
+
+void *
+gbm_bo_get_user_data(struct gbm_bo *bo);
+
 void
 gbm_bo_destroy(struct gbm_bo *bo);
 
diff --git a/src/gbm/main/gbmint.h b/src/gbm/main/gbmint.h
index 53d73f40df6..8eb8671aeb2 100644
--- a/src/gbm/main/gbmint.h
+++ b/src/gbm/main/gbmint.h
@@ -70,6 +70,7 @@ struct gbm_device {
                                               void *egl_dpy, void *egl_img,
                                               uint32_t width, uint32_t height,
                                               uint32_t usage);
+   int (*bo_write)(struct gbm_bo *bo, const void *buf, size_t data);
    void (*bo_destroy)(struct gbm_bo *bo);
 
    struct gbm_surface *(*surface_create)(struct gbm_device *gbm,
@@ -94,6 +95,8 @@ struct gbm_bo {
    uint32_t pitch;
    uint32_t format;
    union gbm_bo_handle  handle;
+   void *user_data;
+   void (*destroy_user_data)(struct gbm_bo *, void *);
 };
 
 struct gbm_surface {
diff --git a/src/glsl/TODO b/src/glsl/TODO
index c99d7e152d6..eb73fc2e813 100644
--- a/src/glsl/TODO
+++ b/src/glsl/TODO
@@ -6,22 +6,10 @@
   constant index values.  For others it is more complicated.  Perhaps these
   cases should be silently converted to uniforms?
 
-- Implement support for ir_binop_dot in ir_algebraic.cpp.  Perform
+- Implement support for ir_binop_dot in opt_algebraic.cpp.  Perform
   transformations such as "dot(v, vec3(0.0, 1.0, 0.0))" -> v.y.
 
 - Track source locations throughout the IR.  There are currently several
   places where we cannot emit line numbers for errors (and currently emit 0:0)
   because we've "lost" the line number information.  This is particularly
   noticeable at link time.
-
-1.30 features:
-
-- Implement AST-to-HIR conversion of switch-statements
-  - switch
-  - case
-  - Update break to correcly handle mixed nexting of switch-statements
-    and loops.
-
-- Implement support for gl_ClipDistance.  This is non-trivial because
-  gl_ClipDistance is exposed as a float[8], but all hardware actually
-  implements it as vec4[2].
\ No newline at end of file
diff --git a/src/glsl/ast_function.cpp b/src/glsl/ast_function.cpp
index 8bf0ba2a876..9e7c5995fc9 100644
--- a/src/glsl/ast_function.cpp
+++ b/src/glsl/ast_function.cpp
@@ -153,21 +153,21 @@ verify_parameter_modes(_mesa_glsl_parse_state *state,
 	 }
 
 	 ir_variable *var = actual->variable_referenced();
-	 if (var) {
-	    if (var->read_only) {
-	       _mesa_glsl_error(&loc, state,
-				"function parameter '%s %s' references the "
-				"read-only variable '%s'",
-				mode, formal->name,
-				actual->variable_referenced()->name);
-	       return false;
-	    } else if (!actual->is_lvalue()) {
-	       _mesa_glsl_error(&loc, state,
-				"function parameter '%s %s' is not an lvalue",
-				mode, formal->name);
-	       return false;
-	    }
+	 if (var)
 	    var->assigned = true;
+
+	 if (var && var->read_only) {
+	    _mesa_glsl_error(&loc, state,
+			     "function parameter '%s %s' references the "
+			     "read-only variable '%s'",
+			     mode, formal->name,
+			     actual->variable_referenced()->name);
+	    return false;
+	 } else if (!actual->is_lvalue()) {
+	    _mesa_glsl_error(&loc, state,
+			     "function parameter '%s %s' is not an lvalue",
+			     mode, formal->name);
+	    return false;
 	 }
       }
 
@@ -278,7 +278,7 @@ generate_call(exec_list *instructions, ir_function_signature *sig,
     * Function calls were first allowed to be constant expressions in GLSL 1.20.
     */
    if (state->language_version >= 120) {
-      ir_constant *value = sig->constant_expression_value(actual_parameters);
+      ir_constant *value = sig->constant_expression_value(actual_parameters, NULL);
       if (value != NULL) {
 	 return value;
       }
diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp
index e24914b832a..86bb8741bcc 100644
--- a/src/glsl/ast_to_hir.cpp
+++ b/src/glsl/ast_to_hir.cpp
@@ -4039,13 +4039,13 @@ detect_conflicting_assignments(struct _mesa_glsl_parse_state *state,
    foreach_list(node, instructions) {
       ir_variable *var = ((ir_instruction *)node)->as_variable();
 
-      if (!var)
+      if (!var || !var->assigned)
 	 continue;
 
       if (strcmp(var->name, "gl_FragColor") == 0)
-	 gl_FragColor_assigned = var->assigned;
+	 gl_FragColor_assigned = true;
       else if (strcmp(var->name, "gl_FragData") == 0)
-	 gl_FragData_assigned = var->assigned;
+	 gl_FragData_assigned = true;
       else if (strncmp(var->name, "gl_", 3) != 0) {
 	 if (state->target == fragment_shader &&
 	     (var->mode == ir_var_out || var->mode == ir_var_inout)) {
diff --git a/src/glsl/glsl_parser_extras.cpp b/src/glsl/glsl_parser_extras.cpp
index ae7a365f4b2..6f1c86b43ff 100644
--- a/src/glsl/glsl_parser_extras.cpp
+++ b/src/glsl/glsl_parser_extras.cpp
@@ -36,8 +36,9 @@ extern "C" {
 #include "ir_optimization.h"
 #include "loop_analysis.h"
 
-_mesa_glsl_parse_state::_mesa_glsl_parse_state(struct gl_context *ctx,
+_mesa_glsl_parse_state::_mesa_glsl_parse_state(struct gl_context *_ctx,
 					       GLenum target, void *mem_ctx)
+ : ctx(_ctx)
 {
    switch (target) {
    case GL_VERTEX_SHADER:   this->target = vertex_shader; break;
@@ -134,24 +135,49 @@ _mesa_glsl_shader_target_name(enum _mesa_glsl_parser_targets target)
    return "unknown";
 }
 
+/* This helper function will append the given message to the shader's
+   info log and report it via GL_ARB_debug_output. Per that extension,
+   'type' is one of the enum values classifying the message, and
+   'id' is the implementation-defined ID of the given message. */
+static void
+_mesa_glsl_msg(const YYLTYPE *locp, _mesa_glsl_parse_state *state,
+               GLenum type, GLuint id, const char *fmt, va_list ap)
+{
+   bool error = (type == GL_DEBUG_TYPE_ERROR_ARB);
+
+   assert(state->info_log != NULL);
+
+   /* Get the offset that the new message will be written to. */
+   int msg_offset = strlen(state->info_log);
+
+   ralloc_asprintf_append(&state->info_log, "%u:%u(%u): %s: ",
+					    locp->source,
+					    locp->first_line,
+					    locp->first_column,
+					    error ? "error" : "warning");
+   ralloc_vasprintf_append(&state->info_log, fmt, ap);
+
+   const char *const msg = &state->info_log[msg_offset];
+   struct gl_context *ctx = state->ctx;
+   /* Report the error via GL_ARB_debug_output. */
+   if (error)
+      _mesa_shader_debug(ctx, type, id, msg, strlen(msg));
+
+   ralloc_strcat(&state->info_log, "\n");
+}
 
 void
 _mesa_glsl_error(YYLTYPE *locp, _mesa_glsl_parse_state *state,
 		 const char *fmt, ...)
 {
    va_list ap;
+   GLenum type = GL_DEBUG_TYPE_ERROR_ARB;
 
    state->error = true;
 
-   assert(state->info_log != NULL);
-   ralloc_asprintf_append(&state->info_log, "%u:%u(%u): error: ",
-					    locp->source,
-					    locp->first_line,
-					    locp->first_column);
    va_start(ap, fmt);
-   ralloc_vasprintf_append(&state->info_log, fmt, ap);
+   _mesa_glsl_msg(locp, state, type, SHADER_ERROR_UNKNOWN, fmt, ap);
    va_end(ap);
-   ralloc_strcat(&state->info_log, "\n");
 }
 
 
@@ -160,16 +186,11 @@ _mesa_glsl_warning(const YYLTYPE *locp, _mesa_glsl_parse_state *state,
 		   const char *fmt, ...)
 {
    va_list ap;
+   GLenum type = GL_DEBUG_TYPE_OTHER_ARB;
 
-   assert(state->info_log != NULL);
-   ralloc_asprintf_append(&state->info_log, "%u:%u(%u): warning: ",
-					    locp->source,
-					    locp->first_line,
-					    locp->first_column);
    va_start(ap, fmt);
-   ralloc_vasprintf_append(&state->info_log, fmt, ap);
+   _mesa_glsl_msg(locp, state, type, 0, fmt, ap);
    va_end(ap);
-   ralloc_strcat(&state->info_log, "\n");
 }
 
 
diff --git a/src/glsl/glsl_parser_extras.h b/src/glsl/glsl_parser_extras.h
index 55676f5a9ec..1a909c68b7f 100644
--- a/src/glsl/glsl_parser_extras.h
+++ b/src/glsl/glsl_parser_extras.h
@@ -57,7 +57,7 @@ struct glsl_switch_state {
 };
 
 struct _mesa_glsl_parse_state {
-   _mesa_glsl_parse_state(struct gl_context *ctx, GLenum target,
+   _mesa_glsl_parse_state(struct gl_context *_ctx, GLenum target,
 			  void *mem_ctx);
 
    /* Callers of this ralloc-based new need not call delete. It's
@@ -77,6 +77,7 @@ struct _mesa_glsl_parse_state {
       ralloc_free(mem);
    }
 
+   struct gl_context *const ctx;
    void *scanner;
    exec_list translation_unit;
    glsl_symbol_table *symbols;
diff --git a/src/glsl/ir.cpp b/src/glsl/ir.cpp
index 1ba87515ea7..970d8f3bac0 100644
--- a/src/glsl/ir.cpp
+++ b/src/glsl/ir.cpp
@@ -716,12 +716,27 @@ ir_constant::ir_constant(const struct glsl_type *type, exec_list *value_list)
 ir_constant *
 ir_constant::zero(void *mem_ctx, const glsl_type *type)
 {
-   assert(type->is_numeric() || type->is_boolean());
+   assert(type->is_scalar() || type->is_vector() || type->is_matrix()
+	  || type->is_record() || type->is_array());
 
    ir_constant *c = new(mem_ctx) ir_constant;
    c->type = type;
    memset(&c->value, 0, sizeof(c->value));
 
+   if (type->is_array()) {
+      c->array_elements = ralloc_array(c, ir_constant *, type->length);
+
+      for (unsigned i = 0; i < type->length; i++)
+	 c->array_elements[i] = ir_constant::zero(c, type->element_type());
+   }
+
+   if (type->is_record()) {
+      for (unsigned i = 0; i < type->length; i++) {
+	 ir_constant *comp = ir_constant::zero(mem_ctx, type->fields.structure[i].type);
+	 c->components.push_tail(comp);
+      }
+   }
+
    return c;
 }
 
@@ -841,6 +856,95 @@ ir_constant::get_record_field(const char *name)
    return (ir_constant *) node;
 }
 
+void
+ir_constant::copy_offset(ir_constant *src, int offset)
+{
+   switch (this->type->base_type) {
+   case GLSL_TYPE_UINT:
+   case GLSL_TYPE_INT:
+   case GLSL_TYPE_FLOAT:
+   case GLSL_TYPE_BOOL: {
+      unsigned int size = src->type->components();
+      assert (size <= this->type->components() - offset);
+      for (unsigned int i=0; i<size; i++) {
+	 switch (this->type->base_type) {
+	 case GLSL_TYPE_UINT:
+	    value.u[i+offset] = src->get_uint_component(i);
+	    break;
+	 case GLSL_TYPE_INT:
+	    value.i[i+offset] = src->get_int_component(i);
+	    break;
+	 case GLSL_TYPE_FLOAT:
+	    value.f[i+offset] = src->get_float_component(i);
+	    break;
+	 case GLSL_TYPE_BOOL:
+	    value.b[i+offset] = src->get_bool_component(i);
+	    break;
+	 default: // Shut up the compiler
+	    break;
+	 }
+      }
+      break;
+   }
+
+   case GLSL_TYPE_STRUCT: {
+      assert (src->type == this->type);
+      this->components.make_empty();
+      foreach_list(node, &src->components) {
+	 ir_constant *const orig = (ir_constant *) node;
+
+	 this->components.push_tail(orig->clone(this, NULL));
+      }
+      break;
+   }
+
+   case GLSL_TYPE_ARRAY: {
+      assert (src->type == this->type);
+      for (unsigned i = 0; i < this->type->length; i++) {
+	 this->array_elements[i] = src->array_elements[i]->clone(this, NULL);
+      }
+      break;
+   }
+
+   default:
+      assert(!"Should not get here.");
+      break;
+   }
+}
+
+void
+ir_constant::copy_masked_offset(ir_constant *src, int offset, unsigned int mask)
+{
+   assert (!type->is_array() && !type->is_record());
+
+   if (!type->is_vector() && !type->is_matrix()) {
+      offset = 0;
+      mask = 1;
+   }
+
+   int id = 0;
+   for (int i=0; i<4; i++) {
+      if (mask & (1 << i)) {
+	 switch (this->type->base_type) {
+	 case GLSL_TYPE_UINT:
+	    value.u[i+offset] = src->get_uint_component(id++);
+	    break;
+	 case GLSL_TYPE_INT:
+	    value.i[i+offset] = src->get_int_component(id++);
+	    break;
+	 case GLSL_TYPE_FLOAT:
+	    value.f[i+offset] = src->get_float_component(id++);
+	    break;
+	 case GLSL_TYPE_BOOL:
+	    value.b[i+offset] = src->get_bool_component(id++);
+	    break;
+	 default:
+	    assert(!"Should not get here.");
+	    return;
+	 }
+      }
+   }
+}
 
 bool
 ir_constant::has_value(const ir_constant *c) const
@@ -1377,6 +1481,7 @@ ir_function_signature::ir_function_signature(const glsl_type *return_type)
 {
    this->ir_type = ir_type_function_signature;
    this->is_builtin = false;
+   this->origin = NULL;
 }
 
 
diff --git a/src/glsl/ir.h b/src/glsl/ir.h
index ddfaf3614ae..9c7961ab92c 100644
--- a/src/glsl/ir.h
+++ b/src/glsl/ir.h
@@ -146,7 +146,7 @@ public:
 
    virtual ir_visitor_status accept(ir_hierarchical_visitor *);
 
-   virtual ir_constant *constant_expression_value();
+   virtual ir_constant *constant_expression_value(struct hash_table *variable_context = NULL);
 
    virtual ir_rvalue * as_rvalue()
    {
@@ -502,10 +502,11 @@ public:
    virtual ir_visitor_status accept(ir_hierarchical_visitor *);
 
    /**
-    * Attempt to evaluate this function as a constant expression, given
-    * a list of the actual parameters.  Returns NULL for non-built-ins.
+    * Attempt to evaluate this function as a constant expression,
+    * given a list of the actual parameters and the variable context.
+    * Returns NULL for non-built-ins.
     */
-   ir_constant *constant_expression_value(exec_list *actual_parameters);
+   ir_constant *constant_expression_value(exec_list *actual_parameters, struct hash_table *variable_context);
 
    /**
     * Get the name of the function for which this is a signature
@@ -571,7 +572,25 @@ private:
    /** Function of which this signature is one overload. */
    class ir_function *_function;
 
+   /** Function signature of which this one is a prototype clone */
+   const ir_function_signature *origin;
+
    friend class ir_function;
+
+   /**
+    * Helper function to run a list of instructions for constant
+    * expression evaluation.
+    *
+    * The hash table represents the values of the visible variables.
+    * There are no scoping issues because the table is indexed on
+    * ir_variable pointers, not variable names.
+    *
+    * Returns false if the expression is not constant, true otherwise,
+    * and the value in *result if result is non-NULL.
+    */
+   bool constant_expression_evaluate_expression_list(const struct exec_list &body,
+						     struct hash_table *variable_context,
+						     ir_constant **result);
 };
 
 
@@ -763,7 +782,7 @@ public:
 
    virtual ir_assignment *clone(void *mem_ctx, struct hash_table *ht) const;
 
-   virtual ir_constant *constant_expression_value();
+   virtual ir_constant *constant_expression_value(struct hash_table *variable_context = NULL);
 
    virtual void accept(ir_visitor *v)
    {
@@ -999,10 +1018,14 @@ public:
    /**
     * Attempt to constant-fold the expression
     *
+    * The "variable_context" hash table links ir_variable * to ir_constant *
+    * that represent the variables' values.  \c NULL represents an empty
+    * context.
+    *
     * If the expression cannot be constant folded, this method will return
     * \c NULL.
     */
-   virtual ir_constant *constant_expression_value();
+   virtual ir_constant *constant_expression_value(struct hash_table *variable_context = NULL);
 
    /**
     * Determine the number of operands used by an expression
@@ -1065,7 +1088,7 @@ public:
 
    virtual ir_call *clone(void *mem_ctx, struct hash_table *ht) const;
 
-   virtual ir_constant *constant_expression_value();
+   virtual ir_constant *constant_expression_value(struct hash_table *variable_context = NULL);
 
    virtual ir_call *as_call()
    {
@@ -1297,7 +1320,7 @@ public:
 
    virtual ir_texture *clone(void *mem_ctx, struct hash_table *) const;
 
-   virtual ir_constant *constant_expression_value();
+   virtual ir_constant *constant_expression_value(struct hash_table *variable_context = NULL);
 
    virtual void accept(ir_visitor *v)
    {
@@ -1389,7 +1412,7 @@ public:
 
    virtual ir_swizzle *clone(void *mem_ctx, struct hash_table *) const;
 
-   virtual ir_constant *constant_expression_value();
+   virtual ir_constant *constant_expression_value(struct hash_table *variable_context = NULL);
 
    virtual ir_swizzle *as_swizzle()
    {
@@ -1446,6 +1469,15 @@ public:
     * Get the variable that is ultimately referenced by an r-value
     */
    virtual ir_variable *variable_referenced() const = 0;
+
+   /**
+    * Get the constant that is ultimately referenced by an r-value,
+    * in a constant expression evaluation context.
+    *
+    * The offset is used when the reference is to a specific column of
+    * a matrix.
+    */
+  virtual void constant_referenced(struct hash_table *variable_context, ir_constant *&store, int &offset) const = 0;
 };
 
 
@@ -1456,7 +1488,7 @@ public:
    virtual ir_dereference_variable *clone(void *mem_ctx,
 					  struct hash_table *) const;
 
-   virtual ir_constant *constant_expression_value();
+   virtual ir_constant *constant_expression_value(struct hash_table *variable_context = NULL);
 
    virtual ir_dereference_variable *as_dereference_variable()
    {
@@ -1471,6 +1503,15 @@ public:
       return this->var;
    }
 
+   /**
+    * Get the constant that is ultimately referenced by an r-value,
+    * in a constant expression evaluation context.
+    *
+    * The offset is used when the reference is to a specific column of
+    * a matrix.
+    */
+   virtual void constant_referenced(struct hash_table *variable_context, ir_constant *&store, int &offset) const;
+
    virtual ir_variable *whole_variable_referenced()
    {
       /* ir_dereference_variable objects always dereference the entire
@@ -1505,7 +1546,7 @@ public:
    virtual ir_dereference_array *clone(void *mem_ctx,
 				       struct hash_table *) const;
 
-   virtual ir_constant *constant_expression_value();
+   virtual ir_constant *constant_expression_value(struct hash_table *variable_context = NULL);
 
    virtual ir_dereference_array *as_dereference_array()
    {
@@ -1520,6 +1561,15 @@ public:
       return this->array->variable_referenced();
    }
 
+   /**
+    * Get the constant that is ultimately referenced by an r-value,
+    * in a constant expression evaluation context.
+    *
+    * The offset is used when the reference is to a specific column of
+    * a matrix.
+    */
+   virtual void constant_referenced(struct hash_table *variable_context, ir_constant *&store, int &offset) const;
+
    virtual void accept(ir_visitor *v)
    {
       v->visit(this);
@@ -1544,7 +1594,7 @@ public:
    virtual ir_dereference_record *clone(void *mem_ctx,
 					struct hash_table *) const;
 
-   virtual ir_constant *constant_expression_value();
+   virtual ir_constant *constant_expression_value(struct hash_table *variable_context = NULL);
 
    /**
     * Get the variable that is ultimately referenced by an r-value
@@ -1554,6 +1604,15 @@ public:
       return this->record->variable_referenced();
    }
 
+   /**
+    * Get the constant that is ultimately referenced by an r-value,
+    * in a constant expression evaluation context.
+    *
+    * The offset is used when the reference is to a specific column of
+    * a matrix.
+    */
+   virtual void constant_referenced(struct hash_table *variable_context, ir_constant *&store, int &offset) const;
+
    virtual void accept(ir_visitor *v)
    {
       v->visit(this);
@@ -1609,7 +1668,7 @@ public:
 
    virtual ir_constant *clone(void *mem_ctx, struct hash_table *) const;
 
-   virtual ir_constant *constant_expression_value();
+   virtual ir_constant *constant_expression_value(struct hash_table *variable_context = NULL);
 
    virtual ir_constant *as_constant()
    {
@@ -1641,6 +1700,31 @@ public:
 
    ir_constant *get_record_field(const char *name);
 
+   /**
+    * Copy the values of another constant at a given offset.
+    *
+    * The offset is ignored for array or struct copies, it's only for
+    * scalars or vectors into vectors or matrices.
+    *
+    * With identical types on both sides and zero offset it's clone()
+    * without creating a new object.
+    */
+
+   void copy_offset(ir_constant *src, int offset);
+
+   /**
+    * Copy the values of another constant at a given offset,
+    * following an assign-like mask.
+    *
+    * The mask is ignored for scalars.
+    *
+    * Note that this function only handles what assign can handle,
+    * i.e. at most a vector as source and a column of a matrix as
+    * destination.
+    */
+
+   void copy_masked_offset(ir_constant *src, int offset, unsigned int mask);
+
    /**
     * Determine whether a constant has the same value as another constant
     *
diff --git a/src/glsl/ir_clone.cpp b/src/glsl/ir_clone.cpp
index 5a7a71cf6ba..591fe7b7756 100644
--- a/src/glsl/ir_clone.cpp
+++ b/src/glsl/ir_clone.cpp
@@ -53,6 +53,7 @@ ir_variable::clone(void *mem_ctx, struct hash_table *ht) const
    var->invariant = this->invariant;
    var->interpolation = this->interpolation;
    var->location = this->location;
+   var->index = this->index;
    var->warn_extension = this->warn_extension;
    var->origin_upper_left = this->origin_upper_left;
    var->pixel_center_integer = this->pixel_center_integer;
@@ -72,12 +73,6 @@ ir_variable::clone(void *mem_ctx, struct hash_table *ht) const
 	     sizeof(this->state_slots[0]) * var->num_state_slots);
    }
 
-   if (this->explicit_location)
-      var->location = this->location;
-
-   if (this->explicit_index)
-      var->index = this->index;
-
    if (this->constant_value)
       var->constant_value = this->constant_value->clone(mem_ctx, ht);
 
@@ -329,6 +324,7 @@ ir_function_signature::clone_prototype(void *mem_ctx, struct hash_table *ht) con
 
    copy->is_defined = false;
    copy->is_builtin = this->is_builtin;
+   copy->origin = this;
 
    /* Clone the parameter list, but NOT the body.
     */
diff --git a/src/glsl/ir_constant_expression.cpp b/src/glsl/ir_constant_expression.cpp
index 4e1714a8420..08a33285b3a 100644
--- a/src/glsl/ir_constant_expression.cpp
+++ b/src/glsl/ir_constant_expression.cpp
@@ -38,6 +38,7 @@
 #include "ir.h"
 #include "ir_visitor.h"
 #include "glsl_types.h"
+#include "program/hash_table.h"
 
 /* Using C99 rounding functions for roundToEven() implementation is
  * difficult, because round(), rint, and nearbyint() are affected by
@@ -71,14 +72,14 @@ dot(ir_constant *op0, ir_constant *op1)
 }
 
 ir_constant *
-ir_rvalue::constant_expression_value()
+ir_rvalue::constant_expression_value(struct hash_table *variable_context)
 {
    assert(this->type->is_error());
    return NULL;
 }
 
 ir_constant *
-ir_expression::constant_expression_value()
+ir_expression::constant_expression_value(struct hash_table *variable_context)
 {
    if (this->type->is_error())
       return NULL;
@@ -89,7 +90,7 @@ ir_expression::constant_expression_value()
    memset(&data, 0, sizeof(data));
 
    for (unsigned operand = 0; operand < this->get_num_operands(); operand++) {
-      op[operand] = this->operands[operand]->constant_expression_value();
+      op[operand] = this->operands[operand]->constant_expression_value(variable_context);
       if (!op[operand])
 	 return NULL;
    }
@@ -640,13 +641,13 @@ ir_expression::constant_expression_value()
       for (unsigned c = 0; c < op[0]->type->components(); c++) {
 	 switch (op[0]->type->base_type) {
 	 case GLSL_TYPE_UINT:
-	    data.b[0] = op[0]->value.u[0] < op[1]->value.u[0];
+	    data.b[c] = op[0]->value.u[c] < op[1]->value.u[c];
 	    break;
 	 case GLSL_TYPE_INT:
-	    data.b[0] = op[0]->value.i[0] < op[1]->value.i[0];
+	    data.b[c] = op[0]->value.i[c] < op[1]->value.i[c];
 	    break;
 	 case GLSL_TYPE_FLOAT:
-	    data.b[0] = op[0]->value.f[0] < op[1]->value.f[0];
+	    data.b[c] = op[0]->value.f[c] < op[1]->value.f[c];
 	    break;
 	 default:
 	    assert(0);
@@ -676,13 +677,13 @@ ir_expression::constant_expression_value()
       for (unsigned c = 0; c < op[0]->type->components(); c++) {
 	 switch (op[0]->type->base_type) {
 	 case GLSL_TYPE_UINT:
-	    data.b[0] = op[0]->value.u[0] <= op[1]->value.u[0];
+	    data.b[c] = op[0]->value.u[c] <= op[1]->value.u[c];
 	    break;
 	 case GLSL_TYPE_INT:
-	    data.b[0] = op[0]->value.i[0] <= op[1]->value.i[0];
+	    data.b[c] = op[0]->value.i[c] <= op[1]->value.i[c];
 	    break;
 	 case GLSL_TYPE_FLOAT:
-	    data.b[0] = op[0]->value.f[0] <= op[1]->value.f[0];
+	    data.b[c] = op[0]->value.f[c] <= op[1]->value.f[c];
 	    break;
 	 default:
 	    assert(0);
@@ -694,13 +695,13 @@ ir_expression::constant_expression_value()
       for (unsigned c = 0; c < op[0]->type->components(); c++) {
 	 switch (op[0]->type->base_type) {
 	 case GLSL_TYPE_UINT:
-	    data.b[0] = op[0]->value.u[0] >= op[1]->value.u[0];
+	    data.b[c] = op[0]->value.u[c] >= op[1]->value.u[c];
 	    break;
 	 case GLSL_TYPE_INT:
-	    data.b[0] = op[0]->value.i[0] >= op[1]->value.i[0];
+	    data.b[c] = op[0]->value.i[c] >= op[1]->value.i[c];
 	    break;
 	 case GLSL_TYPE_FLOAT:
-	    data.b[0] = op[0]->value.f[0] >= op[1]->value.f[0];
+	    data.b[c] = op[0]->value.f[c] >= op[1]->value.f[c];
 	    break;
 	 default:
 	    assert(0);
@@ -886,7 +887,7 @@ ir_expression::constant_expression_value()
 
 
 ir_constant *
-ir_texture::constant_expression_value()
+ir_texture::constant_expression_value(struct hash_table *variable_context)
 {
    /* texture lookups aren't constant expressions */
    return NULL;
@@ -894,9 +895,9 @@ ir_texture::constant_expression_value()
 
 
 ir_constant *
-ir_swizzle::constant_expression_value()
+ir_swizzle::constant_expression_value(struct hash_table *variable_context)
 {
-   ir_constant *v = this->val->constant_expression_value();
+   ir_constant *v = this->val->constant_expression_value(variable_context);
 
    if (v != NULL) {
       ir_constant_data data = { { 0 } };
@@ -922,13 +923,33 @@ ir_swizzle::constant_expression_value()
 }
 
 
+void
+ir_dereference_variable::constant_referenced(struct hash_table *variable_context,
+					     ir_constant *&store, int &offset) const
+{
+   if (variable_context) {
+      store = (ir_constant *)hash_table_find(variable_context, var);
+      offset = 0;
+   } else {
+      store = NULL;
+      offset = 0;
+   }
+}
+
 ir_constant *
-ir_dereference_variable::constant_expression_value()
+ir_dereference_variable::constant_expression_value(struct hash_table *variable_context)
 {
    /* This may occur during compile and var->type is glsl_type::error_type */
    if (!var)
       return NULL;
 
+   /* Give priority to the context hashtable, if it exists */
+   if (variable_context) {
+      ir_constant *value = (ir_constant *)hash_table_find(variable_context, var);
+      if(value)
+	 return value;
+   }
+
    /* The constant_value of a uniform variable is its initializer,
     * not the lifetime constant value of the uniform.
     */
@@ -942,11 +963,65 @@ ir_dereference_variable::constant_expression_value()
 }
 
 
+/**
+ * Locate the constant storage addressed by an array-style dereference.
+ *
+ * On success \c store is the ir_constant backing the subscripted value and
+ * \c offset is the first component addressed inside it: zero for an array
+ * element, the start of the selected column for a matrix, and the enclosing
+ * offset plus the component index for a vector.  If the index is not a
+ * constant scalar integer, or the base cannot be resolved, \c store is NULL.
+ */
+void
+ir_dereference_array::constant_referenced(struct hash_table *variable_context,
+					  ir_constant *&store, int &offset) const
+{
+   /* Report "no storage" unless every step below succeeds. */
+   store = NULL;
+   offset = 0;
+
+   /* The subscript must itself fold to a scalar integer constant. */
+   ir_constant *index_c = array_index->constant_expression_value(variable_context);
+   if (!index_c || !index_c->type->is_scalar() || !index_c->type->is_integer())
+      return;
+
+   int index = index_c->type->base_type == GLSL_TYPE_INT ?
+      index_c->get_int_component(0) :
+      index_c->get_uint_component(0);
+
+   /* The subscripted value must be a dereference we can chase further. */
+   const ir_dereference *deref = array->as_dereference();
+   if (!deref)
+      return;
+
+   ir_constant *substore;
+   int suboffset;
+   deref->constant_referenced(variable_context, substore, suboffset);
+   if (!substore)
+      return;
+
+   /* Classify by the type of the subscripted r-value, not of the backing
+    * constant: for m[i][j] the inner lookup hands back the whole matrix,
+    * yet the outer subscript picks a component of the column vector.
+    */
+   const glsl_type *vt = array->type;
+   if (vt->is_array()) {
+      store = substore->get_array_element(index);
+      offset = 0;
+   } else if (vt->is_matrix()) {
+      store = substore;
+      offset = index * vt->vector_elements;
+   } else if (vt->is_vector()) {
+      store = substore;
+      offset = suboffset + index;
+   }
+}
+
 ir_constant *
-ir_dereference_array::constant_expression_value()
+ir_dereference_array::constant_expression_value(struct hash_table *variable_context)
 {
-   ir_constant *array = this->array->constant_expression_value();
-   ir_constant *idx = this->array_index->constant_expression_value();
+   ir_constant *array = this->array->constant_expression_value(variable_context);
+   ir_constant *idx = this->array_index->constant_expression_value(variable_context);
 
    if ((array != NULL) && (idx != NULL)) {
       void *ctx = ralloc_parent(this);
@@ -997,8 +1072,33 @@ ir_dereference_array::constant_expression_value()
 }
 
 
+void
+ir_dereference_record::constant_referenced(struct hash_table *variable_context,
+					   ir_constant *&store, int &offset) const
+{
+   ir_constant *substore;
+   int suboffset;
+   const ir_dereference *deref = record->as_dereference();
+   if (!deref) {
+      store = 0;
+      offset = 0;
+      return;
+   }
+
+   deref->constant_referenced(variable_context, substore, suboffset);
+
+   if (!substore) {
+      store = 0;
+      offset = 0;
+      return;
+   }
+
+   store = substore->get_record_field(field);
+   offset = 0;
+}
+
 ir_constant *
-ir_dereference_record::constant_expression_value()
+ir_dereference_record::constant_expression_value(struct hash_table *variable_context)
 {
-   ir_constant *v = this->record->constant_expression_value();
+   ir_constant *v = this->record->constant_expression_value(variable_context);
 
@@ -1007,7 +1107,7 @@ ir_dereference_record::constant_expression_value()
 
 
 ir_constant *
-ir_assignment::constant_expression_value()
+ir_assignment::constant_expression_value(struct hash_table *variable_context)
 {
    /* FINISHME: Handle CEs involving assignment (return RHS) */
    return NULL;
@@ -1015,21 +1115,130 @@ ir_assignment::constant_expression_value()
 
 
 ir_constant *
-ir_constant::constant_expression_value()
+ir_constant::constant_expression_value(struct hash_table *variable_context)
 {
    return this;
 }
 
 
 ir_constant *
-ir_call::constant_expression_value()
+ir_call::constant_expression_value(struct hash_table *variable_context)
 {
-   return this->callee->constant_expression_value(&this->actual_parameters);
+   return this->callee->constant_expression_value(&this->actual_parameters, variable_context);
 }
 
 
+bool ir_function_signature::constant_expression_evaluate_expression_list(const struct exec_list &body,
+									 struct hash_table *variable_context,
+									 ir_constant **result)
+{
+   foreach_list(n, &body) {
+      ir_instruction *inst = (ir_instruction *)n;
+      switch(inst->ir_type) {
+
+	 /* (declare () type symbol) */
+      case ir_type_variable: {
+	 ir_variable *var = inst->as_variable();
+	 hash_table_insert(variable_context, ir_constant::zero(this, var->type), var);
+	 break;
+      }
+
+	 /* (assign [condition] (write-mask) (ref) (value)) */
+      case ir_type_assignment: {
+	 ir_assignment *asg = inst->as_assignment();
+	 if (asg->condition) {
+	    ir_constant *cond = asg->condition->constant_expression_value(variable_context);
+	    if (!cond)
+	       return false;
+	    if (!cond->get_bool_component(0))
+	       break;
+	 }
+
+	 ir_constant *store = NULL;
+	 int offset = 0;
+	 asg->lhs->constant_referenced(variable_context, store, offset);
+
+	 if (!store)
+	    return false;
+
+	 ir_constant *value = asg->rhs->constant_expression_value(variable_context);
+
+	 if (!value)
+	    return false;
+
+	 store->copy_masked_offset(value, offset, asg->write_mask);
+	 break;
+      }
+
+	 /* (return (expression)) */
+      case ir_type_return:
+	 assert (result);
+	 *result = inst->as_return()->value->constant_expression_value(variable_context);
+	 return *result != NULL;
+
+	 /* (call name (ref) (params))*/
+      case ir_type_call: {
+	 ir_call *call = inst->as_call();
+
+	 /* Just say no to void functions in constant expressions.  We
+	  * don't need them at that point.
+	  */
+
+	 if (!call->return_deref)
+	    return false;
+
+	 ir_constant *store = NULL;
+	 int offset = 0;
+	 call->return_deref->constant_referenced(variable_context, store, offset);
+
+	 if (!store)
+	    return false;
+
+	 ir_constant *value = call->constant_expression_value(variable_context);
+
+	 if(!value)
+	    return false;
+
+	 store->copy_offset(value, offset);
+	 break;
+      }
+
+	 /* (if condition (then-instructions) (else-instructions)) */
+      case ir_type_if: {
+	 ir_if *iif = inst->as_if();
+
+	 ir_constant *cond = iif->condition->constant_expression_value(variable_context);
+	 if (!cond || !cond->type->is_boolean())
+	    return false;
+
+	 exec_list &branch = cond->get_bool_component(0) ? iif->then_instructions : iif->else_instructions;
+
+	 *result = NULL;
+	 if (!constant_expression_evaluate_expression_list(branch, variable_context, result))
+	    return false;
+
+	 /* If there was a return in the branch chosen, drop out now. */
+	 if (*result)
+	    return true;
+
+	 break;
+      }
+
+	 /* Every other expression type, we drop out. */
+      default:
+	 return false;
+      }
+   }
+
+   /* Reaching the end of the block is not an error condition */
+   if (result)
+      *result = NULL;
+
+   return true;
+}
+
 ir_constant *
-ir_function_signature::constant_expression_value(exec_list *actual_parameters)
+ir_function_signature::constant_expression_value(exec_list *actual_parameters, struct hash_table *variable_context)
 {
    const glsl_type *type = this->return_type;
    if (type == glsl_type::void_type)
@@ -1042,396 +1251,51 @@ ir_function_signature::constant_expression_value(exec_list *actual_parameters)
    if (!this->is_builtin)
       return NULL;
 
-   unsigned num_parameters = 0;
+   /*
+    * Of the builtin functions, only the texture lookups and the noise
+    * ones must not be used in constant expressions.  They all include
+    * specific opcodes so they don't need to be special-cased at this
+    * point.
+    */
+
+   /* Initialize the table of dereferenceable names with the function
+    * parameters.  Verify their const-ness on the way.
+    *
+    * We expect the correctness of the number of parameters to have
+    * been checked earlier.
+    */
+   hash_table *deref_hash = hash_table_ctor(8, hash_table_pointer_hash,
+					    hash_table_pointer_compare);
+
+   /* If "origin" is non-NULL, then the function body is there.  So we
+    * have to use the variable objects from the object with the body,
+    * but the parameter instantiation on the current object.
+    */
+   const exec_node *parameter_info = origin ? origin->parameters.head : parameters.head;
 
-   /* Check if all parameters are constant */
-   ir_constant *op[3];
    foreach_list(n, actual_parameters) {
-      ir_constant *constant = ((ir_rvalue *) n)->constant_expression_value();
-      if (constant == NULL)
-	 return NULL;
+      ir_constant *constant = ((ir_rvalue *) n)->constant_expression_value(variable_context);
+      if (constant == NULL) {
+	 /* Don't leak the parameter table on the non-constant path. */
+	 hash_table_dtor(deref_hash);
+	 return NULL;
+      }
 
-      op[num_parameters] = constant;
+      ir_variable *var = (ir_variable *)parameter_info;
+      hash_table_insert(deref_hash, constant, var);
 
-      assert(num_parameters < 3);
-      num_parameters++;
+      parameter_info = parameter_info->next;
    }
 
-   /* Individual cases below can either:
-    * - Assign "expr" a new ir_expression to evaluate (for basic opcodes)
-    * - Fill "data" with appopriate constant data
-    * - Return an ir_constant directly.
-    */
-   void *mem_ctx = ralloc_parent(this);
-   ir_expression *expr = NULL;
-
-   ir_constant_data data;
-   memset(&data, 0, sizeof(data));
-
-   const char *callee = this->function_name();
-   if (strcmp(callee, "abs") == 0) {
-      expr = new(mem_ctx) ir_expression(ir_unop_abs, type, op[0], NULL);
-   } else if (strcmp(callee, "all") == 0) {
-      assert(op[0]->type->is_boolean());
-      for (unsigned c = 0; c < op[0]->type->components(); c++) {
-	 if (!op[0]->value.b[c])
-	    return new(mem_ctx) ir_constant(false);
-      }
-      return new(mem_ctx) ir_constant(true);
-   } else if (strcmp(callee, "any") == 0) {
-      assert(op[0]->type->is_boolean());
-      for (unsigned c = 0; c < op[0]->type->components(); c++) {
-	 if (op[0]->value.b[c])
-	    return new(mem_ctx) ir_constant(true);
-      }
-      return new(mem_ctx) ir_constant(false);
-   } else if (strcmp(callee, "acos") == 0) {
-      assert(op[0]->type->is_float());
-      for (unsigned c = 0; c < op[0]->type->components(); c++)
-	 data.f[c] = acosf(op[0]->value.f[c]);
-   } else if (strcmp(callee, "acosh") == 0) {
-      assert(op[0]->type->is_float());
-      for (unsigned c = 0; c < op[0]->type->components(); c++)
-	 data.f[c] = acoshf(op[0]->value.f[c]);
-   } else if (strcmp(callee, "asin") == 0) {
-      assert(op[0]->type->is_float());
-      for (unsigned c = 0; c < op[0]->type->components(); c++)
-	 data.f[c] = asinf(op[0]->value.f[c]);
-   } else if (strcmp(callee, "asinh") == 0) {
-      assert(op[0]->type->is_float());
-      for (unsigned c = 0; c < op[0]->type->components(); c++)
-	 data.f[c] = asinhf(op[0]->value.f[c]);
-   } else if (strcmp(callee, "atan") == 0) {
-      assert(op[0]->type->is_float());
-      if (num_parameters == 2) {
-	 assert(op[1]->type->is_float());
-	 for (unsigned c = 0; c < op[0]->type->components(); c++)
-	    data.f[c] = atan2f(op[0]->value.f[c], op[1]->value.f[c]);
-      } else {
-	 for (unsigned c = 0; c < op[0]->type->components(); c++)
-	    data.f[c] = atanf(op[0]->value.f[c]);
-      }
-   } else if (strcmp(callee, "atanh") == 0) {
-      assert(op[0]->type->is_float());
-      for (unsigned c = 0; c < op[0]->type->components(); c++)
-	 data.f[c] = atanhf(op[0]->value.f[c]);
-   } else if (strcmp(callee, "dFdx") == 0 || strcmp(callee, "dFdy") == 0) {
-      return ir_constant::zero(mem_ctx, type);
-   } else if (strcmp(callee, "ceil") == 0) {
-      expr = new(mem_ctx) ir_expression(ir_unop_ceil, type, op[0], NULL);
-   } else if (strcmp(callee, "clamp") == 0) {
-      assert(num_parameters == 3);
-      unsigned c1_inc = op[1]->type->is_scalar() ? 0 : 1;
-      unsigned c2_inc = op[2]->type->is_scalar() ? 0 : 1;
-      for (unsigned c = 0, c1 = 0, c2 = 0;
-	   c < op[0]->type->components();
-	   c1 += c1_inc, c2 += c2_inc, c++) {
-
-	 switch (op[0]->type->base_type) {
-	 case GLSL_TYPE_UINT:
-	    data.u[c] = CLAMP(op[0]->value.u[c], op[1]->value.u[c1],
-			      op[2]->value.u[c2]);
-	    break;
-	 case GLSL_TYPE_INT:
-	    data.i[c] = CLAMP(op[0]->value.i[c], op[1]->value.i[c1],
-			      op[2]->value.i[c2]);
-	    break;
-	 case GLSL_TYPE_FLOAT:
-	    data.f[c] = CLAMP(op[0]->value.f[c], op[1]->value.f[c1],
-			      op[2]->value.f[c2]);
-	    break;
-	 default:
-	    assert(!"Should not get here.");
-	 }
-      }
-   } else if (strcmp(callee, "cos") == 0) {
-      expr = new(mem_ctx) ir_expression(ir_unop_cos, type, op[0], NULL);
-   } else if (strcmp(callee, "cosh") == 0) {
-      assert(op[0]->type->is_float());
-      for (unsigned c = 0; c < op[0]->type->components(); c++)
-	 data.f[c] = coshf(op[0]->value.f[c]);
-   } else if (strcmp(callee, "cross") == 0) {
-      assert(op[0]->type == glsl_type::vec3_type);
-      assert(op[1]->type == glsl_type::vec3_type);
-      data.f[0] = (op[0]->value.f[1] * op[1]->value.f[2] -
-		   op[1]->value.f[1] * op[0]->value.f[2]);
-      data.f[1] = (op[0]->value.f[2] * op[1]->value.f[0] -
-		   op[1]->value.f[2] * op[0]->value.f[0]);
-      data.f[2] = (op[0]->value.f[0] * op[1]->value.f[1] -
-		   op[1]->value.f[0] * op[0]->value.f[1]);
-   } else if (strcmp(callee, "degrees") == 0) {
-      assert(op[0]->type->is_float());
-      for (unsigned c = 0; c < op[0]->type->components(); c++)
-	 data.f[c] = 180.0F / M_PI * op[0]->value.f[c];
-   } else if (strcmp(callee, "distance") == 0) {
-      assert(op[0]->type->is_float() && op[1]->type->is_float());
-      float length_squared = 0.0;
-      for (unsigned c = 0; c < op[0]->type->components(); c++) {
-	 float t = op[0]->value.f[c] - op[1]->value.f[c];
-	 length_squared += t * t;
-      }
-      return new(mem_ctx) ir_constant(sqrtf(length_squared));
-   } else if (strcmp(callee, "dot") == 0) {
-      return new(mem_ctx) ir_constant(dot(op[0], op[1]));
-   } else if (strcmp(callee, "equal") == 0) {
-      assert(op[0]->type->is_vector() && op[1] && op[1]->type->is_vector());
-      for (unsigned c = 0; c < op[0]->type->components(); c++) {
-	 switch (op[0]->type->base_type) {
-	 case GLSL_TYPE_UINT:
-	    data.b[c] = op[0]->value.u[c] == op[1]->value.u[c];
-	    break;
-	 case GLSL_TYPE_INT:
-	    data.b[c] = op[0]->value.i[c] == op[1]->value.i[c];
-	    break;
-	 case GLSL_TYPE_FLOAT:
-	    data.b[c] = op[0]->value.f[c] == op[1]->value.f[c];
-	    break;
-	 case GLSL_TYPE_BOOL:
-	    data.b[c] = op[0]->value.b[c] == op[1]->value.b[c];
-	    break;
-	 default:
-	    assert(!"Should not get here.");
-	 }
-      }
-   } else if (strcmp(callee, "exp") == 0) {
-      expr = new(mem_ctx) ir_expression(ir_unop_exp, type, op[0], NULL);
-   } else if (strcmp(callee, "exp2") == 0) {
-      expr = new(mem_ctx) ir_expression(ir_unop_exp2, type, op[0], NULL);
-   } else if (strcmp(callee, "faceforward") == 0) {
-      if (dot(op[2], op[1]) < 0)
-	 return op[0];
-      for (unsigned c = 0; c < op[0]->type->components(); c++)
-	 data.f[c] = -op[0]->value.f[c];
-   } else if (strcmp(callee, "floor") == 0) {
-      expr = new(mem_ctx) ir_expression(ir_unop_floor, type, op[0], NULL);
-   } else if (strcmp(callee, "fract") == 0) {
-      expr = new(mem_ctx) ir_expression(ir_unop_fract, type, op[0], NULL);
-   } else if (strcmp(callee, "fwidth") == 0) {
-      return ir_constant::zero(mem_ctx, type);
-   } else if (strcmp(callee, "greaterThan") == 0) {
-      assert(op[0]->type->is_vector() && op[1] && op[1]->type->is_vector());
-      for (unsigned c = 0; c < op[0]->type->components(); c++) {
-	 switch (op[0]->type->base_type) {
-	 case GLSL_TYPE_UINT:
-	    data.b[c] = op[0]->value.u[c] > op[1]->value.u[c];
-	    break;
-	 case GLSL_TYPE_INT:
-	    data.b[c] = op[0]->value.i[c] > op[1]->value.i[c];
-	    break;
-	 case GLSL_TYPE_FLOAT:
-	    data.b[c] = op[0]->value.f[c] > op[1]->value.f[c];
-	    break;
-	 default:
-	    assert(!"Should not get here.");
-	 }
-      }
-   } else if (strcmp(callee, "greaterThanEqual") == 0) {
-      assert(op[0]->type->is_vector() && op[1] && op[1]->type->is_vector());
-      for (unsigned c = 0; c < op[0]->type->components(); c++) {
-	 switch (op[0]->type->base_type) {
-	 case GLSL_TYPE_UINT:
-	    data.b[c] = op[0]->value.u[c] >= op[1]->value.u[c];
-	    break;
-	 case GLSL_TYPE_INT:
-	    data.b[c] = op[0]->value.i[c] >= op[1]->value.i[c];
-	    break;
-	 case GLSL_TYPE_FLOAT:
-	    data.b[c] = op[0]->value.f[c] >= op[1]->value.f[c];
-	    break;
-	 default:
-	    assert(!"Should not get here.");
-	 }
-      }
-   } else if (strcmp(callee, "inversesqrt") == 0) {
-      expr = new(mem_ctx) ir_expression(ir_unop_rsq, type, op[0], NULL);
-   } else if (strcmp(callee, "length") == 0) {
-      return new(mem_ctx) ir_constant(sqrtf(dot(op[0], op[0])));
-   } else if (strcmp(callee, "lessThan") == 0) {
-      assert(op[0]->type->is_vector() && op[1] && op[1]->type->is_vector());
-      for (unsigned c = 0; c < op[0]->type->components(); c++) {
-	 switch (op[0]->type->base_type) {
-	 case GLSL_TYPE_UINT:
-	    data.b[c] = op[0]->value.u[c] < op[1]->value.u[c];
-	    break;
-	 case GLSL_TYPE_INT:
-	    data.b[c] = op[0]->value.i[c] < op[1]->value.i[c];
-	    break;
-	 case GLSL_TYPE_FLOAT:
-	    data.b[c] = op[0]->value.f[c] < op[1]->value.f[c];
-	    break;
-	 default:
-	    assert(!"Should not get here.");
-	 }
-      }
-   } else if (strcmp(callee, "lessThanEqual") == 0) {
-      assert(op[0]->type->is_vector() && op[1] && op[1]->type->is_vector());
-      for (unsigned c = 0; c < op[0]->type->components(); c++) {
-	 switch (op[0]->type->base_type) {
-	 case GLSL_TYPE_UINT:
-	    data.b[c] = op[0]->value.u[c] <= op[1]->value.u[c];
-	    break;
-	 case GLSL_TYPE_INT:
-	    data.b[c] = op[0]->value.i[c] <= op[1]->value.i[c];
-	    break;
-	 case GLSL_TYPE_FLOAT:
-	    data.b[c] = op[0]->value.f[c] <= op[1]->value.f[c];
-	    break;
-	 default:
-	    assert(!"Should not get here.");
-	 }
-      }
-   } else if (strcmp(callee, "log") == 0) {
-      expr = new(mem_ctx) ir_expression(ir_unop_log, type, op[0], NULL);
-   } else if (strcmp(callee, "log2") == 0) {
-      expr = new(mem_ctx) ir_expression(ir_unop_log2, type, op[0], NULL);
-   } else if (strcmp(callee, "matrixCompMult") == 0) {
-      assert(op[0]->type->is_float() && op[1]->type->is_float());
-      for (unsigned c = 0; c < op[0]->type->components(); c++)
-	 data.f[c] = op[0]->value.f[c] * op[1]->value.f[c];
-   } else if (strcmp(callee, "max") == 0) {
-      expr = new(mem_ctx) ir_expression(ir_binop_max, type, op[0], op[1]);
-   } else if (strcmp(callee, "min") == 0) {
-      expr = new(mem_ctx) ir_expression(ir_binop_min, type, op[0], op[1]);
-   } else if (strcmp(callee, "mix") == 0) {
-      assert(op[0]->type->is_float() && op[1]->type->is_float());
-      if (op[2]->type->is_float()) {
-	 unsigned c2_inc = op[2]->type->is_scalar() ? 0 : 1;
-	 unsigned components = op[0]->type->components();
-	 for (unsigned c = 0, c2 = 0; c < components; c2 += c2_inc, c++) {
-	    data.f[c] = op[0]->value.f[c] * (1 - op[2]->value.f[c2]) +
-			op[1]->value.f[c] * op[2]->value.f[c2];
-	 }
-      } else {
-	 assert(op[2]->type->is_boolean());
-	 for (unsigned c = 0; c < op[0]->type->components(); c++)
-	    data.f[c] = op[op[2]->value.b[c] ? 1 : 0]->value.f[c];
-      }
-   } else if (strcmp(callee, "mod") == 0) {
-      expr = new(mem_ctx) ir_expression(ir_binop_mod, type, op[0], op[1]);
-   } else if (strcmp(callee, "normalize") == 0) {
-      assert(op[0]->type->is_float());
-      float length = sqrtf(dot(op[0], op[0]));
-
-      if (length == 0)
-	 return ir_constant::zero(mem_ctx, type);
+   ir_constant *result = NULL;
 
-      for (unsigned c = 0; c < op[0]->type->components(); c++)
-	 data.f[c] = op[0]->value.f[c] / length;
-   } else if (strcmp(callee, "not") == 0) {
-      expr = new(mem_ctx) ir_expression(ir_unop_logic_not, type, op[0], NULL);
-   } else if (strcmp(callee, "notEqual") == 0) {
-      assert(op[0]->type->is_vector() && op[1] && op[1]->type->is_vector());
-      for (unsigned c = 0; c < op[0]->type->components(); c++) {
-	 switch (op[0]->type->base_type) {
-	 case GLSL_TYPE_UINT:
-	    data.b[c] = op[0]->value.u[c] != op[1]->value.u[c];
-	    break;
-	 case GLSL_TYPE_INT:
-	    data.b[c] = op[0]->value.i[c] != op[1]->value.i[c];
-	    break;
-	 case GLSL_TYPE_FLOAT:
-	    data.b[c] = op[0]->value.f[c] != op[1]->value.f[c];
-	    break;
-	 case GLSL_TYPE_BOOL:
-	    data.b[c] = op[0]->value.b[c] != op[1]->value.b[c];
-	    break;
-	 default:
-	    assert(!"Should not get here.");
-	 }
-      }
-   } else if (strcmp(callee, "outerProduct") == 0) {
-      assert(op[0]->type->is_vector() && op[1]->type->is_vector());
-      const unsigned m = op[0]->type->vector_elements;
-      const unsigned n = op[1]->type->vector_elements;
-      for (unsigned j = 0; j < n; j++) {
-	 for (unsigned i = 0; i < m; i++) {
-	    data.f[i+m*j] = op[0]->value.f[i] * op[1]->value.f[j];
-	 }
-      }
-   } else if (strcmp(callee, "pow") == 0) {
-      expr = new(mem_ctx) ir_expression(ir_binop_pow, type, op[0], op[1]);
-   } else if (strcmp(callee, "radians") == 0) {
-      assert(op[0]->type->is_float());
-      for (unsigned c = 0; c < op[0]->type->components(); c++)
-	 data.f[c] = M_PI / 180.0F * op[0]->value.f[c];
-   } else if (strcmp(callee, "reflect") == 0) {
-      assert(op[0]->type->is_float());
-      float dot_NI = dot(op[1], op[0]);
-      for (unsigned c = 0; c < op[0]->type->components(); c++)
-	 data.f[c] = op[0]->value.f[c] - 2 * dot_NI * op[1]->value.f[c];
-   } else if (strcmp(callee, "refract") == 0) {
-      const float eta = op[2]->value.f[0];
-      const float dot_NI = dot(op[1], op[0]);
-      const float k = 1.0F - eta * eta * (1.0F - dot_NI * dot_NI);
-      if (k < 0.0) {
-	 return ir_constant::zero(mem_ctx, type);
-      } else {
-	 for (unsigned c = 0; c < type->components(); c++) {
-	    data.f[c] = eta * op[0]->value.f[c] - (eta * dot_NI + sqrtf(k))
-			    * op[1]->value.f[c];
-	 }
-      }
-   } else if (strcmp(callee, "round") == 0 ||
-	      strcmp(callee, "roundEven") == 0) {
-      expr = new(mem_ctx) ir_expression(ir_unop_round_even, op[0]);
-   } else if (strcmp(callee, "sign") == 0) {
-      expr = new(mem_ctx) ir_expression(ir_unop_sign, type, op[0], NULL);
-   } else if (strcmp(callee, "sin") == 0) {
-      expr = new(mem_ctx) ir_expression(ir_unop_sin, type, op[0], NULL);
-   } else if (strcmp(callee, "sinh") == 0) {
-      assert(op[0]->type->is_float());
-      for (unsigned c = 0; c < op[0]->type->components(); c++)
-	 data.f[c] = sinhf(op[0]->value.f[c]);
-   } else if (strcmp(callee, "smoothstep") == 0) {
-      assert(num_parameters == 3);
-      assert(op[1]->type == op[0]->type);
-      unsigned edge_inc = op[0]->type->is_scalar() ? 0 : 1;
-      for (unsigned c = 0, e = 0; c < type->components(); e += edge_inc, c++) {
-	 const float edge0 = op[0]->value.f[e];
-	 const float edge1 = op[1]->value.f[e];
-	 if (edge0 == edge1) {
-	    data.f[c] = 0.0; /* Avoid a crash - results are undefined anyway */
-	 } else {
-	    const float numerator = op[2]->value.f[c] - edge0;
-	    const float denominator = edge1 - edge0;
-	    const float t = CLAMP(numerator/denominator, 0, 1);
-	    data.f[c] = t * t * (3 - 2 * t);
-	 }
-      }
-   } else if (strcmp(callee, "sqrt") == 0) {
-      expr = new(mem_ctx) ir_expression(ir_unop_sqrt, type, op[0], NULL);
-   } else if (strcmp(callee, "step") == 0) {
-      assert(op[0]->type->is_float() && op[1]->type->is_float());
-      /* op[0] (edge) may be either a scalar or a vector */
-      const unsigned c0_inc = op[0]->type->is_scalar() ? 0 : 1;
-      for (unsigned c = 0, c0 = 0; c < type->components(); c0 += c0_inc, c++)
-	 data.f[c] = (op[1]->value.f[c] < op[0]->value.f[c0]) ? 0.0F : 1.0F;
-   } else if (strcmp(callee, "tan") == 0) {
-      assert(op[0]->type->is_float());
-      for (unsigned c = 0; c < op[0]->type->components(); c++)
-	 data.f[c] = tanf(op[0]->value.f[c]);
-   } else if (strcmp(callee, "tanh") == 0) {
-      assert(op[0]->type->is_float());
-      for (unsigned c = 0; c < op[0]->type->components(); c++)
-	 data.f[c] = tanhf(op[0]->value.f[c]);
-   } else if (strcmp(callee, "transpose") == 0) {
-      assert(op[0]->type->is_matrix());
-      const unsigned n = op[0]->type->vector_elements;
-      const unsigned m = op[0]->type->matrix_columns;
-      for (unsigned j = 0; j < m; j++) {
-	 for (unsigned i = 0; i < n; i++) {
-	    data.f[m*i+j] += op[0]->value.f[i+n*j];
-	 }
-      }
-   } else if (strcmp(callee, "trunc") == 0) {
-      expr = new(mem_ctx) ir_expression(ir_unop_trunc, op[0]);
-   } else {
-      /* Unsupported builtin - some are not allowed in constant expressions. */
-      return NULL;
-   }
+   /* Now run the builtin function until something non-constant
+    * happens or we get the result.
+    */
+   if (constant_expression_evaluate_expression_list(origin ? origin->body : body, deref_hash, &result) && result)
+      result = result->clone(ralloc_parent(this), NULL);
 
-   if (expr != NULL)
-      return expr->constant_expression_value();
+   hash_table_dtor(deref_hash);
 
-   return new(mem_ctx) ir_constant(type, &data);
+   return result;
 }
diff --git a/src/glsl/opt_copy_propagation_elements.cpp b/src/glsl/opt_copy_propagation_elements.cpp
index 314db4e187f..11d9d7baf72 100644
--- a/src/glsl/opt_copy_propagation_elements.cpp
+++ b/src/glsl/opt_copy_propagation_elements.cpp
@@ -93,6 +93,7 @@ public:
    ir_copy_propagation_elements_visitor()
    {
       this->progress = false;
+      this->killed_all = false;
       this->mem_ctx = ralloc_context(NULL);
       this->shader_mem_ctx = NULL;
       this->acp = new(mem_ctx) exec_list;
diff --git a/src/glsl/standalone_scaffolding.cpp b/src/glsl/standalone_scaffolding.cpp
index 24cc64ad97b..f15f2d882de 100644
--- a/src/glsl/standalone_scaffolding.cpp
+++ b/src/glsl/standalone_scaffolding.cpp
@@ -41,6 +41,12 @@ _mesa_reference_shader(struct gl_context *ctx, struct gl_shader **ptr,
    *ptr = sh;
 }
 
+void
+_mesa_shader_debug(struct gl_context *, GLenum, GLuint,
+                   const char *, int)
+{
+}
+
 struct gl_shader *
 _mesa_new_shader(struct gl_context *ctx, GLuint name, GLenum type)
 {
diff --git a/src/glsl/standalone_scaffolding.h b/src/glsl/standalone_scaffolding.h
index 87733200670..41ce35befc6 100644
--- a/src/glsl/standalone_scaffolding.h
+++ b/src/glsl/standalone_scaffolding.h
@@ -40,6 +40,10 @@ _mesa_reference_shader(struct gl_context *ctx, struct gl_shader **ptr,
 extern "C" struct gl_shader *
 _mesa_new_shader(struct gl_context *ctx, GLuint name, GLenum type);
 
+extern "C" void
+_mesa_shader_debug(struct gl_context *ctx, GLenum type, GLuint id,
+                   const char *msg, int len);
+
 /**
  * Initialize the given gl_context structure to a reasonable set of
  * defaults representing the minimum capabilities required by the
diff --git a/src/glx/apple/Makefile b/src/glx/apple/Makefile
index dc64295f73c..68fe6ad745b 100644
--- a/src/glx/apple/Makefile
+++ b/src/glx/apple/Makefile
@@ -26,6 +26,7 @@ SOURCES = \
 	apple_glx.c \
 	apple_glx_context.c \
 	apple_glx_drawable.c \
+	apple_glx_log.c \
 	apple_glx_pbuffer.c \
 	apple_glx_pixmap.c \
 	apple_glx_surface.c \
diff --git a/src/glx/apple/apple_glx.c b/src/glx/apple/apple_glx.c
index d94c1e0fb16..56cff64a15b 100644
--- a/src/glx/apple/apple_glx.c
+++ b/src/glx/apple/apple_glx.c
@@ -33,6 +33,8 @@
 #include <assert.h>
 #include <stdarg.h>
 #include <dlfcn.h>
+#include <pthread.h>
+#include <inttypes.h>
 #include "appledri.h"
 #include "apple_glx.h"
 #include "apple_glx_context.h"
@@ -43,22 +45,6 @@ static int dri_event_base = 0;
 
 const GLuint __glXDefaultPixelStore[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 1 };
 
-static bool diagnostic = false;
-
-void
-apple_glx_diagnostic(const char *fmt, ...)
-{
-   va_list vl;
-
-   if (diagnostic) {
-      fprintf(stderr, "DIAG: ");
-
-      va_start(vl, fmt);
-      vfprintf(stderr, fmt, vl);
-      va_end(vl);
-   }
-}
-
 int
 apple_get_dri_event_base(void)
 {
@@ -125,10 +111,9 @@ apple_init_glx(Display * dpy)
    if (initialized)
       return false;
 
-   if (getenv("LIBGL_DIAGNOSTIC")) {
-      printf("initializing libGL in %s\n", __func__);
-      diagnostic = true;
-   }
+   apple_glx_log_init();
+
+   apple_glx_log(ASL_LEVEL_INFO, "Initializing libGL.");
 
    apple_cgl_init();
    (void) apple_glx_get_client_id();
diff --git a/src/glx/apple/apple_glx.h b/src/glx/apple/apple_glx.h
index ce8c4884d1a..0967f1812a0 100644
--- a/src/glx/apple/apple_glx.h
+++ b/src/glx/apple/apple_glx.h
@@ -38,7 +38,8 @@
 #define XP_NO_X_HEADERS
 #include <Xplugin.h>
 
-void apple_glx_diagnostic(const char *fmt, ...);
+#include "apple_glx_log.h"
+
 xp_client_id apple_glx_get_client_id(void);
 bool apple_init_glx(Display * dpy);
 void apple_glx_swap_buffers(void *ptr);
diff --git a/src/glx/apple/apple_glx_context.c b/src/glx/apple/apple_glx_context.c
index c58d05a59af..0bb25b42575 100644
--- a/src/glx/apple/apple_glx_context.c
+++ b/src/glx/apple/apple_glx_context.c
@@ -421,7 +421,7 @@ apple_glx_make_current_context(Display * dpy, void *oldptr, void *ptr,
     */
 
    if (same_drawable && ac->is_current) {
-      apple_glx_diagnostic("%s: same_drawable and ac->is_current\n");
+      apple_glx_diagnostic("same_drawable and ac->is_current\n");
       return false;
    }
 
diff --git a/src/glx/apple/apple_glx_drawable.c b/src/glx/apple/apple_glx_drawable.c
index db283023a63..3f84d560c53 100644
--- a/src/glx/apple/apple_glx_drawable.c
+++ b/src/glx/apple/apple_glx_drawable.c
@@ -32,6 +32,7 @@
 #include <stdlib.h>
 #include <assert.h>
 #include <pthread.h>
+#include <string.h>
 #include "apple_glx.h"
 #include "apple_glx_context.h"
 #include "apple_glx_drawable.h"
@@ -48,8 +49,8 @@ lock_drawables_list(void)
    err = pthread_mutex_lock(&drawables_lock);
 
    if (err) {
-      fprintf(stderr, "pthread_mutex_lock failure in %s: %d\n",
-              __func__, err);
+      fprintf(stderr, "pthread_mutex_lock failure in %s: %s\n",
+              __func__, strerror(err));
       abort();
    }
 }
@@ -62,8 +63,8 @@ unlock_drawables_list(void)
    err = pthread_mutex_unlock(&drawables_lock);
 
    if (err) {
-      fprintf(stderr, "pthread_mutex_unlock failure in %s: %d\n",
-              __func__, err);
+      fprintf(stderr, "pthread_mutex_unlock failure in %s: %s\n",
+              __func__, strerror(err));
       abort();
    }
 }
@@ -95,7 +96,7 @@ drawable_lock(struct apple_glx_drawable *agd)
    err = pthread_mutex_lock(&agd->mutex);
 
    if (err) {
-      fprintf(stderr, "pthread_mutex_lock error: %d\n", err);
+      fprintf(stderr, "pthread_mutex_lock error: %s\n", strerror(err));
       abort();
    }
 }
@@ -108,7 +109,7 @@ drawable_unlock(struct apple_glx_drawable *d)
    err = pthread_mutex_unlock(&d->mutex);
 
    if (err) {
-      fprintf(stderr, "pthread_mutex_unlock error: %d\n", err);
+      fprintf(stderr, "pthread_mutex_unlock error: %s\n", strerror(err));
       abort();
    }
 }
@@ -245,7 +246,7 @@ common_init(Display * dpy, GLXDrawable drawable, struct apple_glx_drawable *d)
    err = pthread_mutexattr_init(&attr);
 
    if (err) {
-      fprintf(stderr, "pthread_mutexattr_init error: %d\n", err);
+      fprintf(stderr, "pthread_mutexattr_init error: %s\n", strerror(err));
       abort();
    }
 
@@ -257,14 +258,14 @@ common_init(Display * dpy, GLXDrawable drawable, struct apple_glx_drawable *d)
    err = pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
 
    if (err) {
-      fprintf(stderr, "error: setting pthread mutex type: %d\n", err);
+      fprintf(stderr, "error: setting pthread mutex type: %s\n", strerror(err));
       abort();
    }
 
    err = pthread_mutex_init(&d->mutex, &attr);
 
    if (err) {
-      fprintf(stderr, "pthread_mutex_init error: %d\n", err);
+      fprintf(stderr, "pthread_mutex_init error: %s\n", strerror(err));
       abort();
    }
 
diff --git a/src/glx/apple/apple_glx_log.c b/src/glx/apple/apple_glx_log.c
new file mode 100644
index 00000000000..9ebf666c9b4
--- /dev/null
+++ b/src/glx/apple/apple_glx_log.c
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2012 Apple Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation files
+ * (the "Software"), to deal in the Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge,
+ * publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so,
+ * subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE ABOVE LISTED COPYRIGHT
+ * HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Except as contained in this notice, the name(s) of the above
+ * copyright holders shall not be used in advertising or otherwise to
+ * promote the sale, use or other dealings in this Software without
+ * prior written authorization.
+ */
+
+#include <sys/cdefs.h>
+#include <asl.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <inttypes.h>
+#include <pthread.h>
+#include "apple_glx_log.h"
+
+static bool diagnostic = false;
+static aslclient aslc;
+
+/* Logging setup: latch the LIBGL_DIAGNOSTIC switch and open the ASL client. */
+void apple_glx_log_init(void) {
+    if (getenv("LIBGL_DIAGNOSTIC") != NULL)
+        diagnostic = true;
+
+    aslc = asl_open(NULL, NULL, 0);
+}
+
+void _apple_glx_log(int level, const char *file, const char *function,
+                    int line, const char *fmt, ...) {
+    va_list ap;   /* printf-style front end: forwards "..." to _apple_glx_vlog() */
+    va_start(ap, fmt);
+    _apple_glx_vlog(level, file, function, line, fmt, ap);
+    va_end(ap);
+}
+
+/* Return the ASL_STRING_* name of an ASL priority level, or "unknown". */
+static const char *
+_asl_level_string(int level)
+{
+    static const char *const names[] = {
+        [ASL_LEVEL_EMERG]   = ASL_STRING_EMERG,   [ASL_LEVEL_ALERT]  = ASL_STRING_ALERT,
+        [ASL_LEVEL_CRIT]    = ASL_STRING_CRIT,    [ASL_LEVEL_ERR]    = ASL_STRING_ERR,
+        [ASL_LEVEL_WARNING] = ASL_STRING_WARNING, [ASL_LEVEL_NOTICE] = ASL_STRING_NOTICE,
+        [ASL_LEVEL_INFO]    = ASL_STRING_INFO,    [ASL_LEVEL_DEBUG]  = ASL_STRING_DEBUG,
+    };
+    const int count = (int)(sizeof(names) / sizeof(names[0]));
+    return (level >= 0 && level < count && names[level]) ? names[level] : "unknown";
+}
+
+/* Log one message to ASL (tagged with file/function/line/thread) and, when
+ * LIBGL_DIAGNOSTIC enabled diagnostics, echo it to stderr.  'args' is left
+ * for the caller to va_end. */
+void _apple_glx_vlog(int level, const char *file, const char *function,
+                     int line, const char *fmt, va_list args) {
+    aslmsg msg;
+    char num[32];   /* large enough for any int or uint64_t in decimal */
+    uint64_t thread = 0;
+    const bool threaded = pthread_is_threaded_np() != 0;
+
+    if (threaded)
+        pthread_threadid_np(NULL, &thread);
+
+    if (diagnostic) {
+        va_list args2;
+
+        /* 'args' is still needed by asl_vlog() below, so print from a copy. */
+        va_copy(args2, args);
+        fprintf(stderr, "%-9s %24s:%-4d %s(%"PRIu64"): ",
+                _asl_level_string(level), file ? file : "(null)", line,
+                function ? function : "(null)", thread);
+        vfprintf(stderr, fmt, args2);
+        va_end(args2);   /* every va_copy needs a matching va_end */
+    }
+
+    msg = asl_new(ASL_TYPE_MSG);
+    if (msg) {
+        if (file)
+            asl_set(msg, "File", file);
+        if (function)
+            asl_set(msg, "Function", function);
+        if (line) {
+            snprintf(num, sizeof(num), "%d", line);
+            asl_set(msg, "Line", num);
+        }
+        if (threaded) {
+            snprintf(num, sizeof(num), "%"PRIu64, thread);
+            asl_set(msg, "Thread", num);
+        }
+    }
+
+    /* msg may be NULL here (asl_new failed); it is still handed to asl_vlog(). */
+    asl_vlog(aslc, msg, level, fmt, args);
+    if (msg)
+        asl_free(msg);
+}
diff --git a/src/glx/apple/apple_glx_log.h b/src/glx/apple/apple_glx_log.h
new file mode 100644
index 00000000000..4b1c531d27f
--- /dev/null
+++ b/src/glx/apple/apple_glx_log.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2012 Apple Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation files
+ * (the "Software"), to deal in the Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge,
+ * publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so,
+ * subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE ABOVE LISTED COPYRIGHT
+ * HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Except as contained in this notice, the name(s) of the above
+ * copyright holders shall not be used in advertising or otherwise to
+ * promote the sale, use or other dealings in this Software without
+ * prior written authorization.
+ */
+
+#ifndef APPLE_GLX_LOG_H
+#define APPLE_GLX_LOG_H
+
+#include <sys/cdefs.h>
+#include <asl.h>
+
+void apple_glx_log_init(void);   /* reads LIBGL_DIAGNOSTIC, opens the ASL client */
+
+__printflike(5, 6)   /* fmt is argument 5; checked arguments start at 6 */
+void _apple_glx_log(int level, const char *file, const char *function,
+                    int line, const char *fmt, ...);
+#define apple_glx_log(l, f, args ...) \
+    _apple_glx_log(l, __FILE__, __FUNCTION__, __LINE__, f, ## args)
+
+/* va_list variant; apple_glx_vlog() likewise supplies the call site. */
+__printflike(5, 0)   /* 0: arguments arrive as a va_list, so none are checked */
+void _apple_glx_vlog(int level, const char *file, const char *function,
+                     int line, const char *fmt, va_list v);
+#define apple_glx_vlog(l, f, v) \
+    _apple_glx_vlog(l, __FILE__, __FUNCTION__, __LINE__, f, v)
+
+/* This is just here to help the transition.
+ * TODO: Replace calls to apple_glx_diagnostic
+ */
+#define apple_glx_diagnostic(f, args ...) \
+    apple_glx_log(ASL_LEVEL_DEBUG, f, ## args)
+
+#endif
diff --git a/src/mesa/drivers/dri/i915/i830_state.c b/src/mesa/drivers/dri/i915/i830_state.c
index ea6e0beaeab..6f8bd69481f 100644
--- a/src/mesa/drivers/dri/i915/i830_state.c
+++ b/src/mesa/drivers/dri/i915/i830_state.c
@@ -30,6 +30,7 @@
 #include "main/context.h"
 #include "main/macros.h"
 #include "main/enums.h"
+#include "main/fbobject.h"
 #include "main/dd.h"
 #include "main/state.h"
 
@@ -545,7 +546,7 @@ i830Scissor(struct gl_context * ctx, GLint x, GLint y, GLsizei w, GLsizei h)
 
    DBG("%s %d,%d %dx%d\n", __FUNCTION__, x, y, w, h);
 
-   if (ctx->DrawBuffer->Name == 0) {
+   if (_mesa_is_winsys_fbo(ctx->DrawBuffer)) {
       x1 = x;
       y1 = ctx->DrawBuffer->Height - (y + h);
       x2 = x + w - 1;
diff --git a/src/mesa/drivers/dri/i915/i915_state.c b/src/mesa/drivers/dri/i915/i915_state.c
index 94c7327830b..3ab75a9739c 100644
--- a/src/mesa/drivers/dri/i915/i915_state.c
+++ b/src/mesa/drivers/dri/i915/i915_state.c
@@ -30,6 +30,7 @@
 #include "main/context.h"
 #include "main/macros.h"
 #include "main/enums.h"
+#include "main/fbobject.h"
 #include "main/dd.h"
 #include "main/state.h"
 #include "tnl/tnl.h"
@@ -400,7 +401,7 @@ intelCalcViewport(struct gl_context * ctx)
 {
    struct intel_context *intel = intel_context(ctx);
 
-   if (ctx->DrawBuffer->Name == 0) {
+   if (_mesa_is_winsys_fbo(ctx->DrawBuffer)) {
       _math_matrix_viewport(&intel->ViewportMatrix,
 			    ctx->Viewport.X,
 			    ctx->DrawBuffer->Height - ctx->Viewport.Y,
@@ -518,7 +519,7 @@ i915Scissor(struct gl_context * ctx, GLint x, GLint y, GLsizei w, GLsizei h)
 
    DBG("%s %d,%d %dx%d\n", __FUNCTION__, x, y, w, h);
 
-   if (ctx->DrawBuffer->Name == 0) {
+   if (_mesa_is_winsys_fbo(ctx->DrawBuffer)) {
       x1 = x;
       y1 = ctx->DrawBuffer->Height - (y + h);
       x2 = x + w - 1;
@@ -577,7 +578,7 @@ i915CullFaceFrontFace(struct gl_context * ctx, GLenum unused)
    else if (ctx->Polygon.CullFaceMode != GL_FRONT_AND_BACK) {
       mode = S4_CULLMODE_CW;
 
-      if (ctx->DrawBuffer && ctx->DrawBuffer->Name != 0)
+      if (ctx->DrawBuffer && _mesa_is_user_fbo(ctx->DrawBuffer))
          mode ^= (S4_CULLMODE_CW ^ S4_CULLMODE_CCW);
       if (ctx->Polygon.CullFaceMode == GL_FRONT)
          mode ^= (S4_CULLMODE_CW ^ S4_CULLMODE_CCW);
diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
index c564d95d44b..c99a034a462 100644
--- a/src/mesa/drivers/dri/i965/Makefile.sources
+++ b/src/mesa/drivers/dri/i965/Makefile.sources
@@ -88,7 +88,6 @@ i965_C_FILES = \
 	gen6_clip_state.c \
 	gen6_depthstencil.c \
 	gen6_gs_state.c \
-	gen6_hiz.c \
 	gen6_sampler_state.c \
 	gen6_scissor_state.c \
 	gen6_sf_state.c \
@@ -100,7 +99,6 @@ i965_C_FILES = \
 	gen7_cc_state.c \
 	gen7_clip_state.c \
 	gen7_disable.c \
-	gen7_hiz.c \
 	gen7_misc_state.c \
 	gen7_sampler_state.c \
 	gen7_sf_state.c \
@@ -127,6 +125,8 @@ i965_CXX_FILES = \
 	brw_vec4_emit.cpp \
 	brw_vec4_copy_propagation.cpp \
 	brw_vec4_reg_allocate.cpp \
-	brw_vec4_visitor.cpp
+	brw_vec4_visitor.cpp \
+	gen6_blorp.cpp \
+	gen7_blorp.cpp
 
 i965_ASM_FILES =
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 47f56e21f5d..141fb658d71 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -37,6 +37,9 @@
 #include "brw_structs.h"
 #include "main/imports.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
 
 /* Glossary:
  *
@@ -1188,4 +1191,8 @@ brw_program_reloc(struct brw_context *brw, uint32_t state_offset,
 
 bool brw_do_cubemap_normalize(struct exec_list *instructions);
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_draw.c b/src/mesa/drivers/dri/i965/brw_draw.c
index da37b181f6e..813f7c8e570 100644
--- a/src/mesa/drivers/dri/i965/brw_draw.c
+++ b/src/mesa/drivers/dri/i965/brw_draw.c
@@ -531,7 +531,6 @@ retry:
 }
 
 void brw_draw_prims( struct gl_context *ctx,
-		     const struct gl_client_array *arrays[],
 		     const struct _mesa_prim *prim,
 		     GLuint nr_prims,
 		     const struct _mesa_index_buffer *ib,
@@ -540,6 +539,7 @@ void brw_draw_prims( struct gl_context *ctx,
 		     GLuint max_index,
 		     struct gl_transform_feedback_object *tfb_vertcount )
 {
+   const struct gl_client_array **arrays = ctx->Array._DrawArrays;
    bool retval;
 
    if (!_mesa_check_conditional_render(ctx))
diff --git a/src/mesa/drivers/dri/i965/brw_draw.h b/src/mesa/drivers/dri/i965/brw_draw.h
index b91041932e1..2cc4cb38379 100644
--- a/src/mesa/drivers/dri/i965/brw_draw.h
+++ b/src/mesa/drivers/dri/i965/brw_draw.h
@@ -35,7 +35,6 @@ struct brw_context;
 
 
 void brw_draw_prims( struct gl_context *ctx,
-		     const struct gl_client_array *arrays[],
 		     const struct _mesa_prim *prims,
 		     GLuint nr_prims,
 		     const struct _mesa_index_buffer *ib,
diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h
index c4c62b2bf4a..675b50a1456 100644
--- a/src/mesa/drivers/dri/i965/brw_eu.h
+++ b/src/mesa/drivers/dri/i965/brw_eu.h
@@ -38,6 +38,10 @@
 #include "brw_defines.h"
 #include "program/prog_instruction.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #define BRW_SWIZZLE4(a,b,c,d) (((a)<<0) | ((b)<<2) | ((c)<<4) | ((d)<<6))
 #define BRW_GET_SWZ(swz, idx) (((swz) >> ((idx)*2)) & 0x3)
 
@@ -1107,4 +1111,8 @@ void brw_optimize(struct brw_compile *p);
 void brw_remove_duplicate_mrf_moves(struct brw_compile *p);
 void brw_remove_grf_to_mrf_moves(struct brw_compile *p);
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 8af43000e3d..fd67318f550 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -35,6 +35,7 @@ extern "C" {
 #include "main/macros.h"
 #include "main/shaderobj.h"
 #include "main/uniforms.h"
+#include "main/fbobject.h"
 #include "program/prog_parameter.h"
 #include "program/prog_print.h"
 #include "program/register_allocate.h"
@@ -1828,7 +1829,7 @@ brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
 
    if (fp->Base.InputsRead & FRAG_BIT_WPOS) {
       key.drawable_height = ctx->DrawBuffer->Height;
-      key.render_to_fbo = ctx->DrawBuffer->Name != 0;
+      key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
    }
 
    key.nr_color_regions = 1;
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 6d9a042ff1b..6b45c4ece96 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -582,6 +582,7 @@ public:
    void emit_assignment_writes(fs_reg &l, fs_reg &r,
 			       const glsl_type *type, bool predicated);
    void resolve_ud_negate(fs_reg *reg);
+   void resolve_bool_comparison(ir_rvalue *rvalue, fs_reg *reg);
 
    struct brw_reg interp_reg(int location, int channel);
    int setup_uniform_values(int loc, const glsl_type *type);
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index d4ebc79dbce..20d4c53a858 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -395,6 +395,9 @@ fs_visitor::visit(ir_expression *ir)
       resolve_ud_negate(&op[0]);
       resolve_ud_negate(&op[1]);
 
+      resolve_bool_comparison(ir->operands[0], &op[0]);
+      resolve_bool_comparison(ir->operands[1], &op[1]);
+
       inst = emit(BRW_OPCODE_CMP, temp, op[0], op[1]);
       inst->conditional_mod = brw_conditional_for_comparison(ir->operation);
       break;
@@ -1542,6 +1545,9 @@ fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
       case ir_binop_all_equal:
       case ir_binop_nequal:
       case ir_binop_any_nequal:
+	 resolve_bool_comparison(expr->operands[0], &op[0]);
+	 resolve_bool_comparison(expr->operands[1], &op[1]);
+
 	 inst = emit(BRW_OPCODE_CMP, reg_null_cmp, op[0], op[1]);
 	 inst->conditional_mod =
 	    brw_conditional_for_comparison(expr->operation);
@@ -2129,3 +2135,14 @@ fs_visitor::resolve_ud_negate(fs_reg *reg)
    emit(BRW_OPCODE_MOV, temp, *reg);
    *reg = temp;
 }
+
+void
+fs_visitor::resolve_bool_comparison(ir_rvalue *rvalue, fs_reg *reg)
+{
+   /* For bool operands, swap *reg for a temporary holding (*reg & 1). */
+   if (rvalue->type == glsl_type::bool_type) {
+      fs_reg masked = fs_reg(this, glsl_type::bool_type);
+      emit(BRW_OPCODE_AND, masked, *reg, fs_reg(1));
+      *reg = masked;
+   }
+}
diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c
index 62bcc93eed2..0c0389f8bdf 100644
--- a/src/mesa/drivers/dri/i965/brw_misc_state.c
+++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
@@ -40,6 +40,8 @@
 #include "brw_state.h"
 #include "brw_defines.h"
 
+#include "main/fbobject.h"
+
 /* Constant single cliprect for framebuffer object or DRI2 drawing */
 static void upload_drawing_rect(struct brw_context *brw)
 {
@@ -264,10 +266,45 @@ static void emit_depthbuffer(struct brw_context *brw)
    unsigned int len;
    bool separate_stencil = false;
 
+   /* Amount by which drawing should be offset in order to draw to the
+    * appropriate miplevel/zoffset/cubeface.  We will extract these values
+    * from depth_irb or stencil_irb once we determine which is present.
+    */
+   uint32_t draw_x = 0, draw_y = 0;
+
+   /* Masks used to determine how much of the draw_x and draw_y offsets should
+    * be performed using the fine adjustment of "depth coordinate offset X/Y"
+    * (dw5 of 3DSTATE_DEPTH_BUFFER).  Any remaining coarse adjustment will be
+    * performed by changing the base addresses of the buffers.
+    *
+    * Since the HiZ, depth, and stencil buffers all use the same "depth
+    * coordinate offset X/Y" values, we need to make sure that the coarse
+    * adjustment will be possible to apply to all three buffers.  Since coarse
+    * adjustment can only be applied in multiples of the tile size, we will OR
+    * together the tile masks of all the buffers to determine which offsets to
+    * perform as fine adjustments.
+    */
+   uint32_t tile_mask_x = 0, tile_mask_y = 0;
+
+   if (depth_irb) {
+      intel_region_get_tile_masks(depth_irb->mt->region,
+                                  &tile_mask_x, &tile_mask_y);
+   }
+
    if (depth_irb &&
        depth_irb->mt &&
        depth_irb->mt->hiz_mt) {
       hiz_region = depth_irb->mt->hiz_mt->region;
+
+      uint32_t hiz_tile_mask_x, hiz_tile_mask_y;
+      intel_region_get_tile_masks(hiz_region,
+                                  &hiz_tile_mask_x, &hiz_tile_mask_y);
+
+      /* Each HiZ row represents 2 rows of pixels */
+      hiz_tile_mask_y = hiz_tile_mask_y << 1 | 1;
+
+      tile_mask_x |= hiz_tile_mask_x;
+      tile_mask_y |= hiz_tile_mask_y;
    }
 
    /* 3DSTATE_DEPTH_BUFFER, 3DSTATE_STENCIL_BUFFER are both
@@ -284,8 +321,21 @@ static void emit_depthbuffer(struct brw_context *brw)
       if (stencil_mt->stencil_mt)
 	 stencil_mt = stencil_mt->stencil_mt;
 
-      if (stencil_mt->format == MESA_FORMAT_S8)
+      if (stencil_mt->format == MESA_FORMAT_S8) {
 	 separate_stencil = true;
+
+         /* Separate stencil buffer uses 64x64 tiles. */
+         tile_mask_x |= 63;
+         tile_mask_y |= 63;
+      } else {
+         uint32_t stencil_tile_mask_x, stencil_tile_mask_y;
+         intel_region_get_tile_masks(stencil_mt->region,
+                                     &stencil_tile_mask_x,
+                                     &stencil_tile_mask_y);
+
+         tile_mask_x |= stencil_tile_mask_x;
+         tile_mask_y |= stencil_tile_mask_y;
+      }
    }
 
    /* If there's a packed depth/stencil bound to stencil only, we need to
@@ -319,6 +369,8 @@ static void emit_depthbuffer(struct brw_context *brw)
       ADVANCE_BATCH();
 
    } else if (!depth_irb && separate_stencil) {
+      uint32_t tile_x, tile_y;
+
       /*
        * There exists a separate stencil buffer but no depth buffer.
        *
@@ -341,6 +393,29 @@ static void emit_depthbuffer(struct brw_context *brw)
        */
       assert(intel->has_separate_stencil);
 
+      draw_x = stencil_irb->draw_x;
+      draw_y = stencil_irb->draw_y;
+      tile_x = draw_x & tile_mask_x;
+      tile_y = draw_y & tile_mask_y;
+
+      /* According to the Sandy Bridge PRM, volume 2 part 1, pp326-327
+       * (3DSTATE_DEPTH_BUFFER dw5), in the documentation for "Depth
+       * Coordinate Offset X/Y":
+       *
+       *   "The 3 LSBs of both offsets must be zero to ensure correct
+       *   alignment"
+       *
+       * We have no guarantee that tile_x and tile_y are correctly aligned,
+       * since they are determined by the mipmap layout, which is only aligned
+       * to multiples of 4.
+       *
+       * So, to avoid hanging the GPU, just smash the low order 3 bits of
+       * tile_x and tile_y to 0.  This is a temporary workaround until we come
+       * up with a better solution.
+       */
+      tile_x &= ~7;
+      tile_y &= ~7;
+
       BEGIN_BATCH(len);
       OUT_BATCH(_3DSTATE_DEPTH_BUFFER << 16 | (len - 2));
       OUT_BATCH((BRW_DEPTHFORMAT_D32_FLOAT << 18) |
@@ -350,11 +425,15 @@ static void emit_depthbuffer(struct brw_context *brw)
 	        (1 << 27) | /* tiled surface */
 	        (BRW_SURFACE_2D << 29));
       OUT_BATCH(0);
-      OUT_BATCH(((stencil_irb->Base.Base.Width - 1) << 6) |
-	         (stencil_irb->Base.Base.Height - 1) << 19);
-      OUT_BATCH(0);
+      OUT_BATCH(((stencil_irb->Base.Base.Width + tile_x - 1) << 6) |
+	         (stencil_irb->Base.Base.Height + tile_y - 1) << 19);
       OUT_BATCH(0);
 
+      if (intel->is_g4x || intel->gen >= 5)
+         OUT_BATCH(tile_x | (tile_y << 16));
+      else
+	 assert(tile_x == 0 && tile_y == 0);
+
       if (intel->gen >= 6)
 	 OUT_BATCH(0);
 
@@ -367,11 +446,36 @@ static void emit_depthbuffer(struct brw_context *brw)
       /* If using separate stencil, hiz must be enabled. */
       assert(!separate_stencil || hiz_region);
 
-      offset = intel_renderbuffer_tile_offsets(depth_irb, &tile_x, &tile_y);
-
       assert(intel->gen < 6 || region->tiling == I915_TILING_Y);
       assert(!hiz_region || region->tiling == I915_TILING_Y);
 
+      draw_x = depth_irb->draw_x;
+      draw_y = depth_irb->draw_y;
+      tile_x = draw_x & tile_mask_x;
+      tile_y = draw_y & tile_mask_y;
+
+      /* According to the Sandy Bridge PRM, volume 2 part 1, pp326-327
+       * (3DSTATE_DEPTH_BUFFER dw5), in the documentation for "Depth
+       * Coordinate Offset X/Y":
+       *
+       *   "The 3 LSBs of both offsets must be zero to ensure correct
+       *   alignment"
+       *
+       * We have no guarantee that tile_x and tile_y are correctly aligned,
+       * since they are determined by the mipmap layout, which is only aligned
+       * to multiples of 4.
+       *
+       * So, to avoid hanging the GPU, just smash the low order 3 bits of
+       * tile_x and tile_y to 0.  This is a temporary workaround until we come
+       * up with a better solution.
+       */
+      tile_x &= ~7;
+      tile_y &= ~7;
+
+      offset = intel_region_get_aligned_offset(region,
+                                               draw_x & ~tile_mask_x,
+                                               draw_y & ~tile_mask_y);
+
       BEGIN_BATCH(len);
       OUT_BATCH(_3DSTATE_DEPTH_BUFFER << 16 | (len - 2));
       OUT_BATCH(((region->pitch * region->cpp) - 1) |
@@ -411,12 +515,17 @@ static void emit_depthbuffer(struct brw_context *brw)
 
       /* Emit hiz buffer. */
       if (hiz_region) {
+         uint32_t hiz_offset =
+            intel_region_get_aligned_offset(hiz_region,
+                                            draw_x & ~tile_mask_x,
+                                            (draw_y & ~tile_mask_y) / 2);
+
 	 BEGIN_BATCH(3);
 	 OUT_BATCH((_3DSTATE_HIER_DEPTH_BUFFER << 16) | (3 - 2));
 	 OUT_BATCH(hiz_region->pitch * hiz_region->cpp - 1);
 	 OUT_RELOC(hiz_region->bo,
 		   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-		   0);
+		   hiz_offset);
 	 ADVANCE_BATCH();
       } else {
 	 BEGIN_BATCH(3);
@@ -429,6 +538,15 @@ static void emit_depthbuffer(struct brw_context *brw)
       /* Emit stencil buffer. */
       if (separate_stencil) {
 	 struct intel_region *region = stencil_mt->region;
+
+         /* Note: we can't compute the stencil offset using
+          * intel_region_get_aligned_offset(), because stencil_region claims
+          * that the region is untiled; in fact it's W tiled.
+          */
+         uint32_t stencil_offset =
+            (draw_y & ~tile_mask_y) * region->pitch +
+            (draw_x & ~tile_mask_x) * 64;
+
 	 BEGIN_BATCH(3);
 	 OUT_BATCH((_3DSTATE_STENCIL_BUFFER << 16) | (3 - 2));
          /* The stencil buffer has quirky pitch requirements.  From Vol 2a,
@@ -439,7 +557,7 @@ static void emit_depthbuffer(struct brw_context *brw)
 	 OUT_BATCH(2 * region->pitch * region->cpp - 1);
 	 OUT_RELOC(region->bo,
 		   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-		   0);
+		   stencil_offset);
 	 ADVANCE_BATCH();
       } else {
 	 BEGIN_BATCH(3);
@@ -507,7 +625,7 @@ static void upload_polygon_stipple(struct brw_context *brw)
     * to a FBO (i.e. any named frame buffer object), we *don't*
     * need to invert - we already match the layout.
     */
-   if (ctx->DrawBuffer->Name == 0) {
+   if (_mesa_is_winsys_fbo(ctx->DrawBuffer)) {
       for (i = 0; i < 32; i++)
 	  OUT_BATCH(ctx->PolygonStipple[31 - i]); /* invert */
    }
@@ -550,15 +668,13 @@ static void upload_polygon_stipple_offset(struct brw_context *brw)
 
    /* _NEW_BUFFERS
     *
-    * If we're drawing to a system window (ctx->DrawBuffer->Name == 0),
-    * we have to invert the Y axis in order to match the OpenGL
-    * pixel coordinate system, and our offset must be matched
-    * to the window position.  If we're drawing to a FBO
-    * (ctx->DrawBuffer->Name != 0), then our native pixel coordinate
-    * system works just fine, and there's no window system to
-    * worry about.
+    * If we're drawing to a system window we have to invert the Y axis
+    * in order to match the OpenGL pixel coordinate system, and our
+    * offset must be matched to the window position.  If we're drawing
+    * to a user-created FBO then our native pixel coordinate system
+    * works just fine, and there's no window system to worry about.
     */
-   if (brw->intel.ctx.DrawBuffer->Name == 0)
+   if (_mesa_is_winsys_fbo(brw->intel.ctx.DrawBuffer))
       OUT_BATCH((32 - (ctx->DrawBuffer->Height & 31)) & 31);
    else
       OUT_BATCH(0);
diff --git a/src/mesa/drivers/dri/i965/brw_sf.c b/src/mesa/drivers/dri/i965/brw_sf.c
index 37d1ee502d8..23a874aa8ec 100644
--- a/src/mesa/drivers/dri/i965/brw_sf.c
+++ b/src/mesa/drivers/dri/i965/brw_sf.c
@@ -32,7 +32,9 @@
 
 #include "main/glheader.h"
 #include "main/macros.h"
+#include "main/mtypes.h"
 #include "main/enums.h"
+#include "main/fbobject.h"
 
 #include "intel_batchbuffer.h"
 
@@ -136,7 +138,7 @@ brw_upload_sf_prog(struct brw_context *brw)
    struct gl_context *ctx = &brw->intel.ctx;
    struct brw_sf_prog_key key;
    /* _NEW_BUFFERS */
-   bool render_to_fbo = ctx->DrawBuffer->Name != 0;
+   bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
 
    memset(&key, 0, sizeof(key));
 
diff --git a/src/mesa/drivers/dri/i965/brw_sf_state.c b/src/mesa/drivers/dri/i965/brw_sf_state.c
index d7e7aa11f63..7c29ba27d1a 100644
--- a/src/mesa/drivers/dri/i965/brw_sf_state.c
+++ b/src/mesa/drivers/dri/i965/brw_sf_state.c
@@ -31,10 +31,12 @@
    
 
 
+#include "main/mtypes.h"
+#include "main/macros.h"
+#include "main/fbobject.h"
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
-#include "main/macros.h"
 #include "brw_sf.h"
 
 static void upload_sf_vp(struct brw_context *brw)
@@ -44,7 +46,7 @@ static void upload_sf_vp(struct brw_context *brw)
    const GLfloat depth_scale = 1.0F / ctx->DrawBuffer->_DepthMaxF;
    struct brw_sf_viewport *sfv;
    GLfloat y_scale, y_bias;
-   const bool render_to_fbo = (ctx->DrawBuffer->Name != 0);
+   const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
    const GLfloat *v = ctx->Viewport._WindowMap.m;
 
    sfv = brw_state_batch(brw, AUB_TRACE_SF_VP_STATE,
@@ -142,7 +144,7 @@ static void upload_sf_unit( struct brw_context *brw )
    struct brw_sf_unit_state *sf;
    drm_intel_bo *bo = intel->batch.bo;
    int chipset_max_threads;
-   bool render_to_fbo = brw->intel.ctx.DrawBuffer->Name != 0;
+   bool render_to_fbo = _mesa_is_user_fbo(brw->intel.ctx.DrawBuffer);
 
    sf = brw_state_batch(brw, AUB_TRACE_SF_STATE,
 			sizeof(*sf), 64, &brw->sf.state_offset);
diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
index 8a0e92fab08..659cb0a2898 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -35,6 +35,10 @@
 
 #include "brw_context.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 extern const struct brw_tracked_state brw_blend_constant_color;
 extern const struct brw_tracked_state brw_cc_vp;
 extern const struct brw_tracked_state brw_cc_unit;
@@ -208,4 +212,8 @@ uint32_t
 get_attr_override(struct brw_vue_map *vue_map, int urb_entry_read_offset,
                   int fs_attr, bool two_side_color);
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_tex_layout.c b/src/mesa/drivers/dri/i965/brw_tex_layout.c
index 7a1b91f3721..8bf1d3ddbcd 100644
--- a/src/mesa/drivers/dri/i965/brw_tex_layout.c
+++ b/src/mesa/drivers/dri/i965/brw_tex_layout.c
@@ -115,6 +115,8 @@ brw_miptree_layout(struct intel_context *intel, struct intel_mipmap_tree *mt)
 	       intel_miptree_set_image_offset(mt, level, q, x, y);
 	       x += pack_x_pitch;
 	    }
+            if (x > mt->total_width)
+               mt->total_width = x;
 
 	    x = 0;
 	    y += pack_y_pitch;
@@ -135,10 +137,9 @@ brw_miptree_layout(struct intel_context *intel, struct intel_mipmap_tree *mt)
 	       pack_x_nr <<= 1;
 	    }
 	 } else {
+            pack_x_nr <<= 1;
 	    if (pack_x_pitch > 4) {
 	       pack_x_pitch >>= 1;
-	       pack_x_nr <<= 1;
-	       assert(pack_x_pitch * pack_x_nr <= mt->total_width);
 	    }
 
 	    if (pack_y_pitch > 2) {
diff --git a/src/mesa/drivers/dri/i965/brw_vtbl.c b/src/mesa/drivers/dri/i965/brw_vtbl.c
index 5e7345648db..733193425d3 100644
--- a/src/mesa/drivers/dri/i965/brw_vtbl.c
+++ b/src/mesa/drivers/dri/i965/brw_vtbl.c
@@ -49,8 +49,8 @@
 #include "brw_vs.h"
 #include "brw_wm.h"
 
-#include "gen6_hiz.h"
-#include "gen7_hiz.h"
+#include "gen6_blorp.h"
+#include "gen7_blorp.h"
 
 #include "glsl/ralloc.h"
 
diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c
index b358306a928..63c74ad8f7b 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.c
+++ b/src/mesa/drivers/dri/i965/brw_wm.c
@@ -33,6 +33,7 @@
 #include "brw_wm.h"
 #include "brw_state.h"
 #include "main/formats.h"
+#include "main/fbobject.h"
 #include "main/samplerobj.h"
 #include "program/prog_parameter.h"
 
@@ -516,7 +517,7 @@ static void brw_wm_populate_key( struct brw_context *brw,
     */
    if (fp->program.Base.InputsRead & FRAG_BIT_WPOS) {
       key->drawable_height = ctx->DrawBuffer->Height;
-      key->render_to_fbo = ctx->DrawBuffer->Name != 0;
+      key->render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
    }
 
    /* _NEW_BUFFERS */
diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
index 69af0eecee1..0bb9414ed58 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -291,6 +291,8 @@ brw_format_for_mesa_format(gl_format mesa_format)
       [MESA_FORMAT_RGBA8888_REV] = BRW_SURFACEFORMAT_R8G8B8A8_UNORM,
       [MESA_FORMAT_ARGB8888] = BRW_SURFACEFORMAT_B8G8R8A8_UNORM,
       [MESA_FORMAT_ARGB8888_REV] = 0,
+      [MESA_FORMAT_RGBX8888] = 0,
+      [MESA_FORMAT_RGBX8888_REV] = BRW_SURFACEFORMAT_R8G8B8X8_UNORM,
       [MESA_FORMAT_XRGB8888] = BRW_SURFACEFORMAT_B8G8R8X8_UNORM,
       [MESA_FORMAT_XRGB8888_REV] = 0,
       [MESA_FORMAT_RGB888] = 0,
@@ -959,8 +961,11 @@ brw_update_renderbuffer_surface(struct brw_context *brw,
 
    switch (rb_format) {
    case MESA_FORMAT_SARGB8:
-      /* without GL_EXT_framebuffer_sRGB we shouldn't bind sRGB
-	 surfaces to the blend/update as sRGB */
+      /* _NEW_BUFFERS
+       *
+       * Without GL_EXT_framebuffer_sRGB we shouldn't bind sRGB surfaces to the
+       * blend/update as sRGB.
+       */
       if (ctx->Color.sRGBEnabled)
 	 format = brw_format_for_mesa_format(rb_format);
       else
diff --git a/src/mesa/drivers/dri/i965/gen6_blorp.cpp b/src/mesa/drivers/dri/i965/gen6_blorp.cpp
new file mode 100644
index 00000000000..604d380e702
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/gen6_blorp.cpp
@@ -0,0 +1,662 @@
+/*
+ * Copyright © 2011 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <assert.h>
+
+#include "intel_batchbuffer.h"
+#include "intel_fbo.h"
+#include "intel_mipmap_tree.h"
+
+#include "brw_context.h"
+#include "brw_defines.h"
+#include "brw_state.h"
+
+#include "gen6_blorp.h"
+
+/**
+ * \name Constants for HiZ VBO
+ * \{
+ *
+ * \see brw_context::hiz::vertex_bo
+ */
+#define GEN6_HIZ_NUM_VERTICES 3
+#define GEN6_HIZ_NUM_VUE_ELEMS 8
+#define GEN6_HIZ_VBO_SIZE (GEN6_HIZ_NUM_VERTICES \
+                           * GEN6_HIZ_NUM_VUE_ELEMS \
+                           * sizeof(float)) /* in bytes, not a count of floats */
+/** \} */
+
+/** Flush the batch, then emit the packets that begin a HiZ-op batch. */
+void
+gen6_hiz_emit_batch_head(struct brw_context *brw)
+{
+   struct gl_context *ctx = &brw->intel.ctx;
+   struct intel_context *intel = &brw->intel;
+
+   /* To ensure that the batch contains only the resolve, flush the batch
+    * before beginning and after finishing emitting the resolve packets.
+    *
+    * Ideally, we would not need to flush for the resolve op. But, I suspect
+    * that it's unsafe for CMD_PIPELINE_SELECT to occur multiple times in
+    * a single batch, and there is no safe way to ensure that other than by
+    * fencing the resolve with flushes. Ideally, we would just detect if
+    * a batch is in progress and do the right thing, but that would require
+    * the ability to *safely* access brw_context::state::dirty::brw
+    * outside of the brw_upload_state() codepath.
+    */
+   intel_flush(ctx);
+
+   /* CMD_PIPELINE_SELECT
+    *
+    * Select the 3D pipeline, as opposed to the media pipeline.
+    */
+   {
+      BEGIN_BATCH(1);
+      OUT_BATCH(brw->CMD_PIPELINE_SELECT << 16);
+      ADVANCE_BATCH();
+   }
+
+   /* 3DSTATE_MULTISAMPLE */
+   {
+      int length = intel->gen == 7 ? 4 : 3;
+
+      BEGIN_BATCH(length);
+      OUT_BATCH(_3DSTATE_MULTISAMPLE << 16 | (length - 2));
+      OUT_BATCH(MS_PIXEL_LOCATION_CENTER |
+                MS_NUMSAMPLES_1);
+      OUT_BATCH(0);
+      if (length >= 4)
+         OUT_BATCH(0);
+      ADVANCE_BATCH();
+   }
+
+   /* 3DSTATE_SAMPLE_MASK */
+   {
+      BEGIN_BATCH(2);
+      OUT_BATCH(_3DSTATE_SAMPLE_MASK << 16 | (2 - 2));
+      OUT_BATCH(1);
+      ADVANCE_BATCH();
+   }
+
+   /* CMD_STATE_BASE_ADDRESS
+    *
+    * From the Sandy Bridge PRM, Volume 1, Part 1, Table STATE_BASE_ADDRESS:
+    *     The following commands must be reissued following any change to the
+    *     base addresses:
+    *         3DSTATE_CC_POINTERS
+    *         3DSTATE_BINDING_TABLE_POINTERS
+    *         3DSTATE_SAMPLER_STATE_POINTERS
+    *         3DSTATE_VIEWPORT_STATE_POINTERS
+    *         MEDIA_STATE_POINTERS
+    */
+   {
+      BEGIN_BATCH(10);
+      OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (10 - 2));
+      OUT_BATCH(1); /* GeneralStateBaseAddressModifyEnable */
+      /* SurfaceStateBaseAddress */
+      OUT_RELOC(intel->batch.bo, I915_GEM_DOMAIN_SAMPLER, 0, 1);
+      /* DynamicStateBaseAddress */
+      OUT_RELOC(intel->batch.bo, (I915_GEM_DOMAIN_RENDER |
+                                  I915_GEM_DOMAIN_INSTRUCTION), 0, 1);
+      OUT_BATCH(1); /* IndirectObjectBaseAddress */
+      OUT_BATCH(1); /* InstructionBaseAddress */
+      OUT_BATCH(1); /* GeneralStateUpperBound */
+      OUT_BATCH(1); /* DynamicStateUpperBound */
+      OUT_BATCH(1); /* IndirectObjectUpperBound */
+      OUT_BATCH(1); /* InstructionAccessUpperBound */
+      ADVANCE_BATCH();
+   }
+}
+
+/** Upload the HiZ rectangle's vertices (sized to \p level of \p mt) and emit
+ *  the packets that fetch them.  \p layer is currently unused. */
+void
+gen6_hiz_emit_vertices(struct brw_context *brw,
+                       struct intel_mipmap_tree *mt,
+                       unsigned int level,
+                       unsigned int layer)
+{
+   struct intel_context *intel = &brw->intel;
+   uint32_t vertex_offset;
+
+   /* Setup VBO for the rectangle primitive.
+    *
+    * A rectangle primitive (3DPRIM_RECTLIST) consists of only three
+    * vertices. The vertices reside in screen space with DirectX coordinates
+    * (that is, (0, 0) is the upper left corner).
+    *
+    *   v2 ------ implied
+    *    |        |
+    *    |        |
+    *   v0 ----- v1
+    *
+    * Since the VS is disabled, the clipper loads each VUE directly from
+    * the URB. This is controlled by the 3DSTATE_VERTEX_BUFFERS and
+    * 3DSTATE_VERTEX_ELEMENTS packets below. The VUE contents are as follows:
+    *   dw0: Reserved, MBZ.
+    *   dw1: Render Target Array Index. The HiZ op does not use indexed
+    *        vertices, so set the dword to 0.
+    *   dw2: Viewport Index. The HiZ op disables viewport mapping and
+    *        scissoring, so set the dword to 0.
+    *   dw3: Point Width: The HiZ op does not emit the POINTLIST primitive, so
+    *        set the dword to 0.
+    *   dw4: Vertex Position X.
+    *   dw5: Vertex Position Y.
+    *   dw6: Vertex Position Z.
+    *   dw7: Vertex Position W.
+    *
+    * For details, see the Sandybridge PRM, Volume 2, Part 1, Section 1.5.1
+    * "Vertex URB Entry (VUE) Formats".
+    */
+   {
+      const float width = mt->level[level].width; /* no narrowing in {} */
+      const float height = mt->level[level].height;
+
+      /* One float per VUE dword; GEN6_HIZ_VBO_SIZE is the size in bytes. */
+      const float vertices[GEN6_HIZ_NUM_VERTICES * GEN6_HIZ_NUM_VUE_ELEMS] = {
+         /* v0 */ 0, 0, 0, 0,         0, height, 0, 1,
+         /* v1 */ 0, 0, 0, 0,     width, height, 0, 1,
+         /* v2 */ 0, 0, 0, 0,         0,      0, 0, 1,
+      };
+
+      float *vertex_data = (float *) brw_state_batch(brw, AUB_TRACE_NO_TYPE,
+                                                     GEN6_HIZ_VBO_SIZE, 32,
+                                                     &vertex_offset);
+      memcpy(vertex_data, vertices, GEN6_HIZ_VBO_SIZE);
+   }
+
+   /* 3DSTATE_VERTEX_BUFFERS */
+   {
+      const int num_buffers = 1;
+      const int batch_length = 1 + 4 * num_buffers;
+
+      uint32_t dw0 = GEN6_VB0_ACCESS_VERTEXDATA |
+                     (GEN6_HIZ_NUM_VUE_ELEMS * sizeof(float)) << BRW_VB0_PITCH_SHIFT;
+
+      if (intel->gen >= 7)
+         dw0 |= GEN7_VB0_ADDRESS_MODIFYENABLE;
+
+      BEGIN_BATCH(batch_length);
+      OUT_BATCH((_3DSTATE_VERTEX_BUFFERS << 16) | (batch_length - 2));
+      OUT_BATCH(dw0);
+      /* start address */
+      OUT_RELOC(intel->batch.bo, I915_GEM_DOMAIN_VERTEX, 0, vertex_offset);
+      /* end address: the last byte of the vertex data */
+      OUT_RELOC(intel->batch.bo, I915_GEM_DOMAIN_VERTEX, 0,
+                vertex_offset + GEN6_HIZ_VBO_SIZE - 1);
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+   }
+
+   /* 3DSTATE_VERTEX_ELEMENTS
+    *
+    * Fetch dwords 0 - 7 from each VUE; see the VUE layout described above.
+    */
+   {
+      const int num_elements = 2;
+      const int batch_length = 1 + 2 * num_elements;
+
+      BEGIN_BATCH(batch_length);
+      OUT_BATCH((_3DSTATE_VERTEX_ELEMENTS << 16) | (batch_length - 2));
+      /* Element 0 */
+      OUT_BATCH(GEN6_VE0_VALID |
+                BRW_SURFACEFORMAT_R32G32B32A32_FLOAT << BRW_VE0_FORMAT_SHIFT |
+                0 << BRW_VE0_SRC_OFFSET_SHIFT);
+      OUT_BATCH(BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_0_SHIFT |
+                BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_1_SHIFT |
+                BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_2_SHIFT |
+                BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_3_SHIFT);
+      /* Element 1 */
+      OUT_BATCH(GEN6_VE0_VALID |
+                BRW_SURFACEFORMAT_R32G32B32A32_FLOAT << BRW_VE0_FORMAT_SHIFT |
+                16 << BRW_VE0_SRC_OFFSET_SHIFT);
+      OUT_BATCH(BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_0_SHIFT |
+                BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_1_SHIFT |
+                BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_2_SHIFT |
+                BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_3_SHIFT);
+      ADVANCE_BATCH();
+   }
+}
+
+/**
+ * \brief Execute a HiZ op on a miptree slice.
+ *
+ * To execute the HiZ op, this function manually constructs and emits a batch
+ * to "draw" the HiZ op's rectangle primitive. The batchbuffer is flushed
+ * before constructing and after emitting the batch.
+ *
+ * This function alters no GL state.
+ *
+ * For an overview of HiZ ops, see the following sections of the Sandy Bridge
+ * PRM, Volume 1, Part 2:
+ *   - 7.5.3.1 Depth Buffer Clear
+ *   - 7.5.3.2 Depth Buffer Resolve
+ *   - 7.5.3.3 Hierarchical Depth Buffer Resolve
+ */
+static void
+gen6_hiz_exec(struct intel_context *intel,
+              struct intel_mipmap_tree *mt,
+              unsigned int level,
+              unsigned int layer,
+              enum gen6_hiz_op op)
+{
+   struct gl_context *ctx = &intel->ctx;
+   struct brw_context *brw = brw_context(ctx);
+   uint32_t draw_x, draw_y;
+   uint32_t tile_mask_x, tile_mask_y;
+
+   assert(op != GEN6_HIZ_OP_DEPTH_CLEAR); /* Not implemented yet. */
+   assert(mt->hiz_mt != NULL);
+   intel_miptree_check_level_layer(mt, level, layer);
+
+   {
+      /* Construct a dummy renderbuffer just to extract tile offsets. */
+      struct intel_renderbuffer rb;
+      rb.mt = mt;
+      rb.mt_level = level;
+      rb.mt_layer = layer;
+      intel_renderbuffer_set_draw_offset(&rb);
+      draw_x = rb.draw_x;
+      draw_y = rb.draw_y;
+   }
+
+   /* Compute masks to determine how much of draw_x and draw_y should be
+    * performed using the fine adjustment of "depth coordinate offset X/Y"
+    * (dw5 of 3DSTATE_DEPTH_BUFFER).  See the emit_depthbuffer() function for
+    * details.
+    */
+   {
+      uint32_t depth_mask_x, depth_mask_y, hiz_mask_x, hiz_mask_y;
+      intel_region_get_tile_masks(mt->region, &depth_mask_x, &depth_mask_y);
+      intel_region_get_tile_masks(mt->hiz_mt->region,
+                                  &hiz_mask_x, &hiz_mask_y);
+
+      /* Each HiZ row represents 2 rows of pixels */
+      hiz_mask_y = hiz_mask_y << 1 | 1;
+
+      tile_mask_x = depth_mask_x | hiz_mask_x;
+      tile_mask_y = depth_mask_y | hiz_mask_y;
+   }
+
+   gen6_hiz_emit_batch_head(brw);
+   gen6_hiz_emit_vertices(brw, mt, level, layer);
+
+   /* 3DSTATE_URB
+    *
+    * Assign the entire URB to the VS. Even though the VS is disabled, URB space
+    * is still needed because the clipper loads the VUE's from the URB. From
+    * the Sandybridge PRM, Volume 2, Part 1, Section 3DSTATE,
+    * Dword 1.15:0 "VS Number of URB Entries":
+    *     This field is always used (even if VS Function Enable is DISABLED).
+    *
+    * The warning below appears in the PRM (Section 3DSTATE_URB), but we can
+    * safely ignore it because this batch contains only one draw call.
+    *     Because of URB corruption caused by allocating a previous GS unit
+    *     URB entry to the VS unit, software is required to send a "GS NULL
+    *     Fence" (Send URB fence with VS URB size == 1 and GS URB size == 0)
+    *     plus a dummy DRAW call before any case where VS will be taking over
+    *     GS URB space.
+    */
+   {
+      BEGIN_BATCH(3);
+      OUT_BATCH(_3DSTATE_URB << 16 | (3 - 2));
+      OUT_BATCH(brw->urb.max_vs_entries << GEN6_URB_VS_ENTRIES_SHIFT);
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+   }
+
+   /* 3DSTATE_CC_STATE_POINTERS
+    *
+    * The pointer offsets are relative to
+    * CMD_STATE_BASE_ADDRESS.DynamicStateBaseAddress.
+    *
+    * The HiZ op doesn't use BLEND_STATE or COLOR_CALC_STATE.
+    */
+   {
+      uint32_t depthstencil_offset;
+      gen6_hiz_emit_depth_stencil_state(brw, op, &depthstencil_offset);
+
+      BEGIN_BATCH(4);
+      OUT_BATCH(_3DSTATE_CC_STATE_POINTERS << 16 | (4 - 2));
+      OUT_BATCH(1); /* BLEND_STATE offset */
+      OUT_BATCH(depthstencil_offset | 1); /* DEPTH_STENCIL_STATE offset */
+      OUT_BATCH(1); /* COLOR_CALC_STATE offset */
+      ADVANCE_BATCH();
+   }
+
+   /* 3DSTATE_VS
+    *
+    * Disable vertex shader.
+    */
+   {
+      /* From the BSpec, Volume 2a, Part 3 "Vertex Shader", Section
+       * 3DSTATE_VS, Dword 5.0 "VS Function Enable":
+       *   [DevSNB] A pipeline flush must be programmed prior to a 3DSTATE_VS
+       *   command that causes the VS Function Enable to toggle. Pipeline
+       *   flush can be executed by sending a PIPE_CONTROL command with CS
+       *   stall bit set and a post sync operation.
+       */
+      intel_emit_post_sync_nonzero_flush(intel);
+
+      BEGIN_BATCH(6);
+      OUT_BATCH(_3DSTATE_VS << 16 | (6 - 2));
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+   }
+
+   /* 3DSTATE_GS
+    *
+    * Disable the geometry shader.
+    */
+   {
+      BEGIN_BATCH(7);
+      OUT_BATCH(_3DSTATE_GS << 16 | (7 - 2));
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+   }
+
+   /* 3DSTATE_CLIP
+    *
+    * Disable the clipper.
+    *
+    * The HiZ op emits a rectangle primitive, which requires clipping to
+    * be disabled. From page 10 of the Sandy Bridge PRM Volume 2 Part 1
+    * Section 1.3 "3D Primitives Overview":
+    *    RECTLIST:
+    *    Either the CLIP unit should be DISABLED, or the CLIP unit's Clip
+    *    Mode should be set to a value other than CLIPMODE_NORMAL.
+    *
+    * Also disable perspective divide. This doesn't change the clipper's
+    * output, but does spare a few electrons.
+    */
+   {
+      BEGIN_BATCH(4);
+      OUT_BATCH(_3DSTATE_CLIP << 16 | (4 - 2));
+      OUT_BATCH(0);
+      OUT_BATCH(GEN6_CLIP_PERSPECTIVE_DIVIDE_DISABLE);
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+   }
+
+   /* 3DSTATE_SF
+    *
+    * Disable ViewportTransformEnable (dw2.1)
+    *
+    * From the SandyBridge PRM, Volume 2, Part 1, Section 1.3, "3D
+    * Primitives Overview":
+    *     RECTLIST: Viewport Mapping must be DISABLED (as is typical with the
+    *     use of screen-space coordinates).
+    *
+    * A solid rectangle must be rendered, so set FrontFaceFillMode (dw2.4:3)
+    * and BackFaceFillMode (dw2.5:6) to SOLID(0).
+    *
+    * From the Sandy Bridge PRM, Volume 2, Part 1, Section
+    * 6.4.1.1 3DSTATE_SF, Field FrontFaceFillMode:
+    *     SOLID: Any triangle or rectangle object found to be front-facing
+    *     is rendered as a solid object. This setting is required when
+    *     rendering rectangle (RECTLIST) objects.
+    */
+   {
+      BEGIN_BATCH(20);
+      OUT_BATCH(_3DSTATE_SF << 16 | (20 - 2));
+      OUT_BATCH((1 - 1) << GEN6_SF_NUM_OUTPUTS_SHIFT | /* only position */
+                1 << GEN6_SF_URB_ENTRY_READ_LENGTH_SHIFT |
+                0 << GEN6_SF_URB_ENTRY_READ_OFFSET_SHIFT);
+      for (int i = 0; i < 18; ++i)
+         OUT_BATCH(0);
+      ADVANCE_BATCH();
+   }
+
+   /* 3DSTATE_WM
+    *
+    * Disable thread dispatch (dw5.19) and enable the HiZ op.
+    *
+    * Even though thread dispatch is disabled, max threads (dw5.25:31) must be
+    * nonzero to prevent the GPU from hanging. See the valid ranges in the
+    * BSpec, Volume 2a.11 Windower, Section 3DSTATE_WM, Dword 5.25:31
+    * "Maximum Number Of Threads".
+    */
+   {
+      uint32_t dw4 = 0;
+
+      switch (op) {
+      case GEN6_HIZ_OP_DEPTH_CLEAR:
+         assert(!"not implemented");
+         dw4 |= GEN6_WM_DEPTH_CLEAR;
+         break;
+      case GEN6_HIZ_OP_DEPTH_RESOLVE:
+         dw4 |= GEN6_WM_DEPTH_RESOLVE;
+         break;
+      case GEN6_HIZ_OP_HIZ_RESOLVE:
+         dw4 |= GEN6_WM_HIERARCHICAL_DEPTH_RESOLVE;
+         break;
+      default:
+         assert(0);
+         break;
+      }
+
+      BEGIN_BATCH(9);
+      OUT_BATCH(_3DSTATE_WM << 16 | (9 - 2));
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(dw4);
+      OUT_BATCH((brw->max_wm_threads - 1) << GEN6_WM_MAX_THREADS_SHIFT);
+      OUT_BATCH((1 - 1) << GEN6_WM_NUM_SF_OUTPUTS_SHIFT); /* only position */
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+   }
+
+   /* 3DSTATE_DEPTH_BUFFER */
+   {
+      uint32_t width = mt->level[level].width;
+      uint32_t height = mt->level[level].height;
+
+      uint32_t tile_x = draw_x & tile_mask_x;
+      uint32_t tile_y = draw_y & tile_mask_y;
+      uint32_t offset = intel_region_get_aligned_offset(mt->region,
+                                                        draw_x & ~tile_mask_x,
+                                                        draw_y & ~tile_mask_y);
+
+      /* According to the Sandy Bridge PRM, volume 2 part 1, pp326-327
+       * (3DSTATE_DEPTH_BUFFER dw5), in the documentation for "Depth
+       * Coordinate Offset X/Y":
+       *
+       *   "The 3 LSBs of both offsets must be zero to ensure correct
+       *   alignment"
+       *
+       * We have no guarantee that tile_x and tile_y are correctly aligned,
+       * since they are determined by the mipmap layout, which is only aligned
+       * to multiples of 4.
+       *
+       * So, to avoid hanging the GPU, just smash the low order 3 bits of
+       * tile_x and tile_y to 0.  This is a temporary workaround until we come
+       * up with a better solution.
+       */
+      tile_x &= ~7;
+      tile_y &= ~7;
+
+      uint32_t format;
+      switch (mt->format) {
+      case MESA_FORMAT_Z16:       format = BRW_DEPTHFORMAT_D16_UNORM; break;
+      case MESA_FORMAT_Z32_FLOAT: format = BRW_DEPTHFORMAT_D32_FLOAT; break;
+      case MESA_FORMAT_X8_Z24:    format = BRW_DEPTHFORMAT_D24_UNORM_X8_UINT; break;
+      default:                    assert(0); break;
+      }
+
+      intel_emit_post_sync_nonzero_flush(intel);
+      intel_emit_depth_stall_flushes(intel);
+
+      BEGIN_BATCH(7);
+      OUT_BATCH(_3DSTATE_DEPTH_BUFFER << 16 | (7 - 2));
+      OUT_BATCH(((mt->region->pitch * mt->region->cpp) - 1) |
+                format << 18 |
+                1 << 21 | /* separate stencil enable */
+                1 << 22 | /* hiz enable */
+                BRW_TILEWALK_YMAJOR << 26 |
+                1 << 27 | /* y-tiled */
+                BRW_SURFACE_2D << 29);
+      OUT_RELOC(mt->region->bo,
+                I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
+                offset);
+      OUT_BATCH(BRW_SURFACE_MIPMAPLAYOUT_BELOW << 1 |
+                (width + tile_x - 1) << 6 |
+                (height + tile_y - 1) << 19);
+      OUT_BATCH(0);
+      OUT_BATCH(tile_x |
+                tile_y << 16);
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+   }
+
+   /* 3DSTATE_HIER_DEPTH_BUFFER */
+   {
+      struct intel_region *hiz_region = mt->hiz_mt->region;
+      uint32_t hiz_offset =
+         intel_region_get_aligned_offset(hiz_region,
+                                         draw_x & ~tile_mask_x,
+                                         (draw_y & ~tile_mask_y) / 2);
+
+      BEGIN_BATCH(3);
+      OUT_BATCH((_3DSTATE_HIER_DEPTH_BUFFER << 16) | (3 - 2));
+      OUT_BATCH(hiz_region->pitch * hiz_region->cpp - 1);
+      OUT_RELOC(hiz_region->bo,
+                I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
+                hiz_offset);
+      ADVANCE_BATCH();
+   }
+
+   /* 3DSTATE_STENCIL_BUFFER */
+   {
+      BEGIN_BATCH(3);
+      OUT_BATCH((_3DSTATE_STENCIL_BUFFER << 16) | (3 - 2));
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+   }
+
+   /* 3DSTATE_CLEAR_PARAMS
+    *
+    * From the Sandybridge PRM, Volume 2, Part 1, Section 3DSTATE_CLEAR_PARAMS:
+    *   [DevSNB] 3DSTATE_CLEAR_PARAMS packet must follow the DEPTH_BUFFER_STATE
+    *   packet when HiZ is enabled and the DEPTH_BUFFER_STATE changes.
+    */
+   {
+      BEGIN_BATCH(2);
+      OUT_BATCH(_3DSTATE_CLEAR_PARAMS << 16 | (2 - 2));
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+   }
+
+   /* 3DSTATE_DRAWING_RECTANGLE */
+   {
+      BEGIN_BATCH(4);
+      OUT_BATCH(_3DSTATE_DRAWING_RECTANGLE << 16 | (4 - 2));
+      OUT_BATCH(0);
+      OUT_BATCH(((mt->level[level].width - 1) & 0xffff) |
+                ((mt->level[level].height - 1) << 16));
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+   }
+
+   /* 3DPRIMITIVE */
+   {
+     BEGIN_BATCH(6);
+     OUT_BATCH(CMD_3D_PRIM << 16 | (6 - 2) |
+               _3DPRIM_RECTLIST << GEN4_3DPRIM_TOPOLOGY_TYPE_SHIFT |
+               GEN4_3DPRIM_VERTEXBUFFER_ACCESS_SEQUENTIAL);
+     OUT_BATCH(3); /* vertex count per instance */
+     OUT_BATCH(0);
+     OUT_BATCH(1); /* instance count */
+     OUT_BATCH(0);
+     OUT_BATCH(0);
+     ADVANCE_BATCH();
+   }
+
+   /* See comments above at first invocation of intel_flush() in
+    * gen6_hiz_emit_batch_head().
+    */
+   intel_flush(ctx);
+
+   /* Be safe: set every brw and cache dirty bit after the hand-built batch. */
+   brw->state.dirty.brw = ~0;
+   brw->state.dirty.cache = ~0;
+}
+
+/**
+ * Take a gen6_depth_stencil_state from brw_state_batch(), zero it, and set
+ * its depth fields for \p op.  \p out_offset is relative to
+ * CMD_STATE_BASE_ADDRESS.DynamicStateBaseAddress.
+ */
+void
+gen6_hiz_emit_depth_stencil_state(struct brw_context *brw,
+                                  enum gen6_hiz_op op,
+                                  uint32_t *out_offset)
+{
+   struct gen6_depth_stencil_state *ds = (struct gen6_depth_stencil_state *)
+      brw_state_batch(brw, AUB_TRACE_DEPTH_STENCIL_STATE, sizeof(*ds), 64,
+                      out_offset);
+   memset(ds, 0, sizeof(*ds));
+
+   /* Depth writes are enabled for every op.  See the Sandy Bridge PRM,
+    * Volume 1, Part 2, sections 7.5.3.1 Depth Buffer Clear, 7.5.3.2 Depth
+    * Buffer Resolve and 7.5.3.3 Hierarchical Depth Buffer Resolve.
+    */
+   ds->ds2.depth_write_enable = 1;
+   if (op != GEN6_HIZ_OP_DEPTH_RESOLVE)
+      return;
+
+   /* Only the depth resolve also enables the depth test (func: NEVER). */
+   ds->ds2.depth_test_enable = 1;
+   ds->ds2.depth_test_func = COMPAREFUNC_NEVER;
+}
+
+/** HiZ resolve of one slice. \see intel_context::vtbl::resolve_hiz_slice */
+void
+gen6_resolve_hiz_slice(struct intel_context *intel,
+                       struct intel_mipmap_tree *mt,
+                       uint32_t level,
+                       uint32_t layer)
+{
+   gen6_hiz_exec(intel, mt, level, layer, GEN6_HIZ_OP_HIZ_RESOLVE);
+}
+
+/** Depth resolve of one slice. \see intel_context::vtbl::resolve_depth_slice */
+void
+gen6_resolve_depth_slice(struct intel_context *intel,
+                         struct intel_mipmap_tree *mt,
+                         uint32_t level,
+                         uint32_t layer)
+{
+   gen6_hiz_exec(intel, mt, level, layer, GEN6_HIZ_OP_DEPTH_RESOLVE);
+}
diff --git a/src/mesa/drivers/dri/i965/gen6_blorp.h b/src/mesa/drivers/dri/i965/gen6_blorp.h
new file mode 100644
index 00000000000..5d6eefc2d64
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/gen6_blorp.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright © 2011 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct brw_context;
+struct intel_context;
+struct intel_mipmap_tree;
+
+/**
+ * HiZ operations; see the Sandy Bridge PRM, Volume 1, Part 2, sections:
+ *   - 7.5.3.1 Depth Buffer Clear
+ *   - 7.5.3.2 Depth Buffer Resolve
+ *   - 7.5.3.3 Hierarchical Depth Buffer Resolve
+ */
+enum gen6_hiz_op {
+   GEN6_HIZ_OP_DEPTH_CLEAR,
+   GEN6_HIZ_OP_DEPTH_RESOLVE,
+   GEN6_HIZ_OP_HIZ_RESOLVE,
+};
+
+/**
+ * \name HiZ internals
+ * \{
+ *
+ * Used internally by gen6_hiz_exec() and gen7_hiz_exec().
+ */
+
+void
+gen6_hiz_init(struct brw_context *brw);
+
+void
+gen6_hiz_emit_batch_head(struct brw_context *brw);
+
+void
+gen6_hiz_emit_vertices(struct brw_context *brw,
+                       struct intel_mipmap_tree *mt,
+                       unsigned int level,
+                       unsigned int layer);
+
+void
+gen6_hiz_emit_depth_stencil_state(struct brw_context *brw,
+                                  enum gen6_hiz_op op,
+                                  uint32_t *out_offset);
+/** \} */
+
+void
+gen6_resolve_hiz_slice(struct intel_context *intel,
+                       struct intel_mipmap_tree *mt,
+                       uint32_t level,
+                       uint32_t layer);
+
+void
+gen6_resolve_depth_slice(struct intel_context *intel,
+                         struct intel_mipmap_tree *mt,
+                         uint32_t level,
+                         uint32_t layer);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/mesa/drivers/dri/i965/gen6_cc.c b/src/mesa/drivers/dri/i965/gen6_cc.c
index 8a805fa6d77..b4c5329772f 100644
--- a/src/mesa/drivers/dri/i965/gen6_cc.c
+++ b/src/mesa/drivers/dri/i965/gen6_cc.c
@@ -92,12 +92,12 @@ gen6_upload_blend_state(struct brw_context *brw)
 	       intel_translate_logic_op(ctx->Color.LogicOp);
 	 }
       } else if (ctx->Color.BlendEnabled & (1 << b) && !integer) {
-	 GLenum eqRGB = ctx->Color.Blend[0].EquationRGB;
-	 GLenum eqA = ctx->Color.Blend[0].EquationA;
-	 GLenum srcRGB = ctx->Color.Blend[0].SrcRGB;
-	 GLenum dstRGB = ctx->Color.Blend[0].DstRGB;
-	 GLenum srcA = ctx->Color.Blend[0].SrcA;
-	 GLenum dstA = ctx->Color.Blend[0].DstA;
+	 GLenum eqRGB = ctx->Color.Blend[b].EquationRGB;
+	 GLenum eqA = ctx->Color.Blend[b].EquationA;
+	 GLenum srcRGB = ctx->Color.Blend[b].SrcRGB;
+	 GLenum dstRGB = ctx->Color.Blend[b].DstRGB;
+	 GLenum srcA = ctx->Color.Blend[b].SrcA;
+	 GLenum dstA = ctx->Color.Blend[b].DstA;
 
 	 if (eqRGB == GL_MIN || eqRGB == GL_MAX) {
 	    srcRGB = dstRGB = GL_ONE;
diff --git a/src/mesa/drivers/dri/i965/gen6_hiz.c b/src/mesa/drivers/dri/i965/gen6_hiz.c
deleted file mode 100644
index 9837b1fc54d..00000000000
--- a/src/mesa/drivers/dri/i965/gen6_hiz.c
+++ /dev/null
@@ -1,614 +0,0 @@
-/*
- * Copyright © 2011 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include <assert.h>
-
-#include "intel_batchbuffer.h"
-#include "intel_fbo.h"
-#include "intel_mipmap_tree.h"
-
-#include "brw_context.h"
-#include "brw_defines.h"
-#include "brw_state.h"
-
-#include "gen6_hiz.h"
-
-/**
- * \name Constants for HiZ VBO
- * \{
- *
- * \see brw_context::hiz::vertex_bo
- */
-#define GEN6_HIZ_NUM_VERTICES 3
-#define GEN6_HIZ_NUM_VUE_ELEMS 8
-#define GEN6_HIZ_VBO_SIZE (GEN6_HIZ_NUM_VERTICES \
-                           * GEN6_HIZ_NUM_VUE_ELEMS \
-                           * sizeof(float))
-/** \} */
-
-void
-gen6_hiz_emit_batch_head(struct brw_context *brw)
-{
-   struct gl_context *ctx = &brw->intel.ctx;
-   struct intel_context *intel = &brw->intel;
-
-   /* To ensure that the batch contains only the resolve, flush the batch
-    * before beginning and after finishing emitting the resolve packets.
-    *
-    * Ideally, we would not need to flush for the resolve op. But, I suspect
-    * that it's unsafe for CMD_PIPELINE_SELECT to occur multiple times in
-    * a single batch, and there is no safe way to ensure that other than by
-    * fencing the resolve with flushes. Ideally, we would just detect if
-    * a batch is in progress and do the right thing, but that would require
-    * the ability to *safely* access brw_context::state::dirty::brw
-    * outside of the brw_upload_state() codepath.
-    */
-   intel_flush(ctx);
-
-   /* CMD_PIPELINE_SELECT
-    *
-    * Select the 3D pipeline, as opposed to the media pipeline.
-    */
-   {
-      BEGIN_BATCH(1);
-      OUT_BATCH(brw->CMD_PIPELINE_SELECT << 16);
-      ADVANCE_BATCH();
-   }
-
-   /* 3DSTATE_MULTISAMPLE */
-   {
-      int length = intel->gen == 7 ? 4 : 3;
-
-      BEGIN_BATCH(length);
-      OUT_BATCH(_3DSTATE_MULTISAMPLE << 16 | (length - 2));
-      OUT_BATCH(MS_PIXEL_LOCATION_CENTER |
-                MS_NUMSAMPLES_1);
-      OUT_BATCH(0);
-      if (length >= 4)
-         OUT_BATCH(0);
-      ADVANCE_BATCH();
-
-   }
-
-   /* 3DSTATE_SAMPLE_MASK */
-   {
-      BEGIN_BATCH(2);
-      OUT_BATCH(_3DSTATE_SAMPLE_MASK << 16 | (2 - 2));
-      OUT_BATCH(1);
-      ADVANCE_BATCH();
-   }
-
-   /* CMD_STATE_BASE_ADDRESS
-    *
-    * From the Sandy Bridge PRM, Volume 1, Part 1, Table STATE_BASE_ADDRESS:
-    *     The following commands must be reissued following any change to the
-    *     base addresses:
-    *         3DSTATE_CC_POINTERS
-    *         3DSTATE_BINDING_TABLE_POINTERS
-    *         3DSTATE_SAMPLER_STATE_POINTERS
-    *         3DSTATE_VIEWPORT_STATE_POINTERS
-    *         MEDIA_STATE_POINTERS
-    */
-   {
-      BEGIN_BATCH(10);
-      OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (10 - 2));
-      OUT_BATCH(1); /* GeneralStateBaseAddressModifyEnable */
-      /* SurfaceStateBaseAddress */
-      OUT_RELOC(intel->batch.bo, I915_GEM_DOMAIN_SAMPLER, 0, 1);
-      /* DynamicStateBaseAddress */
-      OUT_RELOC(intel->batch.bo, (I915_GEM_DOMAIN_RENDER |
-                                  I915_GEM_DOMAIN_INSTRUCTION), 0, 1);
-      OUT_BATCH(1); /* IndirectObjectBaseAddress */
-      OUT_BATCH(1); /* InstructionBaseAddress */
-      OUT_BATCH(1); /* GeneralStateUpperBound */
-      OUT_BATCH(1); /* DynamicStateUpperBound */
-      OUT_BATCH(1); /* IndirectObjectUpperBound*/
-      OUT_BATCH(1); /* InstructionAccessUpperBound */
-      ADVANCE_BATCH();
-   }
-}
-
-void
-gen6_hiz_emit_vertices(struct brw_context *brw,
-                       struct intel_mipmap_tree *mt,
-                       unsigned int level,
-                       unsigned int layer)
-{
-   struct intel_context *intel = &brw->intel;
-   uint32_t vertex_offset;
-
-   /* Setup VBO for the rectangle primitive..
-    *
-    * A rectangle primitive (3DPRIM_RECTLIST) consists of only three
-    * vertices. The vertices reside in screen space with DirectX coordinates
-    * (that is, (0, 0) is the upper left corner).
-    *
-    *   v2 ------ implied
-    *    |        |
-    *    |        |
-    *   v0 ----- v1
-    *
-    * Since the VS is disabled, the clipper loads each VUE directly from
-    * the URB. This is controlled by the 3DSTATE_VERTEX_BUFFERS and
-    * 3DSTATE_VERTEX_ELEMENTS packets below. The VUE contents are as follows:
-    *   dw0: Reserved, MBZ.
-    *   dw1: Render Target Array Index. The HiZ op does not use indexed
-    *        vertices, so set the dword to 0.
-    *   dw2: Viewport Index. The HiZ op disables viewport mapping and
-    *        scissoring, so set the dword to 0.
-    *   dw3: Point Width: The HiZ op does not emit the POINTLIST primitive, so
-    *        set the dword to 0.
-    *   dw4: Vertex Position X.
-    *   dw5: Vertex Position Y.
-    *   dw6: Vertex Position Z.
-    *   dw7: Vertex Position W.
-    *
-    * For details, see the Sandybridge PRM, Volume 2, Part 1, Section 1.5.1
-    * "Vertex URB Entry (VUE) Formats".
-    */
-   {
-      const int width = mt->level[level].width;
-      const int height = mt->level[level].height;
-      float *vertex_data;
-
-      const float vertices[GEN6_HIZ_VBO_SIZE] = {
-         /* v0 */ 0, 0, 0, 0,         0, height, 0, 1,
-         /* v1 */ 0, 0, 0, 0,     width, height, 0, 1,
-         /* v2 */ 0, 0, 0, 0,         0,      0, 0, 1,
-      };
-
-      vertex_data = brw_state_batch(brw, AUB_TRACE_NO_TYPE,
-				    GEN6_HIZ_VBO_SIZE, 32, &vertex_offset);
-      memcpy(vertex_data, vertices, GEN6_HIZ_VBO_SIZE);
-   }
-
-   /* 3DSTATE_VERTEX_BUFFERS */
-   {
-      const int num_buffers = 1;
-      const int batch_length = 1 + 4 * num_buffers;
-
-      uint32_t dw0 = GEN6_VB0_ACCESS_VERTEXDATA |
-                     (GEN6_HIZ_NUM_VUE_ELEMS * sizeof(float)) << BRW_VB0_PITCH_SHIFT;
-
-      if (intel->gen >= 7)
-         dw0 |= GEN7_VB0_ADDRESS_MODIFYENABLE;
-
-      BEGIN_BATCH(batch_length);
-      OUT_BATCH((_3DSTATE_VERTEX_BUFFERS << 16) | (batch_length - 2));
-      OUT_BATCH(dw0);
-      /* start address */
-      OUT_RELOC(intel->batch.bo, I915_GEM_DOMAIN_VERTEX, 0,
-		vertex_offset);
-      /* end address */
-      OUT_RELOC(intel->batch.bo, I915_GEM_DOMAIN_VERTEX, 0,
-		vertex_offset + GEN6_HIZ_VBO_SIZE - 1);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   }
-
-   /* 3DSTATE_VERTEX_ELEMENTS
-    *
-    * Fetch dwords 0 - 7 from each VUE. See the comments above where
-    * hiz->vertex_bo is filled with data.
-    */
-   {
-      const int num_elements = 2;
-      const int batch_length = 1 + 2 * num_elements;
-
-      BEGIN_BATCH(batch_length);
-      OUT_BATCH((_3DSTATE_VERTEX_ELEMENTS << 16) | (batch_length - 2));
-      /* Element 0 */
-      OUT_BATCH(GEN6_VE0_VALID |
-                BRW_SURFACEFORMAT_R32G32B32A32_FLOAT << BRW_VE0_FORMAT_SHIFT |
-                0 << BRW_VE0_SRC_OFFSET_SHIFT);
-      OUT_BATCH(BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_0_SHIFT |
-                BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_1_SHIFT |
-                BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_2_SHIFT |
-                BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_3_SHIFT);
-      /* Element 1 */
-      OUT_BATCH(GEN6_VE0_VALID |
-                BRW_SURFACEFORMAT_R32G32B32A32_FLOAT << BRW_VE0_FORMAT_SHIFT |
-                16 << BRW_VE0_SRC_OFFSET_SHIFT);
-      OUT_BATCH(BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_0_SHIFT |
-                BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_1_SHIFT |
-                BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_2_SHIFT |
-                BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_3_SHIFT);
-      ADVANCE_BATCH();
-   }
-}
-
-/**
- * \brief Execute a HiZ op on a miptree slice.
- *
- * To execute the HiZ op, this function manually constructs and emits a batch
- * to "draw" the HiZ op's rectangle primitive. The batchbuffer is flushed
- * before constructing and after emitting the batch.
- *
- * This function alters no GL state.
- *
- * For an overview of HiZ ops, see the following sections of the Sandy Bridge
- * PRM, Volume 1, Part 2:
- *   - 7.5.3.1 Depth Buffer Clear
- *   - 7.5.3.2 Depth Buffer Resolve
- *   - 7.5.3.3 Hierarchical Depth Buffer Resolve
- */
-static void
-gen6_hiz_exec(struct intel_context *intel,
-              struct intel_mipmap_tree *mt,
-              unsigned int level,
-              unsigned int layer,
-              enum gen6_hiz_op op)
-{
-   struct gl_context *ctx = &intel->ctx;
-   struct brw_context *brw = brw_context(ctx);
-
-   assert(op != GEN6_HIZ_OP_DEPTH_CLEAR); /* Not implemented yet. */
-   assert(mt->hiz_mt != NULL);
-   intel_miptree_check_level_layer(mt, level, layer);
-
-   gen6_hiz_emit_batch_head(brw);
-   gen6_hiz_emit_vertices(brw, mt, level, layer);
-
-   /* 3DSTATE_URB
-    *
-    * Assign the entire URB to the VS. Even though the VS disabled, URB space
-    * is still needed because the clipper loads the VUE's from the URB. From
-    * the Sandybridge PRM, Volume 2, Part 1, Section 3DSTATE,
-    * Dword 1.15:0 "VS Number of URB Entries":
-    *     This field is always used (even if VS Function Enable is DISABLED).
-    *
-    * The warning below appears in the PRM (Section 3DSTATE_URB), but we can
-    * safely ignore it because this batch contains only one draw call.
-    *     Because of URB corruption caused by allocating a previous GS unit
-    *     URB entry to the VS unit, software is required to send a “GS NULL
-    *     Fence” (Send URB fence with VS URB size == 1 and GS URB size == 0)
-    *     plus a dummy DRAW call before any case where VS will be taking over
-    *     GS URB space.
-    */
-   {
-      BEGIN_BATCH(3);
-      OUT_BATCH(_3DSTATE_URB << 16 | (3 - 2));
-      OUT_BATCH(brw->urb.max_vs_entries << GEN6_URB_VS_ENTRIES_SHIFT);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   }
-
-   /* 3DSTATE_CC_STATE_POINTERS
-    *
-    * The pointer offsets are relative to
-    * CMD_STATE_BASE_ADDRESS.DynamicStateBaseAddress.
-    *
-    * The HiZ op doesn't use BLEND_STATE or COLOR_CALC_STATE.
-    */
-   {
-      uint32_t depthstencil_offset;
-      gen6_hiz_emit_depth_stencil_state(brw, op, &depthstencil_offset);
-
-      BEGIN_BATCH(4);
-      OUT_BATCH(_3DSTATE_CC_STATE_POINTERS << 16 | (4 - 2));
-      OUT_BATCH(1); /* BLEND_STATE offset */
-      OUT_BATCH(depthstencil_offset | 1); /* DEPTH_STENCIL_STATE offset */
-      OUT_BATCH(1); /* COLOR_CALC_STATE offset */
-      ADVANCE_BATCH();
-   }
-
-   /* 3DSTATE_VS
-    *
-    * Disable vertex shader.
-    */
-   {
-      /* From the BSpec, Volume 2a, Part 3 "Vertex Shader", Section
-       * 3DSTATE_VS, Dword 5.0 "VS Function Enable":
-       *   [DevSNB] A pipeline flush must be programmed prior to a 3DSTATE_VS
-       *   command that causes the VS Function Enable to toggle. Pipeline
-       *   flush can be executed by sending a PIPE_CONTROL command with CS
-       *   stall bit set and a post sync operation.
-       */
-      intel_emit_post_sync_nonzero_flush(intel);
-
-      BEGIN_BATCH(6);
-      OUT_BATCH(_3DSTATE_VS << 16 | (6 - 2));
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   }
-
-   /* 3DSTATE_GS
-    *
-    * Disable the geometry shader.
-    */
-   {
-      BEGIN_BATCH(7);
-      OUT_BATCH(_3DSTATE_GS << 16 | (7 - 2));
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   }
-
-   /* 3DSTATE_CLIP
-    *
-    * Disable the clipper.
-    *
-    * The HiZ op emits a rectangle primitive, which requires clipping to
-    * be disabled. From page 10 of the Sandy Bridge PRM Volume 2 Part 1
-    * Section 1.3 "3D Primitives Overview":
-    *    RECTLIST:
-    *    Either the CLIP unit should be DISABLED, or the CLIP unit's Clip
-    *    Mode should be set to a value other than CLIPMODE_NORMAL.
-    *
-    * Also disable perspective divide. This doesn't change the clipper's
-    * output, but does spare a few electrons.
-    */
-   {
-      BEGIN_BATCH(4);
-      OUT_BATCH(_3DSTATE_CLIP << 16 | (4 - 2));
-      OUT_BATCH(0);
-      OUT_BATCH(GEN6_CLIP_PERSPECTIVE_DIVIDE_DISABLE);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   }
-
-   /* 3DSTATE_SF
-    *
-    * Disable ViewportTransformEnable (dw2.1)
-    *
-    * From the SandyBridge PRM, Volume 2, Part 1, Section 1.3, "3D
-    * Primitives Overview":
-    *     RECTLIST: Viewport Mapping must be DISABLED (as is typical with the
-    *     use of screen- space coordinates).
-    *
-    * A solid rectangle must be rendered, so set FrontFaceFillMode (dw2.4:3)
-    * and BackFaceFillMode (dw2.5:6) to SOLID(0).
-    *
-    * From the Sandy Bridge PRM, Volume 2, Part 1, Section
-    * 6.4.1.1 3DSTATE_SF, Field FrontFaceFillMode:
-    *     SOLID: Any triangle or rectangle object found to be front-facing
-    *     is rendered as a solid object. This setting is required when
-    *     (rendering rectangle (RECTLIST) objects.
-    */
-   {
-      BEGIN_BATCH(20);
-      OUT_BATCH(_3DSTATE_SF << 16 | (20 - 2));
-      OUT_BATCH((1 - 1) << GEN6_SF_NUM_OUTPUTS_SHIFT | /* only position */
-                1 << GEN6_SF_URB_ENTRY_READ_LENGTH_SHIFT |
-                0 << GEN6_SF_URB_ENTRY_READ_OFFSET_SHIFT);
-      for (int i = 0; i < 18; ++i)
-         OUT_BATCH(0);
-      ADVANCE_BATCH();
-   }
-
-   /* 3DSTATE_WM
-    *
-    * Disable thread dispatch (dw5.19) and enable the HiZ op.
-    *
-    * Even though thread dispatch is disabled, max threads (dw5.25:31) must be
-    * nonzero to prevent the GPU from hanging. See the valid ranges in the
-    * BSpec, Volume 2a.11 Windower, Section 3DSTATE_WM, Dword 5.25:31
-    * "Maximum Number Of Threads".
-    */
-   {
-      uint32_t dw4 = 0;
-
-      switch (op) {
-      case GEN6_HIZ_OP_DEPTH_CLEAR:
-         assert(!"not implemented");
-         dw4 |= GEN6_WM_DEPTH_CLEAR;
-         break;
-      case GEN6_HIZ_OP_DEPTH_RESOLVE:
-         dw4 |= GEN6_WM_DEPTH_RESOLVE;
-         break;
-      case GEN6_HIZ_OP_HIZ_RESOLVE:
-         dw4 |= GEN6_WM_HIERARCHICAL_DEPTH_RESOLVE;
-         break;
-      default:
-         assert(0);
-         break;
-      }
-
-      BEGIN_BATCH(9);
-      OUT_BATCH(_3DSTATE_WM << 16 | (9 - 2));
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(dw4);
-      OUT_BATCH((brw->max_wm_threads - 1) << GEN6_WM_MAX_THREADS_SHIFT);
-      OUT_BATCH((1 - 1) << GEN6_WM_NUM_SF_OUTPUTS_SHIFT); /* only position */
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   }
-
-   /* 3DSTATE_DEPTH_BUFFER */
-   {
-      uint32_t width = mt->level[level].width;
-      uint32_t height = mt->level[level].height;
-
-      uint32_t tile_x;
-      uint32_t tile_y;
-      uint32_t offset;
-      {
-         /* Construct a dummy renderbuffer just to extract tile offsets. */
-         struct intel_renderbuffer rb;
-         rb.mt = mt;
-         rb.mt_level = level;
-         rb.mt_layer = layer;
-         intel_renderbuffer_set_draw_offset(&rb);
-         offset = intel_renderbuffer_tile_offsets(&rb, &tile_x, &tile_y);
-      }
-
-      uint32_t format;
-      switch (mt->format) {
-      case MESA_FORMAT_Z16:       format = BRW_DEPTHFORMAT_D16_UNORM; break;
-      case MESA_FORMAT_Z32_FLOAT: format = BRW_DEPTHFORMAT_D32_FLOAT; break;
-      case MESA_FORMAT_X8_Z24:    format = BRW_DEPTHFORMAT_D24_UNORM_X8_UINT; break;
-      default:                    assert(0); break;
-      }
-
-      intel_emit_post_sync_nonzero_flush(intel);
-      intel_emit_depth_stall_flushes(intel);
-
-      BEGIN_BATCH(7);
-      OUT_BATCH(_3DSTATE_DEPTH_BUFFER << 16 | (7 - 2));
-      OUT_BATCH(((mt->region->pitch * mt->region->cpp) - 1) |
-                format << 18 |
-                1 << 21 | /* separate stencil enable */
-                1 << 22 | /* hiz enable */
-                BRW_TILEWALK_YMAJOR << 26 |
-                1 << 27 | /* y-tiled */
-                BRW_SURFACE_2D << 29);
-      OUT_RELOC(mt->region->bo,
-                I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                offset);
-      OUT_BATCH(BRW_SURFACE_MIPMAPLAYOUT_BELOW << 1 |
-                (width + tile_x - 1) << 6 |
-                (height + tile_y - 1) << 19);
-      OUT_BATCH(0);
-      OUT_BATCH(tile_x |
-                tile_y << 16);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   }
-
-   /* 3DSTATE_HIER_DEPTH_BUFFER */
-   {
-      struct intel_region *hiz_region = mt->hiz_mt->region;
-
-      BEGIN_BATCH(3);
-      OUT_BATCH((_3DSTATE_HIER_DEPTH_BUFFER << 16) | (3 - 2));
-      OUT_BATCH(hiz_region->pitch * hiz_region->cpp - 1);
-      OUT_RELOC(hiz_region->bo,
-                I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                0);
-      ADVANCE_BATCH();
-   }
-
-   /* 3DSTATE_STENCIL_BUFFER */
-   {
-      BEGIN_BATCH(3);
-      OUT_BATCH((_3DSTATE_STENCIL_BUFFER << 16) | (3 - 2));
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   }
-
-   /* 3DSTATE_CLEAR_PARAMS
-    *
-    * From the Sandybridge PRM, Volume 2, Part 1, Section 3DSTATE_CLEAR_PARAMS:
-    *   [DevSNB] 3DSTATE_CLEAR_PARAMS packet must follow the DEPTH_BUFFER_STATE
-    *   packet when HiZ is enabled and the DEPTH_BUFFER_STATE changes.
-    */
-   {
-      BEGIN_BATCH(2);
-      OUT_BATCH(_3DSTATE_CLEAR_PARAMS << 16 | (2 - 2));
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   }
-
-   /* 3DSTATE_DRAWING_RECTANGLE */
-   {
-      BEGIN_BATCH(4);
-      OUT_BATCH(_3DSTATE_DRAWING_RECTANGLE << 16 | (4 - 2));
-      OUT_BATCH(0);
-      OUT_BATCH(((mt->level[level].width - 1) & 0xffff) |
-                ((mt->level[level].height - 1) << 16));
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   }
-
-   /* 3DPRIMITIVE */
-   {
-     BEGIN_BATCH(6);
-     OUT_BATCH(CMD_3D_PRIM << 16 | (6 - 2) |
-               _3DPRIM_RECTLIST << GEN4_3DPRIM_TOPOLOGY_TYPE_SHIFT |
-               GEN4_3DPRIM_VERTEXBUFFER_ACCESS_SEQUENTIAL);
-     OUT_BATCH(3); /* vertex count per instance */
-     OUT_BATCH(0);
-     OUT_BATCH(1); /* instance count */
-     OUT_BATCH(0);
-     OUT_BATCH(0);
-     ADVANCE_BATCH();
-   }
-
-   /* See comments above at first invocation of intel_flush() in
-    * gen6_hiz_emit_batch_head().
-    */
-   intel_flush(ctx);
-
-   /* Be safe. */
-   brw->state.dirty.brw = ~0;
-   brw->state.dirty.cache = ~0;
-}
-
-/**
- * \param out_offset is relative to
- *        CMD_STATE_BASE_ADDRESS.DynamicStateBaseAddress.
- */
-void
-gen6_hiz_emit_depth_stencil_state(struct brw_context *brw,
-                                  enum gen6_hiz_op op,
-                                  uint32_t *out_offset)
-{
-   struct gen6_depth_stencil_state *state;
-   state = brw_state_batch(brw, AUB_TRACE_DEPTH_STENCIL_STATE,
-                              sizeof(*state), 64,
-                              out_offset);
-   memset(state, 0, sizeof(*state));
-
-   /* See the following sections of the Sandy Bridge PRM, Volume 1, Part2:
-    *   - 7.5.3.1 Depth Buffer Clear
-    *   - 7.5.3.2 Depth Buffer Resolve
-    *   - 7.5.3.3 Hierarchical Depth Buffer Resolve
-    */
-   state->ds2.depth_write_enable = 1;
-   if (op == GEN6_HIZ_OP_DEPTH_RESOLVE) {
-      state->ds2.depth_test_enable = 1;
-      state->ds2.depth_test_func = COMPAREFUNC_NEVER;
-   }
-}
-
-/** \see intel_context::vtbl::resolve_hiz_slice */
-void
-gen6_resolve_hiz_slice(struct intel_context *intel,
-                       struct intel_mipmap_tree *mt,
-                       uint32_t level,
-                       uint32_t layer)
-{
-   gen6_hiz_exec(intel, mt, level, layer, GEN6_HIZ_OP_HIZ_RESOLVE);
-}
-
-/** \see intel_context::vtbl::resolve_depth_slice */
-void
-gen6_resolve_depth_slice(struct intel_context *intel,
-                         struct intel_mipmap_tree *mt,
-                         uint32_t level,
-                         uint32_t layer)
-{
-   gen6_hiz_exec(intel, mt, level, layer, GEN6_HIZ_OP_DEPTH_RESOLVE);
-}
diff --git a/src/mesa/drivers/dri/i965/gen6_hiz.h b/src/mesa/drivers/dri/i965/gen6_hiz.h
deleted file mode 100644
index 0a13ba076ac..00000000000
--- a/src/mesa/drivers/dri/i965/gen6_hiz.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright © 2011 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#pragma once
-
-#include <stdint.h>
-
-struct intel_context;
-struct intel_mipmap_tree;
-
-/**
- * For an overview of the HiZ operations, see the following sections of the
- * Sandy Bridge PRM, Volume 1, Part2:
- *   - 7.5.3.1 Depth Buffer Clear
- *   - 7.5.3.2 Depth Buffer Resolve
- *   - 7.5.3.3 Hierarchical Depth Buffer Resolve
- */
-enum gen6_hiz_op {
-   GEN6_HIZ_OP_DEPTH_CLEAR,
-   GEN6_HIZ_OP_DEPTH_RESOLVE,
-   GEN6_HIZ_OP_HIZ_RESOLVE,
-};
-
-/**
- * \name HiZ internals
- * \{
- *
- * Used internally by gen6_hiz_exec() and gen7_hiz_exec().
- */
-
-void
-gen6_hiz_init(struct brw_context *brw);
-
-void
-gen6_hiz_emit_batch_head(struct brw_context *brw);
-
-void
-gen6_hiz_emit_vertices(struct brw_context *brw,
-                       struct intel_mipmap_tree *mt,
-                       unsigned int level,
-                       unsigned int layer);
-
-void
-gen6_hiz_emit_depth_stencil_state(struct brw_context *brw,
-                                  enum gen6_hiz_op op,
-                                  uint32_t *out_offset);
-/** \} */
-
-void
-gen6_resolve_hiz_slice(struct intel_context *intel,
-                       struct intel_mipmap_tree *mt,
-                       uint32_t level,
-                       uint32_t layer);
-
-void
-gen6_resolve_depth_slice(struct intel_context *intel,
-                         struct intel_mipmap_tree *mt,
-                         uint32_t level,
-                         uint32_t layer);
diff --git a/src/mesa/drivers/dri/i965/gen6_scissor_state.c b/src/mesa/drivers/dri/i965/gen6_scissor_state.c
index feeca28d459..4c7f81c1dd3 100644
--- a/src/mesa/drivers/dri/i965/gen6_scissor_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_scissor_state.c
@@ -29,13 +29,14 @@
 #include "brw_state.h"
 #include "brw_defines.h"
 #include "intel_batchbuffer.h"
+#include "main/fbobject.h"
 
 static void
 gen6_upload_scissor_state(struct brw_context *brw)
 {
    struct intel_context *intel = &brw->intel;
    struct gl_context *ctx = &intel->ctx;
-   const bool render_to_fbo = (ctx->DrawBuffer->Name != 0);
+   const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
    struct gen6_scissor_rect *scissor;
    uint32_t scissor_state_offset;
 
diff --git a/src/mesa/drivers/dri/i965/gen6_sf_state.c b/src/mesa/drivers/dri/i965/gen6_sf_state.c
index 95ed1f74ce4..5c4293ca91e 100644
--- a/src/mesa/drivers/dri/i965/gen6_sf_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_sf_state.c
@@ -30,6 +30,7 @@
 #include "brw_defines.h"
 #include "brw_util.h"
 #include "main/macros.h"
+#include "main/fbobject.h"
 #include "intel_batchbuffer.h"
 
 /**
@@ -120,7 +121,7 @@ upload_sf_state(struct brw_context *brw)
    uint32_t dw1, dw2, dw3, dw4, dw16, dw17;
    int i;
    /* _NEW_BUFFER */
-   bool render_to_fbo = brw->intel.ctx.DrawBuffer->Name != 0;
+   bool render_to_fbo = _mesa_is_user_fbo(brw->intel.ctx.DrawBuffer);
    int attr = 0, input_index = 0;
    int urb_entry_read_offset = 1;
    float point_size;
diff --git a/src/mesa/drivers/dri/i965/gen6_viewport_state.c b/src/mesa/drivers/dri/i965/gen6_viewport_state.c
index 9ff2bd95680..f787ac71164 100644
--- a/src/mesa/drivers/dri/i965/gen6_viewport_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_viewport_state.c
@@ -29,6 +29,7 @@
 #include "brw_state.h"
 #include "brw_defines.h"
 #include "intel_batchbuffer.h"
+#include "main/fbobject.h"
 
 /* The clip VP defines the guardband region where expensive clipping is skipped
  * and fragments are allowed to be generated and clipped out cheaply by the SF.
@@ -70,7 +71,7 @@ gen6_upload_sf_vp(struct brw_context *brw)
    const GLfloat depth_scale = 1.0F / ctx->DrawBuffer->_DepthMaxF;
    struct brw_sf_viewport *sfv;
    GLfloat y_scale, y_bias;
-   const bool render_to_fbo = (ctx->DrawBuffer->Name != 0);
+   const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
    const GLfloat *v = ctx->Viewport._WindowMap.m;
 
    sfv = brw_state_batch(brw, AUB_TRACE_SF_VP_STATE,
diff --git a/src/mesa/drivers/dri/i965/gen7_blorp.cpp b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
new file mode 100644
index 00000000000..9d21ec97819
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
@@ -0,0 +1,501 @@
+/*
+ * Copyright © 2011 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <assert.h>
+
+#include "intel_batchbuffer.h"
+#include "intel_fbo.h"
+#include "intel_mipmap_tree.h"
+
+#include "brw_context.h"
+#include "brw_defines.h"
+#include "brw_state.h"
+
+#include "gen6_blorp.h"
+#include "gen7_blorp.h"
+
+/**
+ * \copydoc gen6_hiz_exec()
+ */
+static void
+gen7_hiz_exec(struct intel_context *intel,
+              struct intel_mipmap_tree *mt,
+              unsigned int level,
+              unsigned int layer,
+              enum gen6_hiz_op op)
+{
+   struct gl_context *ctx = &intel->ctx;
+   struct brw_context *brw = brw_context(ctx);
+   uint32_t draw_x, draw_y;
+   uint32_t tile_mask_x, tile_mask_y;
+
+   assert(op != GEN6_HIZ_OP_DEPTH_CLEAR); /* Not implemented yet. */
+   assert(mt->hiz_mt != NULL);
+   intel_miptree_check_level_layer(mt, level, layer);
+
+   uint32_t depth_format;
+   switch (mt->format) {
+   case MESA_FORMAT_Z16:       depth_format = BRW_DEPTHFORMAT_D16_UNORM; break;
+   case MESA_FORMAT_Z32_FLOAT: depth_format = BRW_DEPTHFORMAT_D32_FLOAT; break;
+   case MESA_FORMAT_X8_Z24:    depth_format = BRW_DEPTHFORMAT_D24_UNORM_X8_UINT; break;
+   default:                    assert(0); break;
+   }
+
+   {
+      /* Construct a dummy renderbuffer just to extract tile offsets. */
+      struct intel_renderbuffer rb;
+      rb.mt = mt;
+      rb.mt_level = level;
+      rb.mt_layer = layer;
+      intel_renderbuffer_set_draw_offset(&rb);
+      draw_x = rb.draw_x;
+      draw_y = rb.draw_y;
+   }
+
+   /* Compute masks to determine how much of draw_x and draw_y should be
+    * performed using the fine adjustment of "depth coordinate offset X/Y"
+    * (dw5 of 3DSTATE_DEPTH_BUFFER).  See the emit_depthbuffer() function for
+    * details.
+    */
+   {
+      uint32_t depth_mask_x, depth_mask_y, hiz_mask_x, hiz_mask_y;
+      intel_region_get_tile_masks(mt->region, &depth_mask_x, &depth_mask_y);
+      intel_region_get_tile_masks(mt->hiz_mt->region,
+                                  &hiz_mask_x, &hiz_mask_y);
+
+      /* Each HiZ row represents 2 rows of pixels */
+      hiz_mask_y = hiz_mask_y << 1 | 1;
+
+      tile_mask_x = depth_mask_x | hiz_mask_x;
+      tile_mask_y = depth_mask_y | hiz_mask_y;
+   }
+
+   gen6_hiz_emit_batch_head(brw);
+   gen6_hiz_emit_vertices(brw, mt, level, layer);
+
+   /* 3DSTATE_URB_VS
+    * 3DSTATE_URB_HS
+    * 3DSTATE_URB_DS
+    * 3DSTATE_URB_GS
+    *
+    * If the 3DSTATE_URB_VS is emitted, then the others must be also. From the
+    * BSpec, Volume 2a "3D Pipeline Overview", Section 1.7.1 3DSTATE_URB_VS:
+    *     3DSTATE_URB_HS, 3DSTATE_URB_DS, and 3DSTATE_URB_GS must also be
+    *     programmed in order for the programming of this state to be
+    *     valid.
+    */
+   {
+      /* The minimum valid value is 32. See 3DSTATE_URB_VS,
+       * Dword 1.15:0 "VS Number of URB Entries".
+       */
+      int num_vs_entries = 32;
+
+      BEGIN_BATCH(2);
+      OUT_BATCH(_3DSTATE_URB_VS << 16 | (2 - 2));
+      OUT_BATCH(1 << GEN7_URB_ENTRY_SIZE_SHIFT |
+                0 << GEN7_URB_STARTING_ADDRESS_SHIFT |
+                num_vs_entries);
+      ADVANCE_BATCH();
+
+      BEGIN_BATCH(2);
+      OUT_BATCH(_3DSTATE_URB_GS << 16 | (2 - 2));
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+
+      BEGIN_BATCH(2);
+      OUT_BATCH(_3DSTATE_URB_HS << 16 | (2 - 2));
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+
+      BEGIN_BATCH(2);
+      OUT_BATCH(_3DSTATE_URB_DS << 16 | (2 - 2));
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+   }
+
+   /* 3DSTATE_DEPTH_STENCIL_STATE_POINTERS
+    *
+    * The offset is relative to CMD_STATE_BASE_ADDRESS.DynamicStateBaseAddress.
+    */
+   {
+      uint32_t depthstencil_offset;
+      gen6_hiz_emit_depth_stencil_state(brw, op, &depthstencil_offset);
+
+      BEGIN_BATCH(2);
+      OUT_BATCH(_3DSTATE_DEPTH_STENCIL_STATE_POINTERS << 16 | (2 - 2));
+      OUT_BATCH(depthstencil_offset | 1);
+      ADVANCE_BATCH();
+   }
+
+   /* 3DSTATE_VS
+    *
+    * Disable vertex shader.
+    */
+   {
+      BEGIN_BATCH(6);
+      OUT_BATCH(_3DSTATE_VS << 16 | (6 - 2));
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+   }
+
+   /* 3DSTATE_HS
+    *
+    * Disable the hull shader.
+    */
+   {
+      BEGIN_BATCH(7);
+      OUT_BATCH(_3DSTATE_HS << 16 | (7 - 2));
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+   }
+
+   /* 3DSTATE_TE
+    *
+    * Disable the tessellation engine.
+    */
+   {
+      BEGIN_BATCH(4);
+      OUT_BATCH(_3DSTATE_TE << 16 | (4 - 2));
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+   }
+
+   /* 3DSTATE_DS
+    *
+    * Disable the domain shader.
+    */
+   {
+      BEGIN_BATCH(6);
+      OUT_BATCH(_3DSTATE_DS << 16 | (6 - 2));
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+   }
+
+   /* 3DSTATE_GS
+    *
+    * Disable the geometry shader.
+    */
+   {
+      BEGIN_BATCH(7);
+      OUT_BATCH(_3DSTATE_GS << 16 | (7 - 2));
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+   }
+
+   /* 3DSTATE_STREAMOUT
+    *
+    * Disable streamout.
+    */
+   {
+      BEGIN_BATCH(3);
+      OUT_BATCH(_3DSTATE_STREAMOUT << 16 | (3 - 2));
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+   }
+
+   /* 3DSTATE_CLIP
+    *
+    * Disable the clipper.
+    *
+    * The HiZ op emits a rectangle primitive, which requires clipping to
+    * be disabled. From page 10 of the Sandy Bridge PRM Volume 2 Part 1
+    * Section 1.3 "3D Primitives Overview":
+    *    RECTLIST:
+    *    Either the CLIP unit should be DISABLED, or the CLIP unit's Clip
+    *    Mode should be set to a value other than CLIPMODE_NORMAL.
+    *
+    * Also disable perspective divide. This doesn't change the clipper's
+    * output, but does spare a few electrons.
+    */
+   {
+      BEGIN_BATCH(4);
+      OUT_BATCH(_3DSTATE_CLIP << 16 | (4 - 2));
+      OUT_BATCH(0);
+      OUT_BATCH(GEN6_CLIP_PERSPECTIVE_DIVIDE_DISABLE);
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+   }
+
+   /* 3DSTATE_SF
+    *
+    * Disable ViewportTransformEnable (dw1.1)
+    *
+    * From the SandyBridge PRM, Volume 2, Part 1, Section 1.3, "3D
+    * Primitives Overview":
+    *     RECTLIST: Viewport Mapping must be DISABLED (as is typical with the
+    *     use of screen- space coordinates).
+    *
+    * A solid rectangle must be rendered, so set FrontFaceFillMode (dw1.6:5)
+    * and BackFaceFillMode (dw1.4:3) to SOLID(0).
+    *
+    * From the Sandy Bridge PRM, Volume 2, Part 1, Section
+    * 6.4.1.1 3DSTATE_SF, Field FrontFaceFillMode:
+    *     SOLID: Any triangle or rectangle object found to be front-facing
+    *     is rendered as a solid object. This setting is required when
+    *     (rendering rectangle (RECTLIST) objects.
+    */
+   {
+      BEGIN_BATCH(7);
+      OUT_BATCH(_3DSTATE_SF << 16 | (7 - 2));
+      OUT_BATCH(depth_format << GEN7_SF_DEPTH_BUFFER_SURFACE_FORMAT_SHIFT);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+   }
+
+   /* 3DSTATE_SBE */
+   {
+      BEGIN_BATCH(14);
+      OUT_BATCH(_3DSTATE_SBE << 16 | (14 - 2));
+      OUT_BATCH((1 - 1) << GEN7_SBE_NUM_OUTPUTS_SHIFT | /* only position */
+                1 << GEN7_SBE_URB_ENTRY_READ_LENGTH_SHIFT |
+                0 << GEN7_SBE_URB_ENTRY_READ_OFFSET_SHIFT);
+      for (int i = 0; i < 12; ++i)
+         OUT_BATCH(0);
+      ADVANCE_BATCH();
+   }
+
+   /* 3DSTATE_WM
+    *
+    * Disable PS thread dispatch (dw1.29) and enable the HiZ op.
+    */
+   {
+      uint32_t dw1 = 0;
+
+      switch (op) {
+      case GEN6_HIZ_OP_DEPTH_CLEAR:
+         assert(!"not implemented");
+         dw1 |= GEN7_WM_DEPTH_CLEAR;
+         break;
+      case GEN6_HIZ_OP_DEPTH_RESOLVE:
+         dw1 |= GEN7_WM_DEPTH_RESOLVE;
+         break;
+      case GEN6_HIZ_OP_HIZ_RESOLVE:
+         dw1 |= GEN7_WM_HIERARCHICAL_DEPTH_RESOLVE;
+         break;
+      default:
+         assert(0);
+         break;
+      }
+
+      BEGIN_BATCH(3);
+      OUT_BATCH(_3DSTATE_WM << 16 | (3 - 2));
+      OUT_BATCH(dw1);
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+   }
+
+   /* 3DSTATE_PS
+    *
+    * Pixel shader dispatch is disabled above in 3DSTATE_WM, dw1.29. Despite
+    * that, thread dispatch info must still be specified.
+    *     - Maximum Number of Threads (dw4.24:31) must be nonzero, as the BSpec
+    *       states that the valid range for this field is [0x3, 0x2f].
+    *     - A dispatch mode must be given; that is, at least one of the
+    *       "N Pixel Dispatch Enable" (N=8,16,32) fields must be set. This was
+    *       discovered through simulator error messages.
+    */
+   {
+      BEGIN_BATCH(8);
+      OUT_BATCH(_3DSTATE_PS << 16 | (8 - 2));
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(((brw->max_wm_threads - 1) << IVB_PS_MAX_THREADS_SHIFT) |
+		GEN7_PS_32_DISPATCH_ENABLE);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+   }
+
+   /* 3DSTATE_DEPTH_BUFFER */
+   {
+      uint32_t width = mt->level[level].width;
+      uint32_t height = mt->level[level].height;
+
+      uint32_t tile_x = draw_x & tile_mask_x;
+      uint32_t tile_y = draw_y & tile_mask_y;
+      uint32_t offset = intel_region_get_aligned_offset(mt->region,
+                                                        draw_x & ~tile_mask_x,
+                                                        draw_y & ~tile_mask_y);
+
+      /* According to the Sandy Bridge PRM, volume 2 part 1, pp326-327
+       * (3DSTATE_DEPTH_BUFFER dw5), in the documentation for "Depth
+       * Coordinate Offset X/Y":
+       *
+       *   "The 3 LSBs of both offsets must be zero to ensure correct
+       *   alignment"
+       *
+       * We have no guarantee that tile_x and tile_y are correctly aligned,
+       * since they are determined by the mipmap layout, which is only aligned
+       * to multiples of 4.
+       *
+       * So, to avoid hanging the GPU, just smash the low order 3 bits of
+       * tile_x and tile_y to 0.  This is a temporary workaround until we come
+       * up with a better solution.
+       */
+      tile_x &= ~7;
+      tile_y &= ~7;
+
+      intel_emit_depth_stall_flushes(intel);
+
+      BEGIN_BATCH(7);
+      OUT_BATCH(GEN7_3DSTATE_DEPTH_BUFFER << 16 | (7 - 2));
+      OUT_BATCH(((mt->region->pitch * mt->region->cpp) - 1) |
+                depth_format << 18 |
+                1 << 22 | /* hiz enable */
+                1 << 28 | /* depth write */
+                BRW_SURFACE_2D << 29);
+      OUT_RELOC(mt->region->bo,
+                I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
+                offset);
+      OUT_BATCH((width + tile_x - 1) << 4 |
+                (height + tile_y - 1) << 18);
+      OUT_BATCH(0);
+      OUT_BATCH(tile_x |
+                tile_y << 16);
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+   }
+
+   /* 3DSTATE_HIER_DEPTH_BUFFER */
+   {
+      struct intel_region *hiz_region = mt->hiz_mt->region;
+      uint32_t hiz_offset =
+         intel_region_get_aligned_offset(hiz_region,
+                                         draw_x & ~tile_mask_x,
+                                         (draw_y & ~tile_mask_y) / 2);
+
+      BEGIN_BATCH(3);
+      OUT_BATCH((GEN7_3DSTATE_HIER_DEPTH_BUFFER << 16) | (3 - 2));
+      OUT_BATCH(hiz_region->pitch * hiz_region->cpp - 1);
+      OUT_RELOC(hiz_region->bo,
+                I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
+                hiz_offset);
+      ADVANCE_BATCH();
+   }
+
+   /* 3DSTATE_STENCIL_BUFFER */
+   {
+      BEGIN_BATCH(3);
+      OUT_BATCH((GEN7_3DSTATE_STENCIL_BUFFER << 16) | (3 - 2));
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+   }
+
+   /* 3DSTATE_CLEAR_PARAMS
+    *
+    * From the BSpec, Volume 2a.11 Windower, Section 1.5.6.3.2
+    * 3DSTATE_CLEAR_PARAMS:
+    *    [DevIVB] 3DSTATE_CLEAR_PARAMS must always be programmed in the along
+    *    with the other Depth/Stencil state commands(i.e.  3DSTATE_DEPTH_BUFFER,
+    *    3DSTATE_STENCIL_BUFFER, or 3DSTATE_HIER_DEPTH_BUFFER).
+    */
+   {
+      BEGIN_BATCH(3);
+      OUT_BATCH(GEN7_3DSTATE_CLEAR_PARAMS << 16 | (3 - 2));
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+   }
+
+   /* 3DSTATE_DRAWING_RECTANGLE */
+   {
+      BEGIN_BATCH(4);
+      OUT_BATCH(_3DSTATE_DRAWING_RECTANGLE << 16 | (4 - 2));
+      OUT_BATCH(0);
+      OUT_BATCH(((mt->level[level].width - 1) & 0xffff) |
+                ((mt->level[level].height - 1) << 16));
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+   }
+
+   /* 3DPRIMITIVE */
+   {
+     BEGIN_BATCH(7);
+     OUT_BATCH(CMD_3D_PRIM << 16 | (7 - 2));
+     OUT_BATCH(GEN7_3DPRIM_VERTEXBUFFER_ACCESS_SEQUENTIAL |
+               _3DPRIM_RECTLIST);
+     OUT_BATCH(3); /* vertex count per instance */
+     OUT_BATCH(0);
+     OUT_BATCH(1); /* instance count */
+     OUT_BATCH(0);
+     OUT_BATCH(0);
+     ADVANCE_BATCH();
+   }
+
+   /* See comments above at first invocation of intel_flush() in
+    * gen6_hiz_emit_batch_head().
+    */
+   intel_flush(ctx);
+
+   /* Be safe. */
+   brw->state.dirty.brw = ~0;
+   brw->state.dirty.cache = ~0;
+}
+
+/** \copydoc gen6_resolve_hiz_slice() */
+void
+gen7_resolve_hiz_slice(struct intel_context *intel,
+                       struct intel_mipmap_tree *mt,
+                       uint32_t level,
+                       uint32_t layer)
+{
+   gen7_hiz_exec(intel, mt, level, layer, GEN6_HIZ_OP_HIZ_RESOLVE);
+}
+
+/** \copydoc gen6_resolve_depth_slice() */
+void
+gen7_resolve_depth_slice(struct intel_context *intel,
+                         struct intel_mipmap_tree *mt,
+                         uint32_t level,
+                         uint32_t layer)
+{
+   gen7_hiz_exec(intel, mt, level, layer, GEN6_HIZ_OP_DEPTH_RESOLVE);
+}
diff --git a/src/mesa/drivers/dri/i965/gen7_blorp.h b/src/mesa/drivers/dri/i965/gen7_blorp.h
new file mode 100644
index 00000000000..6c0fc84eca1
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/gen7_blorp.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright © 2011 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct intel_context;
+struct intel_mipmap_tree;
+
+/** \copydoc gen6_resolve_hiz_slice() */
+void
+gen7_resolve_hiz_slice(struct intel_context *intel,
+                       struct intel_mipmap_tree *mt,
+                       uint32_t level,
+                       uint32_t layer);
+
+/** \copydoc gen6_resolve_depth_slice() */
+void
+gen7_resolve_depth_slice(struct intel_context *intel,
+                         struct intel_mipmap_tree *mt,
+                         uint32_t level,
+                         uint32_t layer);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/mesa/drivers/dri/i965/gen7_clip_state.c b/src/mesa/drivers/dri/i965/gen7_clip_state.c
index f78b089484d..5fede36a20d 100644
--- a/src/mesa/drivers/dri/i965/gen7_clip_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_clip_state.c
@@ -26,6 +26,7 @@
 #include "brw_defines.h"
 #include "brw_util.h"
 #include "intel_batchbuffer.h"
+#include "main/fbobject.h"
 
 static void
 upload_clip_state(struct brw_context *brw)
@@ -38,7 +39,7 @@ upload_clip_state(struct brw_context *brw)
    uint32_t nonperspective_barycentric_enable_flag = 0;
 
    /* _NEW_BUFFERS */
-   bool render_to_fbo = brw->intel.ctx.DrawBuffer->Name != 0;
+   bool render_to_fbo = _mesa_is_user_fbo(brw->intel.ctx.DrawBuffer);
 
    /* CACHE_NEW_WM_PROG */
    if (brw->wm.prog_data->barycentric_interp_modes &
diff --git a/src/mesa/drivers/dri/i965/gen7_hiz.c b/src/mesa/drivers/dri/i965/gen7_hiz.c
deleted file mode 100644
index 18c178eb041..00000000000
--- a/src/mesa/drivers/dri/i965/gen7_hiz.c
+++ /dev/null
@@ -1,455 +0,0 @@
-/*
- * Copyright © 2011 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include <assert.h>
-
-#include "intel_batchbuffer.h"
-#include "intel_fbo.h"
-#include "intel_mipmap_tree.h"
-
-#include "brw_context.h"
-#include "brw_defines.h"
-#include "brw_state.h"
-
-#include "gen6_hiz.h"
-#include "gen7_hiz.h"
-
-/**
- * \copydoc gen6_hiz_exec()
- */
-static void
-gen7_hiz_exec(struct intel_context *intel,
-              struct intel_mipmap_tree *mt,
-              unsigned int level,
-              unsigned int layer,
-              enum gen6_hiz_op op)
-{
-   struct gl_context *ctx = &intel->ctx;
-   struct brw_context *brw = brw_context(ctx);
-
-   assert(op != GEN6_HIZ_OP_DEPTH_CLEAR); /* Not implemented yet. */
-   assert(mt->hiz_mt != NULL);
-   intel_miptree_check_level_layer(mt, level, layer);
-
-   uint32_t depth_format;
-   switch (mt->format) {
-   case MESA_FORMAT_Z16:       depth_format = BRW_DEPTHFORMAT_D16_UNORM; break;
-   case MESA_FORMAT_Z32_FLOAT: depth_format = BRW_DEPTHFORMAT_D32_FLOAT; break;
-   case MESA_FORMAT_X8_Z24:    depth_format = BRW_DEPTHFORMAT_D24_UNORM_X8_UINT; break;
-   default:                    assert(0); break;
-   }
-
-   gen6_hiz_emit_batch_head(brw);
-   gen6_hiz_emit_vertices(brw, mt, level, layer);
-
-   /* 3DSTATE_URB_VS
-    * 3DSTATE_URB_HS
-    * 3DSTATE_URB_DS
-    * 3DSTATE_URB_GS
-    *
-    * If the 3DSTATE_URB_VS is emitted, than the others must be also. From the
-    * BSpec, Volume 2a "3D Pipeline Overview", Section 1.7.1 3DSTATE_URB_VS:
-    *     3DSTATE_URB_HS, 3DSTATE_URB_DS, and 3DSTATE_URB_GS must also be
-    *     programmed in order for the programming of this state to be
-    *     valid.
-    */
-   {
-      /* The minimum valid value is 32. See 3DSTATE_URB_VS,
-       * Dword 1.15:0 "VS Number of URB Entries".
-       */
-      int num_vs_entries = 32;
-
-      BEGIN_BATCH(2);
-      OUT_BATCH(_3DSTATE_URB_VS << 16 | (2 - 2));
-      OUT_BATCH(1 << GEN7_URB_ENTRY_SIZE_SHIFT |
-                0 << GEN7_URB_STARTING_ADDRESS_SHIFT |
-                num_vs_entries);
-      ADVANCE_BATCH();
-
-      BEGIN_BATCH(2);
-      OUT_BATCH(_3DSTATE_URB_GS << 16 | (2 - 2));
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-
-      BEGIN_BATCH(2);
-      OUT_BATCH(_3DSTATE_URB_HS << 16 | (2 - 2));
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-
-      BEGIN_BATCH(2);
-      OUT_BATCH(_3DSTATE_URB_DS << 16 | (2 - 2));
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   }
-
-   /* 3DSTATE_DEPTH_STENCIL_STATE_POINTERS
-    *
-    * The offset is relative to CMD_STATE_BASE_ADDRESS.DynamicStateBaseAddress.
-    */
-   {
-      uint32_t depthstencil_offset;
-      gen6_hiz_emit_depth_stencil_state(brw, op, &depthstencil_offset);
-
-      BEGIN_BATCH(2);
-      OUT_BATCH(_3DSTATE_DEPTH_STENCIL_STATE_POINTERS << 16 | (2 - 2));
-      OUT_BATCH(depthstencil_offset | 1);
-      ADVANCE_BATCH();
-   }
-
-   /* 3DSTATE_VS
-    *
-    * Disable vertex shader.
-    */
-   {
-      BEGIN_BATCH(6);
-      OUT_BATCH(_3DSTATE_VS << 16 | (6 - 2));
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   }
-
-   /* 3DSTATE_HS
-    *
-    * Disable the hull shader.
-    */
-   {
-      BEGIN_BATCH(7);
-      OUT_BATCH(_3DSTATE_HS << 16 | (7 - 2));
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   }
-
-   /* 3DSTATE_TE
-    *
-    * Disable the tesselation engine.
-    */
-   {
-      BEGIN_BATCH(4);
-      OUT_BATCH(_3DSTATE_TE << 16 | (4 - 2));
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   }
-
-   /* 3DSTATE_DS
-    *
-    * Disable the domain shader.
-    */
-   {
-      BEGIN_BATCH(6);
-      OUT_BATCH(_3DSTATE_DS << 16 | (6 - 2));
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   }
-
-   /* 3DSTATE_GS
-    *
-    * Disable the geometry shader.
-    */
-   {
-      BEGIN_BATCH(7);
-      OUT_BATCH(_3DSTATE_GS << 16 | (7 - 2));
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   }
-
-   /* 3DSTATE_STREAMOUT
-    *
-    * Disable streamout.
-    */
-   {
-      BEGIN_BATCH(3);
-      OUT_BATCH(_3DSTATE_STREAMOUT << 16 | (3 - 2));
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   }
-
-   /* 3DSTATE_CLIP
-    *
-    * Disable the clipper.
-    *
-    * The HiZ op emits a rectangle primitive, which requires clipping to
-    * be disabled. From page 10 of the Sandy Bridge PRM Volume 2 Part 1
-    * Section 1.3 "3D Primitives Overview":
-    *    RECTLIST:
-    *    Either the CLIP unit should be DISABLED, or the CLIP unit's Clip
-    *    Mode should be set to a value other than CLIPMODE_NORMAL.
-    *
-    * Also disable perspective divide. This doesn't change the clipper's
-    * output, but does spare a few electrons.
-    */
-   {
-      BEGIN_BATCH(4);
-      OUT_BATCH(_3DSTATE_CLIP << 16 | (4 - 2));
-      OUT_BATCH(0);
-      OUT_BATCH(GEN6_CLIP_PERSPECTIVE_DIVIDE_DISABLE);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   }
-
-   /* 3DSTATE_SF
-    *
-    * Disable ViewportTransformEnable (dw1.1)
-    *
-    * From the SandyBridge PRM, Volume 2, Part 1, Section 1.3, "3D
-    * Primitives Overview":
-    *     RECTLIST: Viewport Mapping must be DISABLED (as is typical with the
-    *     use of screen- space coordinates).
-    *
-    * A solid rectangle must be rendered, so set FrontFaceFillMode (dw1.6:5)
-    * and BackFaceFillMode (dw1.4:3) to SOLID(0).
-    *
-    * From the Sandy Bridge PRM, Volume 2, Part 1, Section
-    * 6.4.1.1 3DSTATE_SF, Field FrontFaceFillMode:
-    *     SOLID: Any triangle or rectangle object found to be front-facing
-    *     is rendered as a solid object. This setting is required when
-    *     (rendering rectangle (RECTLIST) objects.
-    */
-   {
-      BEGIN_BATCH(7);
-      OUT_BATCH(_3DSTATE_SF << 16 | (7 - 2));
-      OUT_BATCH(depth_format << GEN7_SF_DEPTH_BUFFER_SURFACE_FORMAT_SHIFT);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   }
-
-   /* 3DSTATE_SBE */
-   {
-      BEGIN_BATCH(14);
-      OUT_BATCH(_3DSTATE_SBE << 16 | (14 - 2));
-      OUT_BATCH((1 - 1) << GEN7_SBE_NUM_OUTPUTS_SHIFT | /* only position */
-                1 << GEN7_SBE_URB_ENTRY_READ_LENGTH_SHIFT |
-                0 << GEN7_SBE_URB_ENTRY_READ_OFFSET_SHIFT);
-      for (int i = 0; i < 12; ++i)
-         OUT_BATCH(0);
-      ADVANCE_BATCH();
-   }
-
-   /* 3DSTATE_WM
-    *
-    * Disable PS thread dispatch (dw1.29) and enable the HiZ op.
-    */
-   {
-      uint32_t dw1 = 0;
-
-      switch (op) {
-      case GEN6_HIZ_OP_DEPTH_CLEAR:
-         assert(!"not implemented");
-         dw1 |= GEN7_WM_DEPTH_CLEAR;
-         break;
-      case GEN6_HIZ_OP_DEPTH_RESOLVE:
-         dw1 |= GEN7_WM_DEPTH_RESOLVE;
-         break;
-      case GEN6_HIZ_OP_HIZ_RESOLVE:
-         dw1 |= GEN7_WM_HIERARCHICAL_DEPTH_RESOLVE;
-         break;
-      default:
-         assert(0);
-         break;
-      }
-
-      BEGIN_BATCH(3);
-      OUT_BATCH(_3DSTATE_WM << 16 | (3 - 2));
-      OUT_BATCH(dw1);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   }
-
-   /* 3DSTATE_PS
-    *
-    * Pixel shader dispatch is disabled above in 3DSTATE_WM, dw1.29. Despite
-    * that, thread dispatch info must still be specified.
-    *     - Maximum Number of Threads (dw4.24:31) must be nonzero, as the BSpec
-    *       states that the valid range for this field is [0x3, 0x2f].
-    *     - A dispatch mode must be given; that is, at least one of the
-    *       "N Pixel Dispatch Enable" (N=8,16,32) fields must be set. This was
-    *       discovered through simulator error messages.
-    */
-   {
-      BEGIN_BATCH(8);
-      OUT_BATCH(_3DSTATE_PS << 16 | (8 - 2));
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(((brw->max_wm_threads - 1) << IVB_PS_MAX_THREADS_SHIFT) |
-		GEN7_PS_32_DISPATCH_ENABLE);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   }
-
-   /* 3DSTATE_DEPTH_BUFFER */
-   {
-      uint32_t width = mt->level[level].width;
-      uint32_t height = mt->level[level].height;
-
-      uint32_t tile_x;
-      uint32_t tile_y;
-      uint32_t offset;
-      {
-         /* Construct a dummy renderbuffer just to extract tile offsets. */
-         struct intel_renderbuffer rb;
-         rb.mt = mt;
-         rb.mt_level = level;
-         rb.mt_layer = layer;
-         intel_renderbuffer_set_draw_offset(&rb);
-         offset = intel_renderbuffer_tile_offsets(&rb, &tile_x, &tile_y);
-      }
-
-      intel_emit_depth_stall_flushes(intel);
-
-      BEGIN_BATCH(7);
-      OUT_BATCH(GEN7_3DSTATE_DEPTH_BUFFER << 16 | (7 - 2));
-      OUT_BATCH(((mt->region->pitch * mt->region->cpp) - 1) |
-                depth_format << 18 |
-                1 << 22 | /* hiz enable */
-                1 << 28 | /* depth write */
-                BRW_SURFACE_2D << 29);
-      OUT_RELOC(mt->region->bo,
-                I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                offset);
-      OUT_BATCH((width + tile_x - 1) << 4 |
-                (height + tile_y - 1) << 18);
-      OUT_BATCH(0);
-      OUT_BATCH(tile_x |
-                tile_y << 16);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   }
-
-   /* 3DSTATE_HIER_DEPTH_BUFFER */
-   {
-      struct intel_region *hiz_region = mt->hiz_mt->region;
-
-      BEGIN_BATCH(3);
-      OUT_BATCH((GEN7_3DSTATE_HIER_DEPTH_BUFFER << 16) | (3 - 2));
-      OUT_BATCH(hiz_region->pitch * hiz_region->cpp - 1);
-      OUT_RELOC(hiz_region->bo,
-                I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                0);
-      ADVANCE_BATCH();
-   }
-
-   /* 3DSTATE_STENCIL_BUFFER */
-   {
-      BEGIN_BATCH(3);
-      OUT_BATCH((GEN7_3DSTATE_STENCIL_BUFFER << 16) | (3 - 2));
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   }
-
-   /* 3DSTATE_CLEAR_PARAMS
-    *
-    * From the BSpec, Volume 2a.11 Windower, Section 1.5.6.3.2
-    * 3DSTATE_CLEAR_PARAMS:
-    *    [DevIVB] 3DSTATE_CLEAR_PARAMS must always be programmed in the along
-    *    with the other Depth/Stencil state commands(i.e.  3DSTATE_DEPTH_BUFFER,
-    *    3DSTATE_STENCIL_BUFFER, or 3DSTATE_HIER_DEPTH_BUFFER).
-    */
-   {
-      BEGIN_BATCH(3);
-      OUT_BATCH(GEN7_3DSTATE_CLEAR_PARAMS << 16 | (3 - 2));
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   }
-
-   /* 3DSTATE_DRAWING_RECTANGLE */
-   {
-      BEGIN_BATCH(4);
-      OUT_BATCH(_3DSTATE_DRAWING_RECTANGLE << 16 | (4 - 2));
-      OUT_BATCH(0);
-      OUT_BATCH(((mt->level[level].width - 1) & 0xffff) |
-                ((mt->level[level].height - 1) << 16));
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   }
-
-   /* 3DPRIMITIVE */
-   {
-     BEGIN_BATCH(7);
-     OUT_BATCH(CMD_3D_PRIM << 16 | (7 - 2));
-     OUT_BATCH(GEN7_3DPRIM_VERTEXBUFFER_ACCESS_SEQUENTIAL |
-               _3DPRIM_RECTLIST);
-     OUT_BATCH(3); /* vertex count per instance */
-     OUT_BATCH(0);
-     OUT_BATCH(1); /* instance count */
-     OUT_BATCH(0);
-     OUT_BATCH(0);
-     ADVANCE_BATCH();
-   }
-
-   /* See comments above at first invocation of intel_flush() in
-    * gen6_hiz_emit_batch_head().
-    */
-   intel_flush(ctx);
-
-   /* Be safe. */
-   brw->state.dirty.brw = ~0;
-   brw->state.dirty.cache = ~0;
-}
-
-/** \copydoc gen6_resolve_hiz_slice() */
-void
-gen7_resolve_hiz_slice(struct intel_context *intel,
-                       struct intel_mipmap_tree *mt,
-                       uint32_t level,
-                       uint32_t layer)
-{
-   gen7_hiz_exec(intel, mt, level, layer, GEN6_HIZ_OP_HIZ_RESOLVE);
-}
-
-/** \copydoc gen6_resolve_depth_slice() */
-void
-gen7_resolve_depth_slice(struct intel_context *intel,
-                         struct intel_mipmap_tree *mt,
-                         uint32_t level,
-                         uint32_t layer)
-{
-   gen7_hiz_exec(intel, mt, level, layer, GEN6_HIZ_OP_DEPTH_RESOLVE);
-}
diff --git a/src/mesa/drivers/dri/i965/gen7_hiz.h b/src/mesa/drivers/dri/i965/gen7_hiz.h
deleted file mode 100644
index b89ffb00711..00000000000
--- a/src/mesa/drivers/dri/i965/gen7_hiz.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright © 2011 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#pragma once
-
-#include <stdint.h>
-
-struct intel_context;
-struct intel_mipmap_tree;
-
-/** \copydoc gen6_resolve_hiz_slice() */
-void
-gen7_resolve_hiz_slice(struct intel_context *intel,
-                       struct intel_mipmap_tree *mt,
-                       uint32_t level,
-                       uint32_t layer);
-
-/** \copydoc gen6_resolve_depth_slice() */
-void
-gen7_resolve_depth_slice(struct intel_context *intel,
-                         struct intel_mipmap_tree *mt,
-                         uint32_t level,
-                         uint32_t layer);
diff --git a/src/mesa/drivers/dri/i965/gen7_misc_state.c b/src/mesa/drivers/dri/i965/gen7_misc_state.c
index 3a6144f2838..4a5b5a6ae2e 100644
--- a/src/mesa/drivers/dri/i965/gen7_misc_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_misc_state.c
@@ -42,18 +42,58 @@ static void emit_depthbuffer(struct brw_context *brw)
 			    *stencil_mt = NULL,
 			    *hiz_mt = NULL;
 
+   /* Amount by which drawing should be offset in order to draw to the
+    * appropriate miplevel/zoffset/cubeface.  We will extract these values
+    * from depth_irb or stencil_irb once we determine which is present.
+    */
+   uint32_t draw_x = 0, draw_y = 0;
+
+   /* Masks used to determine how much of the draw_x and draw_y offsets should
+    * be performed using the fine adjustment of "depth coordinate offset X/Y"
+    * (dw5 of 3DSTATE_DEPTH_BUFFER).  Any remaining coarse adjustment will be
+    * performed by changing the base addresses of the buffers.
+    *
+    * Since the HiZ, depth, and stencil buffers all use the same "depth
+    * coordinate offset X/Y" values, we need to make sure that the coarse
+    * adjustment will be possible to apply to all three buffers.  Since coarse
+    * adjustment can only be applied in multiples of the tile size, we will OR
+    * together the tile masks of all the buffers to determine which offsets to
+    * perform as fine adjustments.
+    */
+   uint32_t tile_mask_x = 0, tile_mask_y = 0;
+
    if (drb)
       depth_mt = drb->mt;
 
-   if (depth_mt)
+   if (depth_mt) {
       hiz_mt = depth_mt->hiz_mt;
 
+      intel_region_get_tile_masks(depth_mt->region,
+                                  &tile_mask_x, &tile_mask_y);
+
+      if (hiz_mt) {
+         uint32_t hiz_tile_mask_x, hiz_tile_mask_y;
+         intel_region_get_tile_masks(hiz_mt->region,
+                                     &hiz_tile_mask_x, &hiz_tile_mask_y);
+
+         /* Each HiZ row represents 2 rows of pixels */
+         hiz_tile_mask_y = hiz_tile_mask_y << 1 | 1;
+
+         tile_mask_x |= hiz_tile_mask_x;
+         tile_mask_y |= hiz_tile_mask_y;
+      }
+   }
+
    if (srb) {
       stencil_mt = srb->mt;
       if (stencil_mt->stencil_mt)
 	 stencil_mt = stencil_mt->stencil_mt;
 
       assert(stencil_mt->format == MESA_FORMAT_S8);
+
+      /* Stencil buffer uses 64x64 tiles. */
+      tile_mask_x |= 63;
+      tile_mask_y |= 63;
    }
 
    /* Gen7 doesn't support packed depth/stencil */
@@ -65,6 +105,7 @@ static void emit_depthbuffer(struct brw_context *brw)
    if (depth_mt == NULL) {
       uint32_t dw1 = BRW_DEPTHFORMAT_D32_FLOAT << 18;
       uint32_t dw3 = 0;
+      uint32_t tile_x = 0, tile_y = 0; /* used below even if never set */
 
       if (stencil_mt == NULL) {
 	 dw1 |= (BRW_SURFACE_NULL << 29);
@@ -72,10 +113,33 @@ static void emit_depthbuffer(struct brw_context *brw)
 	 /* _NEW_STENCIL: enable stencil buffer writes */
 	 dw1 |= ((ctx->Stencil.WriteMask != 0) << 27);
 
+         draw_x = srb->draw_x;
+         draw_y = srb->draw_y;
+         tile_x = draw_x & tile_mask_x;
+         tile_y = draw_y & tile_mask_y;
+
+         /* According to the Sandy Bridge PRM, volume 2 part 1, pp326-327
+          * (3DSTATE_DEPTH_BUFFER dw5), in the documentation for "Depth
+          * Coordinate Offset X/Y":
+          *
+          *   "The 3 LSBs of both offsets must be zero to ensure correct
+          *   alignment"
+          *
+          * We have no guarantee that tile_x and tile_y are correctly aligned,
+          * since they are determined by the mipmap layout, which is only
+          * aligned to multiples of 4.
+          *
+          * So, to avoid hanging the GPU, just smash the low order 3 bits of
+          * tile_x and tile_y to 0.  This is a temporary workaround until we
+          * come up with a better solution.
+          */
+         tile_x &= ~7;
+         tile_y &= ~7;
+
 	 /* 3DSTATE_STENCIL_BUFFER inherits surface type and dimensions. */
 	 dw1 |= (BRW_SURFACE_2D << 29);
-	 dw3 = ((srb->Base.Base.Width - 1) << 4) |
-	       ((srb->Base.Base.Height - 1) << 18);
+	 dw3 = ((srb->Base.Base.Width + tile_x - 1) << 4) |
+	       ((srb->Base.Base.Height + tile_y - 1) << 18);
       }
 
       BEGIN_BATCH(7);
@@ -84,14 +148,39 @@ static void emit_depthbuffer(struct brw_context *brw)
       OUT_BATCH(0);
       OUT_BATCH(dw3);
       OUT_BATCH(0);
-      OUT_BATCH(0);
+      OUT_BATCH(tile_x | (tile_y << 16));
       OUT_BATCH(0);
       ADVANCE_BATCH();
    } else {
       struct intel_region *region = depth_mt->region;
       uint32_t tile_x, tile_y, offset;
 
-      offset = intel_renderbuffer_tile_offsets(drb, &tile_x, &tile_y);
+      draw_x = drb->draw_x;
+      draw_y = drb->draw_y;
+      tile_x = draw_x & tile_mask_x;
+      tile_y = draw_y & tile_mask_y;
+
+      /* According to the Sandy Bridge PRM, volume 2 part 1, pp326-327
+       * (3DSTATE_DEPTH_BUFFER dw5), in the documentation for "Depth
+       * Coordinate Offset X/Y":
+       *
+       *   "The 3 LSBs of both offsets must be zero to ensure correct
+       *   alignment"
+       *
+       * We have no guarantee that tile_x and tile_y are correctly aligned,
+       * since they are determined by the mipmap layout, which is only aligned
+       * to multiples of 4.
+       *
+       * So, to avoid hanging the GPU, just smash the low order 3 bits of
+       * tile_x and tile_y to 0.  This is a temporary workaround until we come
+       * up with a better solution.
+       */
+      tile_x &= ~7;
+      tile_y &= ~7;
+
+      offset = intel_region_get_aligned_offset(region,
+                                               draw_x & ~tile_mask_x,
+                                               draw_y & ~tile_mask_y);
 
       assert(region->tiling == I915_TILING_Y);
 
@@ -122,13 +211,17 @@ static void emit_depthbuffer(struct brw_context *brw)
       OUT_BATCH(0);
       ADVANCE_BATCH();
    } else {
+      uint32_t hiz_offset =
+         intel_region_get_aligned_offset(hiz_mt->region,
+                                         draw_x & ~tile_mask_x,
+                                         (draw_y & ~tile_mask_y) / 2);
       BEGIN_BATCH(3);
       OUT_BATCH(GEN7_3DSTATE_HIER_DEPTH_BUFFER << 16 | (3 - 2));
       OUT_BATCH(hiz_mt->region->pitch * hiz_mt->region->cpp - 1);
       OUT_RELOC(hiz_mt->region->bo,
                 I915_GEM_DOMAIN_RENDER,
                 I915_GEM_DOMAIN_RENDER,
-                0);
+                hiz_offset);
       ADVANCE_BATCH();
    }
 
@@ -141,6 +234,14 @@ static void emit_depthbuffer(struct brw_context *brw)
    } else {
       const int enabled = intel->is_haswell ? HSW_STENCIL_ENABLED : 0;
 
+      /* Note: We can't compute the stencil offset using
+       * intel_region_get_aligned_offset(), because the stencil region claims
+       * that the region is untiled; in fact it's W tiled.
+       */
+      uint32_t stencil_offset =
+         (draw_y & ~tile_mask_y) * stencil_mt->region->pitch +
+         (draw_x & ~tile_mask_x) * 64;
+
       BEGIN_BATCH(3);
       OUT_BATCH(GEN7_3DSTATE_STENCIL_BUFFER << 16 | (3 - 2));
       /* The stencil buffer has quirky pitch requirements.  From the Graphics
@@ -161,7 +262,7 @@ static void emit_depthbuffer(struct brw_context *brw)
 	        (2 * stencil_mt->region->pitch * stencil_mt->region->cpp - 1));
       OUT_RELOC(stencil_mt->region->bo,
 	        I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-		0);
+		stencil_offset);
       ADVANCE_BATCH();
    }
 
diff --git a/src/mesa/drivers/dri/i965/gen7_sf_state.c b/src/mesa/drivers/dri/i965/gen7_sf_state.c
index 5c51abc5a0e..5c6fcedcedf 100644
--- a/src/mesa/drivers/dri/i965/gen7_sf_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_sf_state.c
@@ -26,6 +26,7 @@
 #include "brw_defines.h"
 #include "brw_util.h"
 #include "main/macros.h"
+#include "main/fbobject.h"
 #include "intel_batchbuffer.h"
 
 static void
@@ -44,7 +45,7 @@ upload_sbe_state(struct brw_context *brw)
    int urb_entry_read_offset = 1;
    uint16_t attr_overrides[FRAG_ATTRIB_MAX];
    /* _NEW_BUFFERS */
-   bool render_to_fbo = ctx->DrawBuffer->Name != 0;
+   bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
    uint32_t point_sprite_origin;
 
    /* CACHE_NEW_VS_PROG */
@@ -159,7 +160,7 @@ upload_sf_state(struct brw_context *brw)
    uint32_t dw1, dw2, dw3;
    float point_size;
    /* _NEW_BUFFERS */
-   bool render_to_fbo = brw->intel.ctx.DrawBuffer->Name != 0;
+   bool render_to_fbo = _mesa_is_user_fbo(brw->intel.ctx.DrawBuffer);
 
    dw1 = GEN6_SF_STATISTICS_ENABLE |
          GEN6_SF_VIEWPORT_TRANSFORM_ENABLE;
diff --git a/src/mesa/drivers/dri/i965/gen7_viewport_state.c b/src/mesa/drivers/dri/i965/gen7_viewport_state.c
index d0b89d5fe57..2bcf338b85d 100644
--- a/src/mesa/drivers/dri/i965/gen7_viewport_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_viewport_state.c
@@ -25,6 +25,7 @@
 #include "brw_state.h"
 #include "brw_defines.h"
 #include "intel_batchbuffer.h"
+#include "main/fbobject.h"
 
 static void
 gen7_upload_sf_clip_viewport(struct brw_context *brw)
@@ -33,7 +34,7 @@ gen7_upload_sf_clip_viewport(struct brw_context *brw)
    struct gl_context *ctx = &intel->ctx;
    const GLfloat depth_scale = 1.0F / ctx->DrawBuffer->_DepthMaxF;
    GLfloat y_scale, y_bias;
-   const bool render_to_fbo = (ctx->DrawBuffer->Name != 0);
+   const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
    const GLfloat *v = ctx->Viewport._WindowMap.m;
    struct gen7_sf_clip_viewport *vp;
 
diff --git a/src/mesa/drivers/dri/i965/gen7_wm_surface_state.c b/src/mesa/drivers/dri/i965/gen7_wm_surface_state.c
index cbccd2b852d..8f62c040b6a 100644
--- a/src/mesa/drivers/dri/i965/gen7_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_wm_surface_state.c
@@ -289,8 +289,11 @@ gen7_update_renderbuffer_surface(struct brw_context *brw,
 
    switch (rb_format) {
    case MESA_FORMAT_SARGB8:
-      /* without GL_EXT_framebuffer_sRGB we shouldn't bind sRGB
-	 surfaces to the blend/update as sRGB */
+      /* _NEW_BUFFERS
+       *
+       * Without GL_EXT_framebuffer_sRGB we shouldn't bind sRGB surfaces to the
+       * blend/update as sRGB.
+       */
       if (ctx->Color.sRGBEnabled)
 	 surf->ss0.surface_format = brw_format_for_mesa_format(rb_format);
       else
diff --git a/src/mesa/drivers/dri/intel/intel_batchbuffer.h b/src/mesa/drivers/dri/intel/intel_batchbuffer.h
index 751ec992c97..d2744e44ac2 100644
--- a/src/mesa/drivers/dri/intel/intel_batchbuffer.h
+++ b/src/mesa/drivers/dri/intel/intel_batchbuffer.h
@@ -7,8 +7,14 @@
 #include "intel_bufmgr.h"
 #include "intel_reg.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #define BATCH_RESERVED 16
 
+struct intel_batchbuffer;
+
 void intel_batchbuffer_init(struct intel_context *intel);
 void intel_batchbuffer_reset(struct intel_context *intel);
 void intel_batchbuffer_free(struct intel_context *intel);
@@ -152,4 +158,8 @@ void intel_batchbuffer_cached_advance(struct intel_context *intel);
 #define ADVANCE_BATCH() intel_batchbuffer_advance(intel);
 #define CACHED_BATCH() intel_batchbuffer_cached_advance(intel);
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif
diff --git a/src/mesa/drivers/dri/intel/intel_buffers.c b/src/mesa/drivers/dri/intel/intel_buffers.c
index 9809f79bbee..2b0276327d2 100644
--- a/src/mesa/drivers/dri/intel/intel_buffers.c
+++ b/src/mesa/drivers/dri/intel/intel_buffers.c
@@ -30,6 +30,7 @@
 #include "intel_fbo.h"
 #include "intel_mipmap_tree.h"
 
+#include "main/fbobject.h"
 #include "main/framebuffer.h"
 #include "main/renderbuffer.h"
 
@@ -82,7 +83,7 @@ intel_check_front_buffer_rendering(struct intel_context *intel)
 static void
 intelDrawBuffer(struct gl_context * ctx, GLenum mode)
 {
-   if ((ctx->DrawBuffer != NULL) && (ctx->DrawBuffer->Name == 0)) {
+   if (ctx->DrawBuffer && _mesa_is_winsys_fbo(ctx->DrawBuffer)) {
       struct intel_context *const intel = intel_context(ctx);
       const bool was_front_buffer_rendering =
 	intel->is_front_buffer_rendering;
@@ -105,7 +106,7 @@ intelDrawBuffer(struct gl_context * ctx, GLenum mode)
 static void
 intelReadBuffer(struct gl_context * ctx, GLenum mode)
 {
-   if ((ctx->DrawBuffer != NULL) && (ctx->DrawBuffer->Name == 0)) {
+   if (ctx->DrawBuffer && _mesa_is_winsys_fbo(ctx->DrawBuffer)) {
       struct intel_context *const intel = intel_context(ctx);
       const bool was_front_buffer_reading =
 	intel->is_front_buffer_reading;
diff --git a/src/mesa/drivers/dri/intel/intel_context.c b/src/mesa/drivers/dri/intel/intel_context.c
index b8472b6fd38..f572f382882 100644
--- a/src/mesa/drivers/dri/intel/intel_context.c
+++ b/src/mesa/drivers/dri/intel/intel_context.c
@@ -215,7 +215,7 @@ intel_flush_front(struct gl_context *ctx)
     __DRIcontext *driContext = intel->driContext;
     __DRIscreen *const screen = intel->intelScreen->driScrnPriv;
 
-   if ((ctx->DrawBuffer->Name == 0) && intel->front_buffer_dirty) {
+    if (_mesa_is_winsys_fbo(ctx->DrawBuffer) && intel->front_buffer_dirty) {
       if (screen->dri2.loader &&
           (screen->dri2.loader->base.version >= 2)
 	  && (screen->dri2.loader->flushFrontBuffer != NULL) &&
@@ -454,7 +454,7 @@ intel_viewport(struct gl_context *ctx, GLint x, GLint y, GLsizei w, GLsizei h)
     if (intel->saved_viewport)
 	intel->saved_viewport(ctx, x, y, w, h);
 
-    if (ctx->DrawBuffer->Name == 0) {
+    if (_mesa_is_winsys_fbo(ctx->DrawBuffer)) {
        dri2InvalidateDrawable(driContext->driDrawablePriv);
        dri2InvalidateDrawable(driContext->driReadablePriv);
     }
diff --git a/src/mesa/drivers/dri/intel/intel_context.h b/src/mesa/drivers/dri/intel/intel_context.h
index 8ba727030fc..065f1d6d01a 100644
--- a/src/mesa/drivers/dri/intel/intel_context.h
+++ b/src/mesa/drivers/dri/intel/intel_context.h
@@ -49,7 +49,6 @@ extern "C" {
 
 #ifdef __cplusplus
 	#undef virtual
-}
 #endif
 
 #include "tnl/t_vertex.h"
@@ -117,6 +116,32 @@ struct intel_sync_object {
 
 struct brw_context;
 
+struct intel_batchbuffer {
+   /** Current batchbuffer being queued up. */
+   drm_intel_bo *bo;
+   /** Last BO submitted to the hardware.  Used for glFinish(). */
+   drm_intel_bo *last_bo;
+   /** BO for post-sync nonzero writes for gen6 workaround. */
+   drm_intel_bo *workaround_bo;
+   bool need_workaround_flush;
+
+   struct cached_batch_item *cached_items;
+
+   uint16_t emit, total;
+   uint16_t used, reserved_space;
+   uint32_t map[8192];
+#define BATCH_SZ (8192*sizeof(uint32_t))
+
+   uint32_t state_batch_offset;
+   bool is_blit;
+   bool needs_sol_reset;
+
+   struct {
+      uint16_t used;
+      int reloc_count;
+   } saved;
+};
+
 /**
  * intel_context is derived from Mesa's context class: struct gl_context.
  */
@@ -219,31 +244,7 @@ struct intel_context
 
    int urb_size;
 
-   struct intel_batchbuffer {
-      /** Current batchbuffer being queued up. */
-      drm_intel_bo *bo;
-      /** Last BO submitted to the hardware.  Used for glFinish(). */
-      drm_intel_bo *last_bo;
-      /** BO for post-sync nonzero writes for gen6 workaround. */
-      drm_intel_bo *workaround_bo;
-      bool need_workaround_flush;
-
-      struct cached_batch_item *cached_items;
-
-      uint16_t emit, total;
-      uint16_t used, reserved_space;
-      uint32_t map[8192];
-#define BATCH_SZ (8192*sizeof(uint32_t))
-
-      uint32_t state_batch_offset;
-      bool is_blit;
-      bool needs_sol_reset;
-
-      struct {
-	 uint16_t used;
-	 int reloc_count;
-      } saved;
-   } batch;
+   struct intel_batchbuffer batch;
 
    drm_intel_bo *first_post_swapbuffers_batch;
    bool need_throttle;
@@ -607,4 +608,8 @@ is_power_of_two(uint32_t value)
    return (value & (value - 1)) == 0;
 }
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif
diff --git a/src/mesa/drivers/dri/intel/intel_extensions.c b/src/mesa/drivers/dri/intel/intel_extensions.c
index a50ab777e9b..d4713c98589 100644
--- a/src/mesa/drivers/dri/intel/intel_extensions.c
+++ b/src/mesa/drivers/dri/intel/intel_extensions.c
@@ -100,6 +100,10 @@ intelInitExtensions(struct gl_context *ctx)
        (intel->gen == 7 && intel->intelScreen->kernel_has_gen7_sol_reset))
       ctx->Extensions.EXT_transform_feedback = true;
 
+   if (intel->gen >= 6) {
+      ctx->Extensions.ARB_draw_buffers_blend = true;
+   }
+
    if (intel->gen >= 5)
       ctx->Extensions.EXT_timer_query = true;
 
diff --git a/src/mesa/drivers/dri/intel/intel_extensions_es.c b/src/mesa/drivers/dri/intel/intel_extensions_es.c
index 29eb8eab529..b42907c89bb 100644
--- a/src/mesa/drivers/dri/intel/intel_extensions_es.c
+++ b/src/mesa/drivers/dri/intel/intel_extensions_es.c
@@ -66,7 +66,6 @@ static const char *es1_extensions[] = {
    "GL_EXT_blend_func_separate",
    "GL_EXT_blend_subtract",
    "GL_OES_draw_texture",
-   "GL_ARB_framebuffer_object",
    "GL_EXT_framebuffer_object",
    "GL_ARB_point_sprite",
    "GL_EXT_stencil_wrap",
@@ -92,7 +91,6 @@ static const char *es2_extensions[] = {
    "GL_NV_blend_square",
 
    /* Optional GLES2 */
-   "GL_ARB_framebuffer_object",
    "GL_ARB_depth_texture",
    "GL_EXT_framebuffer_object",
 
diff --git a/src/mesa/drivers/dri/intel/intel_fbo.c b/src/mesa/drivers/dri/intel/intel_fbo.c
index 2d1a7985602..2f95ad06025 100644
--- a/src/mesa/drivers/dri/intel/intel_fbo.c
+++ b/src/mesa/drivers/dri/intel/intel_fbo.c
@@ -237,6 +237,9 @@ intel_alloc_renderbuffer_storage(struct gl_context * ctx, struct gl_renderbuffer
        _mesa_lookup_enum_by_nr(internalFormat),
        _mesa_get_format_name(rb->Format), width, height);
 
+   if (width == 0 || height == 0)
+      return true;
+
    irb->mt = intel_miptree_create_for_renderbuffer(intel, rb->Format,
 						   width, height);
    if (!irb->mt)
@@ -532,25 +535,14 @@ intel_renderbuffer_tile_offsets(struct intel_renderbuffer *irb,
 				uint32_t *tile_y)
 {
    struct intel_region *region = irb->mt->region;
-   int cpp = region->cpp;
-   uint32_t pitch = region->pitch * cpp;
-
-   if (region->tiling == I915_TILING_NONE) {
-      *tile_x = 0;
-      *tile_y = 0;
-      return irb->draw_x * cpp + irb->draw_y * pitch;
-   } else if (region->tiling == I915_TILING_X) {
-      *tile_x = irb->draw_x % (512 / cpp);
-      *tile_y = irb->draw_y % 8;
-      return ((irb->draw_y / 8) * (8 * pitch) +
-	      (irb->draw_x - *tile_x) / (512 / cpp) * 4096);
-   } else {
-      assert(region->tiling == I915_TILING_Y);
-      *tile_x = irb->draw_x % (128 / cpp);
-      *tile_y = irb->draw_y % 32;
-      return ((irb->draw_y / 32) * (32 * pitch) +
-	      (irb->draw_x - *tile_x) / (128 / cpp) * 4096);
-   }
+   uint32_t mask_x, mask_y;
+
+   intel_region_get_tile_masks(region, &mask_x, &mask_y);
+
+   *tile_x = irb->draw_x & mask_x;
+   *tile_y = irb->draw_y & mask_y;
+   return intel_region_get_aligned_offset(region, irb->draw_x & ~mask_x,
+                                          irb->draw_y & ~mask_y);
 }
 
 /**
diff --git a/src/mesa/drivers/dri/intel/intel_fbo.h b/src/mesa/drivers/dri/intel/intel_fbo.h
index 724f141535c..758b4a0ec32 100644
--- a/src/mesa/drivers/dri/intel/intel_fbo.h
+++ b/src/mesa/drivers/dri/intel/intel_fbo.h
@@ -34,6 +34,10 @@
 #include "intel_context.h"
 #include "intel_screen.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 struct intel_context;
 struct intel_mipmap_tree;
 struct intel_texture_image;
@@ -153,6 +157,11 @@ intel_flip_renderbuffers(struct gl_framebuffer *fb);
 void
 intel_renderbuffer_set_draw_offset(struct intel_renderbuffer *irb);
 
+void
+intel_renderbuffer_fine_offset_masks(struct intel_renderbuffer *irb,
+                                     uint32_t *fine_offset_mask_x,
+                                     uint32_t *fine_offset_mask_y);
+
 uint32_t
 intel_renderbuffer_tile_offsets(struct intel_renderbuffer *irb,
 				uint32_t *tile_x,
@@ -192,4 +201,8 @@ bool
 intel_renderbuffer_resolve_depth(struct intel_context *intel,
 				 struct intel_renderbuffer *irb);
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif /* INTEL_FBO_H */
diff --git a/src/mesa/drivers/dri/intel/intel_mipmap_tree.h b/src/mesa/drivers/dri/intel/intel_mipmap_tree.h
index 90828642019..0886c95f234 100644
--- a/src/mesa/drivers/dri/intel/intel_mipmap_tree.h
+++ b/src/mesa/drivers/dri/intel/intel_mipmap_tree.h
@@ -33,6 +33,10 @@
 #include "intel_regions.h"
 #include "intel_resolve_map.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /* A layer on top of the intel_regions code which adds:
  *
  * - Code to size and layout a region to hold a set of mipmaps.
@@ -413,4 +417,8 @@ intel_miptree_unmap(struct intel_context *intel,
 		    unsigned int level,
 		    unsigned int slice);
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif
diff --git a/src/mesa/drivers/dri/intel/intel_pixel_bitmap.c b/src/mesa/drivers/dri/intel/intel_pixel_bitmap.c
index bdfb9abd27d..6821b69f0fc 100644
--- a/src/mesa/drivers/dri/intel/intel_pixel_bitmap.c
+++ b/src/mesa/drivers/dri/intel/intel_pixel_bitmap.c
@@ -337,9 +337,5 @@ intelBitmap(struct gl_context * ctx,
                           unpack, pixels))
       return;
 
-   /* FIXME */
-   if (intel->gen == 6)
-       return _swrast_Bitmap(ctx, x, y, width, height, unpack, pixels);
-
    _mesa_meta_Bitmap(ctx, x, y, width, height, unpack, pixels);
 }
diff --git a/src/mesa/drivers/dri/intel/intel_pixel_read.c b/src/mesa/drivers/dri/intel/intel_pixel_read.c
index 34fed3d1f44..ab4e581c400 100644
--- a/src/mesa/drivers/dri/intel/intel_pixel_read.c
+++ b/src/mesa/drivers/dri/intel/intel_pixel_read.c
@@ -29,6 +29,7 @@
 #include "main/enums.h"
 #include "main/mtypes.h"
 #include "main/macros.h"
+#include "main/fbobject.h"
 #include "main/image.h"
 #include "main/bufferobj.h"
 #include "main/readpix.h"
@@ -116,7 +117,7 @@ do_blit_readpixels(struct gl_context * ctx,
       return false;
    }
    else {
-      if (ctx->ReadBuffer->Name == 0)
+      if (_mesa_is_winsys_fbo(ctx->ReadBuffer))
 	 rowLength = -rowLength;
    }
 
@@ -145,7 +146,7 @@ do_blit_readpixels(struct gl_context * ctx,
 				       all ? INTEL_WRITE_FULL :
 				       INTEL_WRITE_PART);
 
-   if (ctx->ReadBuffer->Name == 0)
+   if (_mesa_is_winsys_fbo(ctx->ReadBuffer))
       y = ctx->ReadBuffer->Height - (y + height);
 
    if (!intelEmitCopyBlit(intel,
diff --git a/src/mesa/drivers/dri/intel/intel_regions.c b/src/mesa/drivers/dri/intel/intel_regions.c
index abea2bd0f71..1ef1ac663c5 100644
--- a/src/mesa/drivers/dri/intel/intel_regions.c
+++ b/src/mesa/drivers/dri/intel/intel_regions.c
@@ -390,3 +390,59 @@ intel_region_copy(struct intel_context *intel,
 			    srcx, srcy, dstx, dsty, width, height,
 			    logicop);
 }
+
+/**
+ * This function computes masks that may be used to select the bits of the X
+ * and Y coordinates that indicate the offset within a tile.  If the region is
+ * untiled, the masks are set to 0.
+ */
+void
+intel_region_get_tile_masks(struct intel_region *region,
+                            uint32_t *mask_x, uint32_t *mask_y)
+{
+   int cpp = region->cpp;
+
+   switch (region->tiling) {
+   default:
+      assert(false);
+   case I915_TILING_NONE:
+      *mask_x = *mask_y = 0;
+      break;
+   case I915_TILING_X:
+      *mask_x = 512 / cpp - 1;
+      *mask_y = 7;
+      break;
+   case I915_TILING_Y:
+      *mask_x = 128 / cpp - 1;
+      *mask_y = 31;
+      break;
+   }
+}
+
+/**
+ * Compute the offset (in bytes) from the start of the region to the given x
+ * and y coordinate.  For tiled regions, caller must ensure that x and y are
+ * multiples of the tile size.
+ */
+uint32_t
+intel_region_get_aligned_offset(struct intel_region *region, uint32_t x,
+                                uint32_t y)
+{
+   int cpp = region->cpp;
+   uint32_t pitch = region->pitch * cpp;
+
+   switch (region->tiling) {
+   default:
+      assert(false);
+   case I915_TILING_NONE:
+      return y * pitch + x * cpp;
+   case I915_TILING_X:
+      assert((x % (512 / cpp)) == 0);
+      assert((y % 8) == 0);
+      return y * pitch + x / (512 / cpp) * 4096;
+   case I915_TILING_Y:
+      assert((x % (128 / cpp)) == 0);
+      assert((y % 32) == 0);
+      return y * pitch + x / (128 / cpp) * 4096;
+   }
+}
diff --git a/src/mesa/drivers/dri/intel/intel_regions.h b/src/mesa/drivers/dri/intel/intel_regions.h
index 4ea970ad6d1..2fb30eb48f1 100644
--- a/src/mesa/drivers/dri/intel/intel_regions.h
+++ b/src/mesa/drivers/dri/intel/intel_regions.h
@@ -41,6 +41,10 @@
 #include "main/mtypes.h"
 #include "intel_bufmgr.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 struct intel_context;
 struct intel_buffer_object;
 
@@ -129,13 +133,26 @@ void _mesa_copy_rect(GLubyte * dst,
                 const GLubyte * src,
                 GLuint src_pitch, GLuint src_x, GLuint src_y);
 
+void
+intel_region_get_tile_masks(struct intel_region *region,
+                            uint32_t *mask_x, uint32_t *mask_y);
+
+uint32_t
+intel_region_get_aligned_offset(struct intel_region *region, uint32_t x,
+                                uint32_t y);
+
 struct __DRIimageRec {
    struct intel_region *region;
    GLenum internal_format;
+   uint32_t usage;
    uint32_t dri_format;
    GLuint format;
    GLenum data_type;
    void *data;
 };
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif
diff --git a/src/mesa/drivers/dri/intel/intel_screen.c b/src/mesa/drivers/dri/intel/intel_screen.c
index e823792ff93..458178fe927 100644
--- a/src/mesa/drivers/dri/intel/intel_screen.c
+++ b/src/mesa/drivers/dri/intel/intel_screen.c
@@ -188,6 +188,8 @@ intel_create_image_from_name(__DRIscreen *screen,
     if (image == NULL)
 	return NULL;
 
+    image->dri_format = format;
+
     switch (format) {
     case __DRI_IMAGE_FORMAT_RGB565:
        image->format = MESA_FORMAT_RGB565;
@@ -209,6 +211,11 @@ intel_create_image_from_name(__DRIscreen *screen,
        image->internal_format = GL_RGBA;
        image->data_type = GL_UNSIGNED_BYTE;
        break;
+    case __DRI_IMAGE_FORMAT_XBGR8888:
+       image->format = MESA_FORMAT_RGBX8888_REV;
+       image->internal_format = GL_RGB;
+       image->data_type = GL_UNSIGNED_BYTE;
+       break;
     default:
        free(image);
        return NULL;
@@ -255,6 +262,21 @@ intel_create_image_from_renderbuffer(__DRIcontext *context,
    image->data = loaderPrivate;
    intel_region_reference(&image->region, irb->mt->region);
 
+   switch (image->format) {
+   case MESA_FORMAT_RGB565:
+      image->dri_format = __DRI_IMAGE_FORMAT_RGB565;
+      break;
+   case MESA_FORMAT_XRGB8888:
+      image->dri_format = __DRI_IMAGE_FORMAT_XRGB8888;
+      break;
+   case MESA_FORMAT_ARGB8888:
+      image->dri_format = __DRI_IMAGE_FORMAT_ARGB8888;
+      break;
+   case MESA_FORMAT_RGBA8888_REV:
+      image->dri_format = __DRI_IMAGE_FORMAT_ABGR8888;
+      break;
+   }
+
    return image;
 }
 
@@ -283,10 +305,16 @@ intel_create_image(__DRIscreen *screen,
       tiling = I915_TILING_NONE;
    }
 
+   /* We only support write for cursor drm images */
+   if ((use & __DRI_IMAGE_USE_WRITE) &&
+       use != (__DRI_IMAGE_USE_WRITE | __DRI_IMAGE_USE_CURSOR))
+      return NULL;
+
    image = CALLOC(sizeof *image);
    if (image == NULL)
       return NULL;
 
+   image->usage = use;
    image->dri_format = format;
 
    switch (format) {
@@ -310,6 +338,11 @@ intel_create_image(__DRIscreen *screen,
        image->internal_format = GL_RGBA;
        image->data_type = GL_UNSIGNED_BYTE;
        break;
+    case __DRI_IMAGE_FORMAT_XBGR8888:
+       image->format = MESA_FORMAT_RGBX8888_REV;
+       image->internal_format = GL_RGB;
+       image->data_type = GL_UNSIGNED_BYTE;
+       break;
    default:
       free(image);
       return NULL;
@@ -342,7 +375,8 @@ intel_query_image(__DRIimage *image, int attrib, int *value)
    case __DRI_IMAGE_ATTRIB_NAME:
       return intel_region_flink(image->region, (uint32_t *) value);
    case __DRI_IMAGE_ATTRIB_FORMAT:
-      return image->dri_format;
+      *value = image->dri_format;
+      return true;
    default:
       return false;
    }
@@ -364,6 +398,8 @@ intel_dup_image(__DRIimage *orig_image, void *loaderPrivate)
    }
 
    image->internal_format = orig_image->internal_format;
+   image->usage           = orig_image->usage;
+   image->dri_format      = orig_image->dri_format;
    image->format          = orig_image->format;
    image->data_type       = orig_image->data_type;
    image->data            = loaderPrivate;
@@ -379,18 +415,49 @@ intel_validate_usage(__DRIimage *image, unsigned int use)
 	 return GL_FALSE;
    }
 
+   /* We only support write for cursor drm images */
+   if ((use & __DRI_IMAGE_USE_WRITE) &&
+       use != (__DRI_IMAGE_USE_WRITE | __DRI_IMAGE_USE_CURSOR))
+      return GL_FALSE;
+
    return GL_TRUE;
 }
 
+/**
+ * Copy \p count bytes from \p buf to the start of the image's buffer object.
+ *
+ * \return 0 on success; -1 if the region is currently mapped, the image was
+ * not created with __DRI_IMAGE_USE_WRITE, \p count exceeds the size of the
+ * buffer object, or the buffer object cannot be mapped.
+ */
+static int
+intel_image_write(__DRIimage *image, const void *buf, size_t count)
+{
+   if (image->region->map_refcount)
+      return -1;
+   if (!(image->usage & __DRI_IMAGE_USE_WRITE))
+      return -1;
+   if (count > image->region->bo->size)
+      return -1;
+
+   if (drm_intel_bo_map(image->region->bo, true) != 0)
+      return -1;
+   memcpy(image->region->bo->virtual, buf, count);
+   drm_intel_bo_unmap(image->region->bo);
+
+   return 0;
+}
+
 static struct __DRIimageExtensionRec intelImageExtension = {
-    { __DRI_IMAGE, 3 },
+    { __DRI_IMAGE, 4 },
     intel_create_image_from_name,
     intel_create_image_from_renderbuffer,
     intel_destroy_image,
     intel_create_image,
     intel_query_image,
     intel_dup_image,
-    intel_validate_usage
+    intel_validate_usage,
+    intel_image_write
 };
 
 static const __DRIextension *intelScreenExtensions[] = {
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_context.c b/src/mesa/drivers/dri/nouveau/nouveau_context.c
index 48457675fc8..2625b76d14f 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_context.c
+++ b/src/mesa/drivers/dri/nouveau/nouveau_context.c
@@ -33,6 +33,7 @@
 
 #include "main/dd.h"
 #include "main/framebuffer.h"
+#include "main/fbobject.h"
 #include "main/light.h"
 #include "main/state.h"
 #include "main/version.h"
@@ -396,11 +397,11 @@ nouveau_validate_framebuffer(struct gl_context *ctx)
 	__DRIdrawable *dri_draw = dri_ctx->driDrawablePriv;
 	__DRIdrawable *dri_read = dri_ctx->driReadablePriv;
 
-	if (ctx->DrawBuffer->Name == 0)
+	if (_mesa_is_winsys_fbo(ctx->DrawBuffer))
 		validate_framebuffer(dri_ctx, dri_draw,
 				     &dri_ctx->dri2.draw_stamp);
 
-	if (ctx->ReadBuffer->Name == 0)
+	if (_mesa_is_winsys_fbo(ctx->ReadBuffer))
 		validate_framebuffer(dri_ctx, dri_read,
 				     &dri_ctx->dri2.read_stamp);
 
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_driver.c b/src/mesa/drivers/dri/nouveau/nouveau_driver.c
index 7222f68b439..69e5cac426a 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_driver.c
+++ b/src/mesa/drivers/dri/nouveau/nouveau_driver.c
@@ -25,6 +25,8 @@
  */
 
 #include "main/mfeatures.h"
+#include "main/mtypes.h"
+#include "main/fbobject.h"
 
 #include "nouveau_driver.h"
 #include "nouveau_context.h"
@@ -61,7 +63,7 @@ nouveau_flush(struct gl_context *ctx)
 
 	PUSH_KICK(push);
 
-	if (ctx->DrawBuffer->Name == 0 &&
+	if (_mesa_is_winsys_fbo(ctx->DrawBuffer) &&
 	    ctx->DrawBuffer->_ColorDrawBufferIndexes[0] == BUFFER_FRONT_LEFT) {
 		__DRIscreen *screen = nctx->screen->dri_screen;
 		__DRIdri2LoaderExtension *dri2 = screen->dri2.loader;
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_screen.c b/src/mesa/drivers/dri/nouveau/nouveau_screen.c
index 7e51b94a2ff..2a15c08c679 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_screen.c
+++ b/src/mesa/drivers/dri/nouveau/nouveau_screen.c
@@ -33,6 +33,7 @@
 #include "nv20_driver.h"
 
 #include "main/framebuffer.h"
+#include "main/fbobject.h"
 #include "main/renderbuffer.h"
 #include "swrast/s_renderbuffer.h"
 
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_surface.c b/src/mesa/drivers/dri/nouveau/nouveau_surface.c
index f2521149088..ffac309b9ef 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_surface.c
+++ b/src/mesa/drivers/dri/nouveau/nouveau_surface.c
@@ -28,6 +28,8 @@
 #include "nouveau_context.h"
 #include "nouveau_util.h"
 
+#include "main/formats.h"
+
 void
 nouveau_surface_alloc(struct gl_context *ctx, struct nouveau_surface *s,
 		      enum nouveau_surface_layout layout,
@@ -45,7 +47,7 @@ nouveau_surface_alloc(struct gl_context *ctx, struct nouveau_surface *s,
 		.width = width,
 		.height = height,
 		.cpp = cpp,
-		.pitch = width * cpp,
+		.pitch = _mesa_format_row_stride(format, width),
 	};
 
 	if (layout == TILED) {
@@ -64,7 +66,8 @@ nouveau_surface_alloc(struct gl_context *ctx, struct nouveau_surface *s,
 		s->pitch = align(s->pitch, 64);
 	}
 
-	ret = nouveau_bo_new(context_dev(ctx), flags, 0, s->pitch * height,
+	ret = nouveau_bo_new(context_dev(ctx), flags, 0,
+			     get_format_blocksy(format, height) * s->pitch,
 			     &config, &s->bo);
 	assert(!ret);
 }
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_texture.c b/src/mesa/drivers/dri/nouveau/nouveau_texture.c
index eadbeb45d7a..a2e96aa1684 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_texture.c
+++ b/src/mesa/drivers/dri/nouveau/nouveau_texture.c
@@ -91,6 +91,7 @@ nouveau_teximage_map(struct gl_context *ctx, struct gl_texture_image *ti,
 	if (s->bo) {
 		if (!(access & GL_MAP_READ_BIT) &&
 		    nouveau_pushbuf_refd(context_push(ctx), s->bo)) {
+			unsigned size;
 			/*
 			 * Heuristic: use a bounce buffer to pipeline
 			 * teximage transfers.
@@ -104,7 +105,8 @@ nouveau_teximage_map(struct gl_context *ctx, struct gl_texture_image *ti,
 			nti->transfer.x = x;
 			nti->transfer.y = y;
 
-			nti->base.Map = nouveau_get_scratch(ctx, st->pitch * h,
+			size = get_format_blocksy(st->format, h) * st->pitch;
+			nti->base.Map = nouveau_get_scratch(ctx, size,
 						       &st->bo, &st->offset);
 
 		} else {
@@ -120,7 +122,10 @@ nouveau_teximage_map(struct gl_context *ctx, struct gl_texture_image *ti,
 				assert(!ret);
 			}
 
-			nti->base.Map = s->bo->map + y * s->pitch + x * s->cpp;
+			nti->base.Map = s->bo->map +
+				get_format_blocksy(s->format, y) * s->pitch +
+				get_format_blocksx(s->format, x) * s->cpp;
+
 		}
 	}
 }
@@ -163,6 +168,7 @@ nouveau_map_texture_image(struct gl_context *ctx,
 	if (s->bo) {
 		if (!(mode & GL_MAP_READ_BIT) &&
 		    nouveau_pushbuf_refd(context_push(ctx), s->bo)) {
+			unsigned size;
 			/*
 			 * Heuristic: use a bounce buffer to pipeline
 			 * teximage transfers.
@@ -176,8 +182,9 @@ nouveau_map_texture_image(struct gl_context *ctx,
 			nti->transfer.x = x;
 			nti->transfer.y = y;
 
-			*map = nouveau_get_scratch(ctx, st->pitch * h,
-						   &st->bo, &st->offset);
+			size = get_format_blocksy(st->format, h) * st->pitch;
+			*map = nouveau_get_scratch(ctx, size,
+					  &st->bo, &st->offset);
 			*stride = st->pitch;
 		} else {
 			int ret, flags = 0;
@@ -192,11 +199,15 @@ nouveau_map_texture_image(struct gl_context *ctx,
 				assert(!ret);
 			}
 
-			*map = s->bo->map + y * s->pitch + x * s->cpp;
+			*map = s->bo->map +
+				get_format_blocksy(s->format, y) * s->pitch +
+				get_format_blocksx(s->format, x) * s->cpp;
 			*stride = s->pitch;
 		}
 	} else {
-		*map = nti->base.Map + y * s->pitch + x * s->cpp;
+		*map = nti->base.Map +
+			get_format_blocksy(s->format, y) * s->pitch +
+			get_format_blocksx(s->format, x) * s->cpp;
 		*stride = s->pitch;
 	}
 }
@@ -286,6 +297,22 @@ nouveau_choose_tex_format(struct gl_context *ctx, GLint internalFormat,
 	case GL_INTENSITY8:
 		return MESA_FORMAT_I8;
 
+	case GL_RGB_S3TC:
+	case GL_RGB4_S3TC:
+	case GL_COMPRESSED_RGB_S3TC_DXT1_EXT:
+		return MESA_FORMAT_RGB_DXT1;
+
+	case GL_COMPRESSED_RGBA_S3TC_DXT1_EXT:
+		return MESA_FORMAT_RGBA_DXT1;
+
+	case GL_RGBA_S3TC:
+	case GL_RGBA4_S3TC:
+	case GL_COMPRESSED_RGBA_S3TC_DXT3_EXT:
+		return MESA_FORMAT_RGBA_DXT3;
+
+	case GL_COMPRESSED_RGBA_S3TC_DXT5_EXT:
+		return MESA_FORMAT_RGBA_DXT5;
+
 	default:
 		assert(0);
 	}
@@ -353,7 +380,9 @@ relayout_texture(struct gl_context *ctx, struct gl_texture_object *t)
 		struct nouveau_surface *ss = to_nouveau_texture(t)->surfaces;
 		struct nouveau_surface *s = &to_nouveau_teximage(base)->surface;
 		int i, ret, last = get_last_level(t);
-		unsigned size, offset = 0,
+		enum nouveau_surface_layout layout =
+			(_mesa_is_format_compressed(s->format) ? LINEAR : SWIZZLED);
+		unsigned size, pitch, offset = 0,
 			width = s->width,
 			height = s->height;
 
@@ -363,7 +392,8 @@ relayout_texture(struct gl_context *ctx, struct gl_texture_object *t)
 
 		/* Relayout the mipmap tree. */
 		for (i = t->BaseLevel; i <= last; i++) {
-			size = width * height * s->cpp;
+			pitch = _mesa_format_row_stride(s->format, width);
+			size = get_format_blocksy(s->format, height) * pitch;
 
 			/* Images larger than 16B have to be aligned. */
 			if (size > 16)
@@ -371,12 +401,12 @@ relayout_texture(struct gl_context *ctx, struct gl_texture_object *t)
 
 			ss[i] = (struct nouveau_surface) {
 				.offset = offset,
-				.layout = SWIZZLED,
+				.layout = layout,
 				.format = s->format,
 				.width = width,
 				.height = height,
 				.cpp = s->cpp,
-				.pitch = width * s->cpp,
+				.pitch = pitch,
 			};
 
 			offset += size;
@@ -453,8 +483,10 @@ nouveau_teximage(struct gl_context *ctx, GLint dims,
 		 struct gl_texture_image *ti,
 		 GLint internalFormat,
 		 GLint width, GLint height, GLint depth, GLint border,
+		 GLsizei imageSize,
 		 GLenum format, GLenum type, const GLvoid *pixels,
-		 const struct gl_pixelstore_attrib *packing)
+		 const struct gl_pixelstore_attrib *packing,
+		 GLboolean compressed)
 {
 	struct gl_texture_object *t = ti->TexObject;
 	const GLuint level = ti->Level;
@@ -467,9 +499,15 @@ nouveau_teximage(struct gl_context *ctx, GLint dims,
 			      ti->TexFormat, width, height);
 	nti->base.RowStride = s->pitch / s->cpp;
 
-	pixels = _mesa_validate_pbo_teximage(ctx, dims, width, height, depth,
-					     format, type, pixels, packing,
-					     "glTexImage");
+	if (compressed)
+		pixels = _mesa_validate_pbo_compressed_teximage(ctx,
+			imageSize,
+			pixels, packing, "glCompressedTexImage");
+	else
+		pixels = _mesa_validate_pbo_teximage(ctx,
+			dims, width, height, depth, format, type,
+			pixels, packing, "glTexImage");
+
 	if (pixels) {
 		/* Store the pixel data. */
 		nouveau_teximage_map(ctx, ti, GL_MAP_WRITE_BIT,
@@ -511,8 +549,8 @@ nouveau_teximage_1d(struct gl_context *ctx,
 		    const struct gl_pixelstore_attrib *packing)
 {
 	nouveau_teximage(ctx, 1, ti, internalFormat,
-			 width, 1, 1, border, format, type, pixels,
-			 packing);
+			 width, 1, 1, border, 0, format, type, pixels,
+			 packing, GL_FALSE);
 }
 
 static void
@@ -524,8 +562,8 @@ nouveau_teximage_2d(struct gl_context *ctx,
 		    const struct gl_pixelstore_attrib *packing)
 {
 	nouveau_teximage(ctx, 2, ti, internalFormat,
-			 width, height, 1, border, format, type, pixels,
-			 packing);
+			 width, height, 1, border, 0, format, type, pixels,
+			 packing, GL_FALSE);
 }
 
 static void
@@ -537,8 +575,20 @@ nouveau_teximage_3d(struct gl_context *ctx,
 		    const struct gl_pixelstore_attrib *packing)
 {
 	nouveau_teximage(ctx, 3, ti, internalFormat,
-			 width, height, depth, border, format, type, pixels,
-			 packing);
+			 width, height, depth, border, 0, format, type, pixels,
+			 packing, GL_FALSE);
+}
+
+/* CompressedTexImage2D hook: forwards to nouveau_teximage() with dims 2,
+ * depth 1, zero format/type, ctx->Unpack packing and compressed GL_TRUE. */
+static void
+nouveau_compressed_teximage_2d(struct gl_context *ctx,
+		    struct gl_texture_image *ti, GLint internalFormat,
+		    GLint width, GLint height, GLint border,
+		    GLsizei imageSize, const GLvoid *data)
+{
+	nouveau_teximage(ctx, 2, ti, internalFormat, width, height, 1, border,
+			 imageSize, 0, 0, data, &ctx->Unpack, GL_TRUE);
 }
 
 static void
@@ -546,21 +596,29 @@ nouveau_texsubimage(struct gl_context *ctx, GLint dims,
 		    struct gl_texture_image *ti,
 		    GLint xoffset, GLint yoffset, GLint zoffset,
 		    GLint width, GLint height, GLint depth,
+		    GLsizei imageSize,
 		    GLenum format, GLenum type, const void *pixels,
-		    const struct gl_pixelstore_attrib *packing)
+		    const struct gl_pixelstore_attrib *packing,
+		    GLboolean compressed)
 {
 	struct nouveau_surface *s = &to_nouveau_teximage(ti)->surface;
 	struct nouveau_teximage *nti = to_nouveau_teximage(ti);
 	int ret;
 
-	pixels = _mesa_validate_pbo_teximage(ctx, dims, width, height, depth,
-					     format, type, pixels, packing,
-					     "glTexSubImage");
+	if (compressed)
+		pixels = _mesa_validate_pbo_compressed_teximage(ctx,
+				imageSize,
+				pixels, packing, "glCompressedTexSubImage");
+	else
+		pixels = _mesa_validate_pbo_teximage(ctx,
+				dims, width, height, depth, format, type,
+				pixels, packing, "glTexSubImage");
+
 	if (pixels) {
 		nouveau_teximage_map(ctx, ti, GL_MAP_WRITE_BIT,
 				     xoffset, yoffset, width, height);
 
-		ret = _mesa_texstore(ctx, 3, ti->_BaseFormat, ti->TexFormat,
+		ret = _mesa_texstore(ctx, dims, ti->_BaseFormat, ti->TexFormat,
                                      s->pitch,
 				     &nti->base.Map,
                                      width, height, depth,
@@ -586,8 +644,8 @@ nouveau_texsubimage_3d(struct gl_context *ctx,
 		       const struct gl_pixelstore_attrib *packing)
 {
 	nouveau_texsubimage(ctx, 3, ti, xoffset, yoffset, zoffset,
-			    width, height, depth, format, type, pixels,
-			    packing);
+			    width, height, depth, 0, format, type, pixels,
+			    packing, GL_FALSE);
 }
 
 static void
@@ -599,8 +657,8 @@ nouveau_texsubimage_2d(struct gl_context *ctx,
 		       const struct gl_pixelstore_attrib *packing)
 {
 	nouveau_texsubimage(ctx, 2, ti, xoffset, yoffset, 0,
-			    width, height, 1, format, type, pixels,
-			    packing);
+			    width, height, 1, 0, format, type, pixels,
+			    packing, GL_FALSE);
 }
 
 static void
@@ -611,8 +669,21 @@ nouveau_texsubimage_1d(struct gl_context *ctx,
 		       const struct gl_pixelstore_attrib *packing)
 {
 	nouveau_texsubimage(ctx, 1, ti, xoffset, 0, 0,
-			    width, 1, 1, format, type, pixels,
-			    packing);
+			    width, 1, 1, 0, format, type, pixels,
+			    packing, GL_FALSE);
+}
+
+/* CompressedTexSubImage2D hook: forwards to nouveau_texsubimage() with dims 2,
+ * zoffset 0, depth 1, type 0, ctx->Unpack packing and compressed GL_TRUE. */
+static void
+nouveau_compressed_texsubimage_2d(struct gl_context *ctx,
+		       struct gl_texture_image *ti,
+		       GLint xoffset, GLint yoffset,
+		       GLsizei width, GLint height,
+		       GLenum format, GLint imageSize, const void *data)
+{
+	nouveau_texsubimage(ctx, 2, ti, xoffset, yoffset, 0, width, height, 1,
+			    imageSize, format, 0, data, &ctx->Unpack, GL_TRUE);
 }
 
 static void
@@ -691,6 +762,8 @@ nouveau_texture_functions_init(struct dd_function_table *functions)
 	functions->TexSubImage1D = nouveau_texsubimage_1d;
 	functions->TexSubImage2D = nouveau_texsubimage_2d;
 	functions->TexSubImage3D = nouveau_texsubimage_3d;
+	functions->CompressedTexImage2D = nouveau_compressed_teximage_2d;
+	functions->CompressedTexSubImage2D = nouveau_compressed_texsubimage_2d;
 	functions->BindTexture = nouveau_bind_texture;
 	functions->MapTextureImage = nouveau_map_texture_image;
 	functions->UnmapTextureImage = nouveau_unmap_texture_image;
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_util.h b/src/mesa/drivers/dri/nouveau/nouveau_util.h
index d4cc5c4fb9c..17d6965ee2c 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_util.h
+++ b/src/mesa/drivers/dri/nouveau/nouveau_util.h
@@ -207,4 +207,24 @@ get_texgen_coeff(struct gl_texgen *c)
 		return NULL;
 }
 
+/* Number of blocks of the format's block width needed to cover x, rounded up. */
+static inline unsigned
+get_format_blocksx(gl_format format, unsigned x)
+{
+	GLuint bw, bh;
+
+	_mesa_get_format_block_size(format, &bw, &bh);
+	return (x + bw - 1) / bw;
+}
+
+/* Number of blocks of the format's block height needed to cover y, rounded up. */
+static inline unsigned
+get_format_blocksy(gl_format format, unsigned y)
+{
+	GLuint bw, bh;
+
+	_mesa_get_format_block_size(format, &bw, &bh);
+	return (y + bh - 1) / bh;
+}
+
 #endif
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_vbo_t.c b/src/mesa/drivers/dri/nouveau/nouveau_vbo_t.c
index 62fee2e1e27..a9a56e78f2d 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_vbo_t.c
+++ b/src/mesa/drivers/dri/nouveau/nouveau_vbo_t.c
@@ -216,7 +216,7 @@ get_max_client_stride(struct gl_context *ctx, const struct gl_client_array **arr
 }
 
 static void
-TAG(vbo_render_prims)(struct gl_context *ctx, const struct gl_client_array **arrays,
+TAG(vbo_render_prims)(struct gl_context *ctx,
 		      const struct _mesa_prim *prims, GLuint nr_prims,
 		      const struct _mesa_index_buffer *ib,
 		      GLboolean index_bounds_valid,
@@ -448,7 +448,6 @@ vbo_draw_imm(struct gl_context *ctx, const struct gl_client_array **arrays,
 
 static void
 TAG(vbo_render_prims)(struct gl_context *ctx,
-		      const struct gl_client_array **arrays,
 		      const struct _mesa_prim *prims, GLuint nr_prims,
 		      const struct _mesa_index_buffer *ib,
 		      GLboolean index_bounds_valid,
@@ -456,6 +455,7 @@ TAG(vbo_render_prims)(struct gl_context *ctx,
 		      struct gl_transform_feedback_object *tfb_vertcount)
 {
 	struct nouveau_render_state *render = to_render_state(ctx);
+	const struct gl_client_array **arrays = ctx->Array._DrawArrays;
 
 	if (!index_bounds_valid)
 		vbo_get_minmax_indices(ctx, prims, ib, &min_index, &max_index,
@@ -484,7 +484,6 @@ TAG(vbo_render_prims)(struct gl_context *ctx,
 
 static void
 TAG(vbo_check_render_prims)(struct gl_context *ctx,
-			    const struct gl_client_array **arrays,
 			    const struct _mesa_prim *prims, GLuint nr_prims,
 			    const struct _mesa_index_buffer *ib,
 			    GLboolean index_bounds_valid,
@@ -496,12 +495,12 @@ TAG(vbo_check_render_prims)(struct gl_context *ctx,
 	nouveau_validate_framebuffer(ctx);
 
 	if (nctx->fallback == HWTNL)
-		TAG(vbo_render_prims)(ctx, arrays, prims, nr_prims, ib,
+		TAG(vbo_render_prims)(ctx, prims, nr_prims, ib,
 				      index_bounds_valid, min_index, max_index,
 				      tfb_vertcount);
 
 	if (nctx->fallback == SWTNL)
-		_tnl_vbo_draw_prims(ctx, arrays, prims, nr_prims, ib,
+		_tnl_vbo_draw_prims(ctx, prims, nr_prims, ib,
 				    index_bounds_valid, min_index, max_index,
 				    tfb_vertcount);
 }
diff --git a/src/mesa/drivers/dri/nouveau/nv04_surface.c b/src/mesa/drivers/dri/nouveau/nv04_surface.c
index 522c94819c0..103453f1b9a 100644
--- a/src/mesa/drivers/dri/nouveau/nv04_surface.c
+++ b/src/mesa/drivers/dri/nouveau/nv04_surface.c
@@ -393,6 +393,15 @@ nv04_surface_copy(struct gl_context *ctx,
 		  int dx, int dy, int sx, int sy,
 		  int w, int h)
 {
+	if (_mesa_is_format_compressed(src->format)) {
+		sx = get_format_blocksx(src->format, sx);
+		sy = get_format_blocksy(src->format, sy);
+		dx = get_format_blocksx(dst->format, dx);
+		dy = get_format_blocksy(dst->format, dy);
+		w = get_format_blocksx(src->format, w);
+		h = get_format_blocksy(src->format, h);
+	}
+
 	/* Linear texture copy. */
 	if ((src->layout == LINEAR && dst->layout == LINEAR) ||
 	    dst->width <= 2 || dst->height <= 1) {
diff --git a/src/mesa/drivers/dri/nouveau/nv10_context.c b/src/mesa/drivers/dri/nouveau/nv10_context.c
index 90c1545ce37..a2448caaf81 100644
--- a/src/mesa/drivers/dri/nouveau/nv10_context.c
+++ b/src/mesa/drivers/dri/nouveau/nv10_context.c
@@ -449,6 +449,10 @@ nv10_context_create(struct nouveau_screen *screen, const struct gl_config *visua
 	ctx->Extensions.ARB_texture_env_dot3 = true;
 	ctx->Extensions.NV_fog_distance = true;
 	ctx->Extensions.NV_texture_rectangle = true;
+	if (ctx->Mesa_DXTn) {
+		ctx->Extensions.EXT_texture_compression_s3tc = true;
+		ctx->Extensions.S3_s3tc = true;
+	}
 
 	/* GL constants. */
 	ctx->Const.MaxTextureLevels = 12;
diff --git a/src/mesa/drivers/dri/nouveau/nv10_state_tex.c b/src/mesa/drivers/dri/nouveau/nv10_state_tex.c
index 0254341151b..b467bb33142 100644
--- a/src/mesa/drivers/dri/nouveau/nv10_state_tex.c
+++ b/src/mesa/drivers/dri/nouveau/nv10_state_tex.c
@@ -111,6 +111,16 @@ get_tex_format_pot(struct gl_texture_image *ti)
 	case MESA_FORMAT_L8:
 		return NV10_3D_TEX_FORMAT_FORMAT_L8;
 
+	case MESA_FORMAT_RGB_DXT1:
+	case MESA_FORMAT_RGBA_DXT1:
+		return NV10_3D_TEX_FORMAT_FORMAT_DXT1;
+
+	case MESA_FORMAT_RGBA_DXT3:
+		return NV10_3D_TEX_FORMAT_FORMAT_DXT3;
+
+	case MESA_FORMAT_RGBA_DXT5:
+		return NV10_3D_TEX_FORMAT_FORMAT_DXT5;
+
 	default:
 		assert(0);
 	}
diff --git a/src/mesa/drivers/dri/nouveau/nv20_context.c b/src/mesa/drivers/dri/nouveau/nv20_context.c
index c91171791cc..5a36c8766d7 100644
--- a/src/mesa/drivers/dri/nouveau/nv20_context.c
+++ b/src/mesa/drivers/dri/nouveau/nv20_context.c
@@ -460,6 +460,10 @@ nv20_context_create(struct nouveau_screen *screen, const struct gl_config *visua
 	ctx->Extensions.ARB_texture_env_dot3 = true;
 	ctx->Extensions.NV_fog_distance = true;
 	ctx->Extensions.NV_texture_rectangle = true;
+	if (ctx->Mesa_DXTn) {
+		ctx->Extensions.EXT_texture_compression_s3tc = true;
+		ctx->Extensions.S3_s3tc = true;
+	}
 
 	/* GL constants. */
 	ctx->Const.MaxTextureCoordUnits = NV20_TEXTURE_UNITS;
diff --git a/src/mesa/drivers/dri/nouveau/nv20_state_tex.c b/src/mesa/drivers/dri/nouveau/nv20_state_tex.c
index 799510daa69..d8bfdf2e58f 100644
--- a/src/mesa/drivers/dri/nouveau/nv20_state_tex.c
+++ b/src/mesa/drivers/dri/nouveau/nv20_state_tex.c
@@ -108,6 +108,16 @@ get_tex_format_pot(struct gl_texture_image *ti)
 	case MESA_FORMAT_L8:
 		return NV20_3D_TEX_FORMAT_FORMAT_L8;
 
+	case MESA_FORMAT_RGB_DXT1:
+	case MESA_FORMAT_RGBA_DXT1:
+		return NV20_3D_TEX_FORMAT_FORMAT_DXT1;
+
+	case MESA_FORMAT_RGBA_DXT3:
+		return NV20_3D_TEX_FORMAT_FORMAT_DXT3;
+
+	case MESA_FORMAT_RGBA_DXT5:
+		return NV20_3D_TEX_FORMAT_FORMAT_DXT5;
+
 	default:
 		assert(0);
 	}
diff --git a/src/mesa/drivers/dri/r200/r200_state.c b/src/mesa/drivers/dri/r200/r200_state.c
index 3131007517b..0f7b564024e 100644
--- a/src/mesa/drivers/dri/r200/r200_state.c
+++ b/src/mesa/drivers/dri/r200/r200_state.c
@@ -40,6 +40,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/colormac.h"
 #include "main/light.h"
 #include "main/framebuffer.h"
+#include "main/fbobject.h"
 
 #include "swrast/swrast.h"
 #include "vbo/vbo.h"
@@ -536,7 +537,7 @@ static void r200FrontFace( struct gl_context *ctx, GLenum mode )
    rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] &= ~R200_CULL_FRONT_IS_CCW;
 
    /* Winding is inverted when rendering to FBO */
-   if (ctx->DrawBuffer && ctx->DrawBuffer->Name)
+   if (ctx->DrawBuffer && _mesa_is_user_fbo(ctx->DrawBuffer))
       mode = (mode == GL_CW) ? GL_CCW : GL_CW;
 
    switch ( mode ) {
@@ -1547,7 +1548,7 @@ void r200UpdateWindow( struct gl_context *ctx )
    GLfloat xoffset = 0;
    GLfloat yoffset = dPriv ? (GLfloat) dPriv->h : 0;
    const GLfloat *v = ctx->Viewport._WindowMap.m;
-   const GLboolean render_to_fbo = (ctx->DrawBuffer ? (ctx->DrawBuffer->Name != 0) : 0);
+   const GLboolean render_to_fbo = (ctx->DrawBuffer ? _mesa_is_user_fbo(ctx->DrawBuffer) : 0);
    const GLfloat depthScale = 1.0F / ctx->DrawBuffer->_DepthMaxF;
    GLfloat y_scale, y_bias;
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_common.c b/src/mesa/drivers/dri/radeon/radeon_common.c
index b64ff8160d0..a8dfae01923 100644
--- a/src/mesa/drivers/dri/radeon/radeon_common.c
+++ b/src/mesa/drivers/dri/radeon/radeon_common.c
@@ -46,6 +46,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/imports.h"
 #include "main/context.h"
 #include "main/enums.h"
+#include "main/fbobject.h"
 #include "main/framebuffer.h"
 #include "main/renderbuffer.h"
 #include "drivers/common/meta.h"
@@ -168,7 +169,7 @@ void radeonUpdateScissor( struct gl_context *ctx )
 	max_x = ctx->DrawBuffer->Width - 1;
 	max_y = ctx->DrawBuffer->Height - 1;
 
-	if ( !ctx->DrawBuffer->Name ) {
+	if (_mesa_is_winsys_fbo(ctx->DrawBuffer)) {
 		x1 = x;
 		y1 = ctx->DrawBuffer->Height - (y + h);
 		x2 = x + w - 1;
@@ -407,7 +408,7 @@ void radeonDrawBuffer( struct gl_context *ctx, GLenum mode )
 		fprintf(stderr, "%s %s\n", __FUNCTION__,
 			_mesa_lookup_enum_by_nr( mode ));
 
-	if (ctx->DrawBuffer->Name == 0) {
+	if (_mesa_is_winsys_fbo(ctx->DrawBuffer)) {
 		radeonContextPtr radeon = RADEON_CONTEXT(ctx);
 
 		const GLboolean was_front_buffer_rendering =
@@ -430,7 +431,7 @@ void radeonDrawBuffer( struct gl_context *ctx, GLenum mode )
 
 void radeonReadBuffer( struct gl_context *ctx, GLenum mode )
 {
-	if ((ctx->DrawBuffer != NULL) && (ctx->DrawBuffer->Name == 0)) {
+	if (ctx->DrawBuffer && _mesa_is_winsys_fbo(ctx->DrawBuffer)) {
 		struct radeon_context *const rmesa = RADEON_CONTEXT(ctx);
 		const GLboolean was_front_buffer_reading = rmesa->is_front_buffer_reading;
 		rmesa->is_front_buffer_reading = (mode == GL_FRONT_LEFT)
@@ -465,7 +466,7 @@ void radeon_viewport(struct gl_context *ctx, GLint x, GLint y, GLsizei width, GL
 	void (*old_viewport)(struct gl_context *ctx, GLint x, GLint y,
 			     GLsizei w, GLsizei h);
 
-	if (ctx->DrawBuffer->Name == 0) {
+	if (_mesa_is_winsys_fbo(ctx->DrawBuffer)) {
 		if (radeon->is_front_buffer_rendering) {
 			ctx->Driver.Flush(ctx);
 		}
@@ -656,7 +657,7 @@ void radeonFlush(struct gl_context *ctx)
 		rcommonFlushCmdBuf(radeon, __FUNCTION__);
 
 flush_front:
-	if ((ctx->DrawBuffer->Name == 0) && radeon->front_buffer_dirty) {
+	if (_mesa_is_winsys_fbo(ctx->DrawBuffer) && radeon->front_buffer_dirty) {
 		__DRIscreen *const screen = radeon->radeonScreen->driScreen;
 
 		if (screen->dri2.loader && (screen->dri2.loader->base.version >= 2)
diff --git a/src/mesa/drivers/dri/radeon/radeon_pixel_read.c b/src/mesa/drivers/dri/radeon/radeon_pixel_read.c
index 3a14cc69af4..db5e01da49b 100644
--- a/src/mesa/drivers/dri/radeon/radeon_pixel_read.c
+++ b/src/mesa/drivers/dri/radeon/radeon_pixel_read.c
@@ -28,6 +28,7 @@
 #include "stdint.h"
 #include "main/bufferobj.h"
 #include "main/enums.h"
+#include "main/fbobject.h"
 #include "main/image.h"
 #include "main/readpix.h"
 #include "main/state.h"
@@ -148,7 +149,7 @@ do_blit_readpixels(struct gl_context * ctx,
     }
 
     /* Disable source Y flipping for FBOs */
-    flip_y = (ctx->ReadBuffer->Name == 0);
+    flip_y = _mesa_is_winsys_fbo(ctx->ReadBuffer);
     if (pack->Invert) {
         y = rrb->base.Base.Height - height - y;
         flip_y = !flip_y;
diff --git a/src/mesa/drivers/dri/radeon/radeon_state.c b/src/mesa/drivers/dri/radeon/radeon_state.c
index a3b4e54edc4..6e2bb5bce62 100644
--- a/src/mesa/drivers/dri/radeon/radeon_state.c
+++ b/src/mesa/drivers/dri/radeon/radeon_state.c
@@ -39,6 +39,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/light.h"
 #include "main/context.h"
 #include "main/framebuffer.h"
+#include "main/fbobject.h"
 #include "main/simple_list.h"
 #include "main/state.h"
 
@@ -444,7 +445,7 @@ static void radeonFrontFace( struct gl_context *ctx, GLenum mode )
    rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] &= ~RADEON_CULL_FRONT_IS_CCW;
 
    /* Winding is inverted when rendering to FBO */
-   if (ctx->DrawBuffer && ctx->DrawBuffer->Name)
+   if (ctx->DrawBuffer && _mesa_is_user_fbo(ctx->DrawBuffer))
       mode = (mode == GL_CW) ? GL_CCW : GL_CW;
 
    switch ( mode ) {
@@ -1354,7 +1355,7 @@ void radeonUpdateWindow( struct gl_context *ctx )
    GLfloat xoffset = 0.0;
    GLfloat yoffset = dPriv ? (GLfloat) dPriv->h : 0;
    const GLfloat *v = ctx->Viewport._WindowMap.m;
-   const GLboolean render_to_fbo = (ctx->DrawBuffer ? (ctx->DrawBuffer->Name != 0) : 0);
+   const GLboolean render_to_fbo = (ctx->DrawBuffer ? _mesa_is_user_fbo(ctx->DrawBuffer) : 0);
    const GLfloat depthScale = 1.0F / ctx->DrawBuffer->_DepthMaxF;
    GLfloat y_scale, y_bias;
 
diff --git a/src/mesa/drivers/x11/xm_dd.c b/src/mesa/drivers/x11/xm_dd.c
index 7748298d49f..12004ba71df 100644
--- a/src/mesa/drivers/x11/xm_dd.c
+++ b/src/mesa/drivers/x11/xm_dd.c
@@ -32,6 +32,7 @@
 #include "main/bufferobj.h"
 #include "main/context.h"
 #include "main/colormac.h"
+#include "main/fbobject.h"
 #include "main/macros.h"
 #include "main/image.h"
 #include "main/imports.h"
@@ -69,7 +70,7 @@ color_mask(struct gl_context *ctx,
    const int xclass = xmesa->xm_visual->visualType;
    (void) amask;
 
-   if (ctx->DrawBuffer->Name != 0)
+   if (_mesa_is_user_fbo(ctx->DrawBuffer))
       return;
 
    xmbuf = XMESA_BUFFER(ctx->DrawBuffer);
@@ -240,7 +241,7 @@ clear_nbit_ximage(struct gl_context *ctx, struct xmesa_renderbuffer *xrb,
 static void
 clear_buffers(struct gl_context *ctx, GLbitfield buffers)
 {
-   if (ctx->DrawBuffer->Name == 0) {
+   if (_mesa_is_winsys_fbo(ctx->DrawBuffer)) {
       /* this is a window system framebuffer */
       const GLuint *colorMask = (GLuint *) &ctx->Color.ColorMask[0];
       const XMesaContext xmesa = XMESA_CONTEXT(ctx);
@@ -304,7 +305,7 @@ can_do_DrawPixels_8R8G8B(struct gl_context *ctx, GLenum format, GLenum type)
    if (format == GL_BGRA &&
        type == GL_UNSIGNED_BYTE &&
        ctx->DrawBuffer &&
-       ctx->DrawBuffer->Name == 0 &&
+       _mesa_is_winsys_fbo(ctx->DrawBuffer) &&
        ctx->Pixel.ZoomX == 1.0 &&        /* no zooming */
        ctx->Pixel.ZoomY == 1.0 &&
        ctx->_ImageTransferState == 0 /* no color tables, scale/bias, etc */) {
@@ -437,7 +438,7 @@ can_do_DrawPixels_5R6G5B(struct gl_context *ctx, GLenum format, GLenum type)
        type == GL_UNSIGNED_SHORT_5_6_5 &&
        !ctx->Color.DitherFlag &&  /* no dithering */
        ctx->DrawBuffer &&
-       ctx->DrawBuffer->Name == 0 &&
+       _mesa_is_winsys_fbo(ctx->DrawBuffer) &&
        ctx->Pixel.ZoomX == 1.0 &&        /* no zooming */
        ctx->Pixel.ZoomY == 1.0 &&
        ctx->_ImageTransferState == 0 /* no color tables, scale/bias, etc */) {
@@ -693,7 +694,7 @@ xmesa_update_state( struct gl_context *ctx, GLbitfield new_state )
    _vbo_InvalidateState( ctx, new_state );
    _swsetup_InvalidateState( ctx, new_state );
 
-   if (ctx->DrawBuffer->Name != 0)
+   if (_mesa_is_user_fbo(ctx->DrawBuffer))
       return;
 
    /*
diff --git a/src/mesa/main/api_arrayelt.c b/src/mesa/main/api_arrayelt.c
index 7bf55f3198b..6de6de2b765 100644
--- a/src/mesa/main/api_arrayelt.c
+++ b/src/mesa/main/api_arrayelt.c
@@ -1643,12 +1643,20 @@ void GLAPIENTRY _ae_ArrayElement( GLint elt )
    const struct _glapi_table * const disp = GET_DISPATCH();
    GLboolean do_map;
 
+   /* If PrimitiveRestart is enabled and the index is the RestartIndex
+    * then we call PrimitiveRestartNV and return.
+    */
+   if (ctx->Array.PrimitiveRestart && (elt == ctx->Array.RestartIndex)) {
+      CALL_PrimitiveRestartNV((struct _glapi_table *)disp, ());
+      return;
+   }
+
    if (actx->NewState) {
       assert(!actx->mapped_vbos);
       _ae_update_state( ctx );
    }
 
-   /* Determine if w need to map/unmap VBOs */
+   /* Determine if we need to map/unmap VBOs */
    do_map = actx->nr_vbos && !actx->mapped_vbos;
 
    if (do_map)
diff --git a/src/mesa/main/context.c b/src/mesa/main/context.c
index d75351c8598..7e2ac98b9a0 100644
--- a/src/mesa/main/context.c
+++ b/src/mesa/main/context.c
@@ -792,6 +792,7 @@ init_attrib_groups(struct gl_context *ctx)
 
    /* Miscellaneous */
    ctx->NewState = _NEW_ALL;
+   ctx->NewDriverState = ~0;
    ctx->ErrorValue = (GLenum) GL_NO_ERROR;
    ctx->ResetStatus = (GLenum) GL_NO_ERROR;
    ctx->varying_vp_inputs = VERT_BIT_ALL;
@@ -1290,6 +1291,7 @@ _mesa_copy_context( const struct gl_context *src, struct gl_context *dst,
    /* XXX FIXME:  Call callbacks?
     */
    dst->NewState = _NEW_ALL;
+   dst->NewDriverState = ~0;
 }
 #endif
 
diff --git a/src/mesa/main/dlist.c b/src/mesa/main/dlist.c
index 8cfb97c266e..c1c65ea25d2 100644
--- a/src/mesa/main/dlist.c
+++ b/src/mesa/main/dlist.c
@@ -134,7 +134,7 @@ do {						\
 do {									\
    if (ctx->Driver.CurrentSavePrimitive <= GL_POLYGON ||		\
        ctx->Driver.CurrentSavePrimitive == PRIM_INSIDE_UNKNOWN_PRIM) {	\
-      _mesa_compile_error( ctx, GL_INVALID_OPERATION, "begin/end" );	\
+      _mesa_compile_error( ctx, GL_INVALID_OPERATION, "glBegin/End" );	\
       return retval;							\
    }									\
 } while (0)
@@ -149,7 +149,7 @@ do {									\
 do {									\
    if (ctx->Driver.CurrentSavePrimitive <= GL_POLYGON ||		\
        ctx->Driver.CurrentSavePrimitive == PRIM_INSIDE_UNKNOWN_PRIM) {	\
-      _mesa_compile_error( ctx, GL_INVALID_OPERATION, "begin/end" );	\
+      _mesa_compile_error( ctx, GL_INVALID_OPERATION, "glBegin/End" );	\
       return;								\
    }									\
 } while (0)
@@ -5673,17 +5673,25 @@ save_Indexfv(const GLfloat * v)
 static void GLAPIENTRY
 save_EdgeFlag(GLboolean x)
 {
-   save_Attr1fNV(VERT_ATTRIB_EDGEFLAG, x ? (GLfloat)1.0 : (GLfloat)0.0);
+   save_Attr1fNV(VERT_ATTRIB_EDGEFLAG, x ? 1.0f : 0.0f);
 }
 
-static inline GLboolean compare4fv( const GLfloat *a,
-                                    const GLfloat *b,
-                                    GLuint count )
+
+/**
+ * Compare 'count' elements of vectors 'a' and 'b'.
+ * \return GL_TRUE if equal, GL_FALSE if different.
+ */
+static inline GLboolean
+compare_vec(const GLfloat *a, const GLfloat *b, GLuint count)
 {
    return memcmp( a, b, count * sizeof(GLfloat) ) == 0;
 }
-                              
 
+
+/**
+ * This glMaterial function is used for glMaterial calls that are outside
+ * a glBegin/End pair.  For glMaterial inside glBegin/End, see the VBO code.
+ */
 static void GLAPIENTRY
 save_Materialfv(GLenum face, GLenum pname, const GLfloat * param)
 {
@@ -5698,7 +5706,7 @@ save_Materialfv(GLenum face, GLenum pname, const GLfloat * param)
    case GL_FRONT_AND_BACK:
       break;
    default:
-      _mesa_compile_error(ctx, GL_INVALID_ENUM, "material(face)");
+      _mesa_compile_error(ctx, GL_INVALID_ENUM, "glMaterial(face)");
       return;
    }
 
@@ -5717,7 +5725,7 @@ save_Materialfv(GLenum face, GLenum pname, const GLfloat * param)
       args = 3;
       break;
    default:
-      _mesa_compile_error(ctx, GL_INVALID_ENUM, "material(pname)");
+      _mesa_compile_error(ctx, GL_INVALID_ENUM, "glMaterial(pname)");
       return;
    }
    
@@ -5734,7 +5742,8 @@ save_Materialfv(GLenum face, GLenum pname, const GLfloat * param)
    for (i = 0; i < MAT_ATTRIB_MAX; i++) {
       if (bitmask & (1 << i)) {
          if (ctx->ListState.ActiveMaterialSize[i] == args &&
-             compare4fv(ctx->ListState.CurrentMaterial[i], param, args)) {
+             compare_vec(ctx->ListState.CurrentMaterial[i], param, args)) {
+            /* no change in material value */
             bitmask &= ~(1 << i);
          }
          else {
@@ -5744,8 +5753,7 @@ save_Materialfv(GLenum face, GLenum pname, const GLfloat * param)
       }
    }
 
-   /* If this call has effect, return early:
-    */
+   /* If this call has no effect, return early */
    if (bitmask == 0)
       return;
 
diff --git a/src/mesa/main/errors.c b/src/mesa/main/errors.c
index fcf873f18c2..4a187b7b0f2 100644
--- a/src/mesa/main/errors.c
+++ b/src/mesa/main/errors.c
@@ -1062,4 +1062,47 @@ _mesa_debug( const struct gl_context *ctx, const char *fmtString, ... )
    (void) fmtString;
 }
 
+
+/**
+ * Report debug information from the shader compiler via GL_ARB_debug_output.
+ *
+ * \param ctx GL context.
+ * \param type Message namespace; only GL_DEBUG_TYPE_ERROR_ARB is implemented.
+ * \param id The message ID within the given namespace.
+ * \param msg The message to output. Need not be null-terminated.
+ * \param len The length of 'msg'. If negative, 'msg' must be null-terminated.
+ */
+void
+_mesa_shader_debug( struct gl_context *ctx, GLenum type, GLuint id,
+                    const char *msg, int len )
+{
+   GLenum source = GL_DEBUG_SOURCE_SHADER_COMPILER_ARB,
+          severity;
+
+   switch (type) {
+   case GL_DEBUG_TYPE_ERROR_ARB:
+      assert(id < SHADER_ERROR_COUNT);
+      severity = GL_DEBUG_SEVERITY_HIGH_ARB;
+      break;
+   case GL_DEBUG_TYPE_DEPRECATED_BEHAVIOR_ARB:
+   case GL_DEBUG_TYPE_UNDEFINED_BEHAVIOR_ARB:
+   case GL_DEBUG_TYPE_PORTABILITY_ARB:
+   case GL_DEBUG_TYPE_PERFORMANCE_ARB:
+   case GL_DEBUG_TYPE_OTHER_ARB:
+      assert(0 && "other categories not implemented yet"); /* fallthrough */
+   default:
+      _mesa_problem(ctx, "bad enum in _mesa_shader_debug()");
+      return;
+   }
+
+   if (len < 0)
+      len = strlen(msg);
+
+   /* Truncate: pass at most MAX_DEBUG_MESSAGE_LENGTH - 1 bytes of msg on. */
+   if (len >= MAX_DEBUG_MESSAGE_LENGTH)
+      len = MAX_DEBUG_MESSAGE_LENGTH - 1;
+
+   _mesa_log_msg(ctx, source, type, id, severity, len, msg);
+}
+
 /*@}*/
diff --git a/src/mesa/main/errors.h b/src/mesa/main/errors.h
index ed1c6fc7fb5..b4490fac9dd 100644
--- a/src/mesa/main/errors.h
+++ b/src/mesa/main/errors.h
@@ -68,6 +68,9 @@ _mesa_error( struct gl_context *ctx, GLenum error, const char *fmtString, ... )
 extern void
 _mesa_debug( const struct gl_context *ctx, const char *fmtString, ... ) PRINTFLIKE(2, 3);
 
+extern void
+_mesa_shader_debug( struct gl_context *ctx, GLenum type, GLuint id, const char *msg, int len );
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/mesa/main/extensions.c b/src/mesa/main/extensions.c
index 5f2c74a758e..a843a40925c 100644
--- a/src/mesa/main/extensions.c
+++ b/src/mesa/main/extensions.c
@@ -754,7 +754,7 @@ get_extension_override( struct gl_context *ctx )
 
    /* Remove trailing space. */
    len = strlen(extra_exts);
-   if (extra_exts[len - 1] == ' ')
+   if (len > 0 && extra_exts[len - 1] == ' ')
       extra_exts[len - 1] = '\0';
 
    return extra_exts;
diff --git a/src/mesa/main/fbobject.c b/src/mesa/main/fbobject.c
index 26ae1087c6e..f5636948397 100644
--- a/src/mesa/main/fbobject.c
+++ b/src/mesa/main/fbobject.c
@@ -2023,7 +2023,11 @@ framebuffer_texture(struct gl_context *ctx, const char *caller, GLenum target,
    _glthread_LOCK_MUTEX(fb->Mutex);
    if (texObj) {
       if (attachment == GL_DEPTH_ATTACHMENT &&
-	   texObj == fb->Attachment[BUFFER_STENCIL].Texture) {
+          texObj == fb->Attachment[BUFFER_STENCIL].Texture &&
+          level == fb->Attachment[BUFFER_STENCIL].TextureLevel &&
+          _mesa_tex_target_to_face(textarget) ==
+          fb->Attachment[BUFFER_STENCIL].CubeMapFace &&
+          zoffset == fb->Attachment[BUFFER_STENCIL].Zoffset) {
 	 /* The texture object is already attached to the stencil attachment
 	  * point. Don't create a new renderbuffer; just reuse the stencil
 	  * attachment's. This is required to prevent a GL error in
@@ -2032,8 +2036,12 @@ framebuffer_texture(struct gl_context *ctx, const char *caller, GLenum target,
 	 reuse_framebuffer_texture_attachment(fb, BUFFER_DEPTH,
 	                                      BUFFER_STENCIL);
       } else if (attachment == GL_STENCIL_ATTACHMENT &&
-	         texObj == fb->Attachment[BUFFER_DEPTH].Texture) {
-	 /* As above, but with depth and stencil juxtaposed. */
+	         texObj == fb->Attachment[BUFFER_DEPTH].Texture &&
+                 level == fb->Attachment[BUFFER_DEPTH].TextureLevel &&
+                 _mesa_tex_target_to_face(textarget) ==
+                 fb->Attachment[BUFFER_DEPTH].CubeMapFace &&
+                 zoffset == fb->Attachment[BUFFER_DEPTH].Zoffset) {
+	 /* As above, but with depth and stencil transposed. */
 	 reuse_framebuffer_texture_attachment(fb, BUFFER_STENCIL,
 	                                      BUFFER_DEPTH);
       } else {
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 2b709ded683..06ca0d5df16 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -1636,6 +1636,11 @@ struct gl_array_attrib
 
    /* GL_ARB_vertex_buffer_object */
    struct gl_buffer_object *ArrayBufferObj;
+
+   /**
+    * Vertex arrays as consumed by a driver.
+    * The array pointer is set up only by the VBO module. */
+   const struct gl_client_array **_DrawArrays; /**< 0..VERT_ATTRIB_MAX-1 */
 };
 
 
@@ -3252,6 +3257,17 @@ typedef enum
    API_OPENGLES2
 } gl_api;
 
+/**
+ * Driver-specific state flags.
+ *
+ * These are OR'd into gl_context::NewDriverState to notify a driver about
+ * a state change. The driver sets the flags at context creation and
+ * the meaning of the bits set is opaque to core Mesa.
+ */
+struct gl_driver_flags
+{
+   GLbitfield NewArray;             /**< Vertex array state changed */
+};
 
 /**
  * Mesa rendering context.
@@ -3411,6 +3427,9 @@ struct gl_context
 
    GLenum RenderMode;        /**< either GL_RENDER, GL_SELECT, GL_FEEDBACK */
    GLbitfield NewState;      /**< bitwise-or of _NEW_* flags */
+   GLbitfield NewDriverState;/**< bitwise-or of flags from DriverFlags */
+
+   struct gl_driver_flags DriverFlags;
 
    GLboolean ViewportInitialized;  /**< has viewport size been initialized? */
 
diff --git a/src/mesa/main/teximage.c b/src/mesa/main/teximage.c
index 23486467dbc..694f6fa0010 100644
--- a/src/mesa/main/teximage.c
+++ b/src/mesa/main/teximage.c
@@ -530,6 +530,32 @@ _mesa_base_tex_format( struct gl_context *ctx, GLint internalFormat )
 }
 
 
+/**
+ * Return GL_TRUE if 'format' is a generic compressed format, else GL_FALSE.
+ */
+static GLboolean
+is_generic_compressed_format(GLenum format)
+{
+   switch (format) {
+   case GL_COMPRESSED_RED:
+   case GL_COMPRESSED_RG:
+   case GL_COMPRESSED_RGB:
+   case GL_COMPRESSED_RGBA:
+   case GL_COMPRESSED_ALPHA:
+   case GL_COMPRESSED_LUMINANCE:
+   case GL_COMPRESSED_LUMINANCE_ALPHA:
+   case GL_COMPRESSED_INTENSITY:
+   case GL_COMPRESSED_SRGB:
+   case GL_COMPRESSED_SRGB_ALPHA:
+   case GL_COMPRESSED_SLUMINANCE:
+   case GL_COMPRESSED_SLUMINANCE_ALPHA:
+      return GL_TRUE;
+   default:
+      return GL_FALSE;
+   }
+}
+
+
 /**
  * For cube map faces, return a face index in [0,5].
  * For other targets return 0;
@@ -1705,7 +1731,8 @@ texture_error_check( struct gl_context *ctx,
    }
 
    /* additional checks for compressed textures */
-   if (_mesa_is_compressed_format(ctx, internalFormat)) {
+   if (_mesa_is_compressed_format(ctx, internalFormat) ||
+       is_generic_compressed_format(internalFormat)) {
       if (!target_can_be_compressed(ctx, target, internalFormat)) {
          if (!isProxy)
             _mesa_error(ctx, GL_INVALID_ENUM,
@@ -2036,7 +2063,8 @@ copytexture_error_check( struct gl_context *ctx, GLuint dimensions,
       return GL_TRUE;
    }
 
-   if (_mesa_is_compressed_format(ctx, internalFormat)) {
+   if (_mesa_is_compressed_format(ctx, internalFormat) ||
+       is_generic_compressed_format(internalFormat)) {
       if (!target_can_be_compressed(ctx, target, internalFormat)) {
          _mesa_error(ctx, GL_INVALID_ENUM,
                      "glCopyTexImage%dD(target)", dimensions);
@@ -2692,6 +2720,13 @@ _mesa_EGLImageTargetTexture2DOES (GLenum target, GLeglImageOES image)
    texObj = _mesa_get_current_tex_object(ctx, target);
    _mesa_lock_texture(ctx, texObj);
 
+   if (texObj->Immutable) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+		  "glEGLImageTargetTexture2D(texture is immutable)");
+      _mesa_unlock_texture(ctx, texObj);
+      return;
+   }
+
    texImage = _mesa_get_tex_image(ctx, texObj, target, 0);
    if (!texImage) {
       _mesa_error(ctx, GL_OUT_OF_MEMORY, "glEGLImageTargetTexture2D");
diff --git a/src/mesa/main/teximage.h b/src/mesa/main/teximage.h
index e2bdaca0150..66a0c88950b 100644
--- a/src/mesa/main/teximage.h
+++ b/src/mesa/main/teximage.h
@@ -35,6 +35,9 @@
 #include "mtypes.h"
 #include "formats.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
 
 /** Is the given value one of the 6 cube faces? */
 static inline GLboolean
@@ -287,4 +290,8 @@ _mesa_TexBuffer(GLenum target, GLenum internalFormat, GLuint buffer);
 
 /*@}*/
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif
diff --git a/src/mesa/state_tracker/st_cb_rasterpos.c b/src/mesa/state_tracker/st_cb_rasterpos.c
index 2c21dc9a776..8337f462418 100644
--- a/src/mesa/state_tracker/st_cb_rasterpos.c
+++ b/src/mesa/state_tracker/st_cb_rasterpos.c
@@ -225,6 +225,7 @@ st_RasterPos(struct gl_context *ctx, const GLfloat v[4])
    struct st_context *st = st_context(ctx);
    struct draw_context *draw = st->draw;
    struct rastpos_stage *rs;
+   const struct gl_client_array **saved_arrays = ctx->Array._DrawArrays;
 
    if (st->rastpos_stage) {
       /* get rastpos stage info */
@@ -250,9 +251,14 @@ st_RasterPos(struct gl_context *ctx, const GLfloat v[4])
     */
    rs->array[0].Ptr = (GLubyte *) v;
 
-   /* draw the point */
-   st_feedback_draw_vbo(ctx, rs->arrays, &rs->prim, 1, NULL, GL_TRUE, 0, 1,
+   /* Draw the point.
+    *
+    * Don't set DriverFlags.NewArray.
+    * st_feedback_draw_vbo doesn't check for that flag. */
+   ctx->Array._DrawArrays = rs->arrays;
+   st_feedback_draw_vbo(ctx, &rs->prim, 1, NULL, GL_TRUE, 0, 1,
                         NULL);
+   ctx->Array._DrawArrays = saved_arrays;
 
    /* restore draw's rasterization stage depending on rendermode */
    if (ctx->RenderMode == GL_FEEDBACK) {
diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c
index 93f4216f34e..b4497652539 100644
--- a/src/mesa/state_tracker/st_context.c
+++ b/src/mesa/state_tracker/st_context.c
@@ -195,6 +195,10 @@ st_create_context_priv( struct gl_context *ctx, struct pipe_context *pipe )
    return st;
 }
 
+static void st_init_driver_flags(struct gl_driver_flags *f)
+{
+   f->NewArray = ST_NEW_VERTEX_ARRAYS; /* reaches st->dirty.st via NewDriverState */
+}
 
 struct st_context *st_create_context(gl_api api, struct pipe_context *pipe,
                                      const struct gl_config *visual,
@@ -217,6 +221,8 @@ struct st_context *st_create_context(gl_api api, struct pipe_context *pipe,
       return NULL;
    }
 
+   st_init_driver_flags(&ctx->DriverFlags);
+
    /* XXX: need a capability bit in gallium to query if the pipe
     * driver prefers DP4 or MUL/MAD for vertex transformation.
     */
diff --git a/src/mesa/state_tracker/st_context.h b/src/mesa/state_tracker/st_context.h
index 49b8316842e..00a405b69f9 100644
--- a/src/mesa/state_tracker/st_context.h
+++ b/src/mesa/state_tracker/st_context.h
@@ -49,6 +49,7 @@ struct u_upload_mgr;
 #define ST_NEW_FRAMEBUFFER             (1 << 3)
 #define ST_NEW_EDGEFLAGS_DATA          (1 << 4)
 #define ST_NEW_GEOMETRY_PROGRAM        (1 << 5)
+#define ST_NEW_VERTEX_ARRAYS           (1 << 6)
 
 
 struct st_state_flags {
diff --git a/src/mesa/state_tracker/st_draw.c b/src/mesa/state_tracker/st_draw.c
index ab2290f2c9d..a8c20f45acd 100644
--- a/src/mesa/state_tracker/st_draw.c
+++ b/src/mesa/state_tracker/st_draw.c
@@ -892,7 +892,6 @@ st_validate_varrays(struct gl_context *ctx,
  */
 void
 st_draw_vbo(struct gl_context *ctx,
-            const struct gl_client_array **arrays,
             const struct _mesa_prim *prims,
             GLuint nr_prims,
             const struct _mesa_index_buffer *ib,
@@ -905,14 +904,21 @@ st_draw_vbo(struct gl_context *ctx,
    struct pipe_context *pipe = st->pipe;
    struct pipe_index_buffer ibuffer = {0};
    struct pipe_draw_info info;
+   const struct gl_client_array **arrays = ctx->Array._DrawArrays;
    unsigned i;
-   GLboolean new_array =
-      st->dirty.st &&
-      (st->dirty.mesa & (_NEW_ARRAY | _NEW_PROGRAM | _NEW_BUFFER_OBJECT)) != 0;
+   GLboolean new_array;
 
    /* Mesa core state should have been validated already */
    assert(ctx->NewState == 0x0);
 
+   /* Get Mesa driver state. */
+   st->dirty.st |= ctx->NewDriverState;
+   ctx->NewDriverState = 0;
+
+   new_array =
+      (st->dirty.st & (ST_NEW_VERTEX_ARRAYS | ST_NEW_VERTEX_PROGRAM)) ||
+      (st->dirty.mesa & (_NEW_PROGRAM | _NEW_BUFFER_OBJECT)) != 0;
+
    /* Validate state. */
    if (st->dirty.st) {
       GLboolean vertDataEdgeFlags;
diff --git a/src/mesa/state_tracker/st_draw.h b/src/mesa/state_tracker/st_draw.h
index 47bdb11f81c..c608051eba1 100644
--- a/src/mesa/state_tracker/st_draw.h
+++ b/src/mesa/state_tracker/st_draw.h
@@ -49,7 +49,6 @@ void st_destroy_draw( struct st_context *st );
 
 extern void
 st_draw_vbo(struct gl_context *ctx,
-            const struct gl_client_array **arrays,
             const struct _mesa_prim *prims,
             GLuint nr_prims,
             const struct _mesa_index_buffer *ib,
@@ -60,7 +59,6 @@ st_draw_vbo(struct gl_context *ctx,
 
 extern void
 st_feedback_draw_vbo(struct gl_context *ctx,
-                     const struct gl_client_array **arrays,
                      const struct _mesa_prim *prims,
                      GLuint nr_prims,
                      const struct _mesa_index_buffer *ib,
diff --git a/src/mesa/state_tracker/st_draw_feedback.c b/src/mesa/state_tracker/st_draw_feedback.c
index ee19898b3ea..4209fb214f8 100644
--- a/src/mesa/state_tracker/st_draw_feedback.c
+++ b/src/mesa/state_tracker/st_draw_feedback.c
@@ -91,7 +91,6 @@ set_feedback_vertex_format(struct gl_context *ctx)
  */
 void
 st_feedback_draw_vbo(struct gl_context *ctx,
-                     const struct gl_client_array **arrays,
                      const struct _mesa_prim *prims,
                      GLuint nr_prims,
                      const struct _mesa_index_buffer *ib,
@@ -110,6 +109,7 @@ st_feedback_draw_vbo(struct gl_context *ctx,
    struct pipe_index_buffer ibuffer;
    struct pipe_transfer *vb_transfer[PIPE_MAX_ATTRIBS] = {NULL};
    struct pipe_transfer *ib_transfer = NULL;
+   const struct gl_client_array **arrays = ctx->Array._DrawArrays;
    GLuint attr, i;
    const GLubyte *low_addr = NULL;
    const void *mapped_indices = NULL;
diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index 9e68deb3471..60a4e2831a4 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -1775,9 +1775,7 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
       emit(ir, TGSI_OPCODE_TRUNC, result_dst, op[0]);
       break;
    case ir_unop_ceil:
-      op[0].negate = ~op[0].negate;
-      emit(ir, TGSI_OPCODE_FLR, result_dst, op[0]);
-      result_src.negate = ~result_src.negate;
+      emit(ir, TGSI_OPCODE_CEIL, result_dst, op[0]);
       break;
    case ir_unop_floor:
       emit(ir, TGSI_OPCODE_FLR, result_dst, op[0]);
@@ -3963,7 +3961,7 @@ dst_register(struct st_translate *t,
 
    case PROGRAM_TEMPORARY:
       if (ureg_dst_is_undef(t->temps[index]))
-         t->temps[index] = ureg_DECL_temporary(t->ureg);
+         t->temps[index] = ureg_DECL_local_temporary(t->ureg);
 
       return t->temps[index];
 
@@ -4004,7 +4002,7 @@ src_register(struct st_translate *t,
       assert(index >= 0);
       assert(index < Elements(t->temps));
       if (ureg_dst_is_undef(t->temps[index]))
-         t->temps[index] = ureg_DECL_temporary(t->ureg);
+         t->temps[index] = ureg_DECL_local_temporary(t->ureg);
       return ureg_src(t->temps[index]);
 
    case PROGRAM_NAMED_PARAM:
@@ -4261,7 +4259,7 @@ emit_wpos_adjustment( struct st_translate *t,
           * or not, which is determined by testing against the inversion
           * state variable used below, which will be either +1 or -1.
           */
-         struct ureg_dst adj_temp = ureg_DECL_temporary(ureg);
+         struct ureg_dst adj_temp = ureg_DECL_local_temporary(ureg);
 
          ureg_CMP(ureg, adj_temp,
                   ureg_scalar(wpostrans, invert ? 2 : 0),
@@ -4624,7 +4622,7 @@ st_translate_program(
        */
       for (i = 0; i < (unsigned)program->next_temp; i++) {
          /* XXX use TGSI_FILE_TEMPORARY_ARRAY when it's supported by ureg */
-         t->temps[i] = ureg_DECL_temporary(t->ureg);
+         t->temps[i] = ureg_DECL_local_temporary(t->ureg);
       }
    }
 
diff --git a/src/mesa/tnl/t_draw.c b/src/mesa/tnl/t_draw.c
index 17042cf8f37..6a3c9662e96 100644
--- a/src/mesa/tnl/t_draw.c
+++ b/src/mesa/tnl/t_draw.c
@@ -408,7 +408,6 @@ static void unmap_vbos( struct gl_context *ctx,
 
 
 void _tnl_vbo_draw_prims(struct gl_context *ctx,
-			 const struct gl_client_array *arrays[],
 			 const struct _mesa_prim *prim,
 			 GLuint nr_prims,
 			 const struct _mesa_index_buffer *ib,
@@ -417,6 +416,8 @@ void _tnl_vbo_draw_prims(struct gl_context *ctx,
 			 GLuint max_index,
 			 struct gl_transform_feedback_object *tfb_vertcount)
 {
+   const struct gl_client_array **arrays = ctx->Array._DrawArrays;
+
    if (!index_bounds_valid)
       vbo_get_minmax_indices(ctx, prim, ib, &min_index, &max_index, nr_prims);
 
diff --git a/src/mesa/tnl/tnl.h b/src/mesa/tnl/tnl.h
index 434bd7fcd95..f6b70e3236f 100644
--- a/src/mesa/tnl/tnl.h
+++ b/src/mesa/tnl/tnl.h
@@ -86,7 +86,6 @@ _tnl_draw_prims( struct gl_context *ctx,
 
 void
 _tnl_vbo_draw_prims( struct gl_context *ctx,
-		     const struct gl_client_array *arrays[],
 		     const struct _mesa_prim *prim,
 		     GLuint nr_prims,
 		     const struct _mesa_index_buffer *ib,
diff --git a/src/mesa/vbo/vbo.h b/src/mesa/vbo/vbo.h
index 2d01d9823cc..3cff8987e78 100644
--- a/src/mesa/vbo/vbo.h
+++ b/src/mesa/vbo/vbo.h
@@ -72,7 +72,6 @@ void _vbo_InvalidateState( struct gl_context *ctx, GLuint new_state );
 
 
 typedef void (*vbo_draw_func)( struct gl_context *ctx,
-			       const struct gl_client_array **arrays,
 			       const struct _mesa_prim *prims,
 			       GLuint nr_prims,
 			       const struct _mesa_index_buffer *ib,
diff --git a/src/mesa/vbo/vbo_context.h b/src/mesa/vbo/vbo_context.h
index b9a8affa520..1c49de0ca2f 100644
--- a/src/mesa/vbo/vbo_context.h
+++ b/src/mesa/vbo/vbo_context.h
@@ -58,6 +58,18 @@
 #include "vbo_save.h"
 
 
+/** Used to signal when transitioning from one kind of drawing method
+ * to another.
+ */
+enum draw_method
+{
+   DRAW_NONE,          /**< Initial value only */
+   DRAW_BEGIN_END,     /**< glBegin/glEnd immediate mode */
+   DRAW_DISPLAY_LIST,  /**< Display list playback */
+   DRAW_ARRAYS         /**< glDrawArrays, glDrawElements and variations */
+};
+
+
 struct vbo_context {
    struct gl_client_array currval[VBO_ATTRIB_MAX];
    
@@ -74,6 +86,8 @@ struct vbo_context {
     * is responsible for initiating any fallback actions required:
     */
    vbo_draw_func draw_prims;
+
+   enum draw_method last_draw_method;
 };
 
 
@@ -101,4 +115,40 @@ get_program_mode( struct gl_context *ctx )
 }
 
 
+/**
+ * Called by glBegin, glDrawArrays and glDrawElements (and variations of
+ * those calls) and by display list playback.  When the draw method changes,
+ * repoint ctx->Array._DrawArrays and OR DriverFlags.NewArray into NewDriverState.
+ *
+ * glBegin/End builds vertex arrays.  Those arrays may look identical
+ * to glDrawArrays arrays except that the position of the elements may
+ * be different.  For example, arrays of (position3f, normal3f) vs. arrays
+ * of (normal3f, position3f).  So we need to make sure we notify drivers
+ * that arrays may be changing.
+ */
+static inline void
+vbo_draw_method(struct vbo_context *vbo, enum draw_method method)
+{
+   if (vbo->last_draw_method != method) {
+      struct gl_context *ctx = vbo->exec.ctx;
+
+      switch (method) {
+      case DRAW_ARRAYS:
+         ctx->Array._DrawArrays = vbo->exec.array.inputs;
+         break;
+      case DRAW_BEGIN_END:
+         ctx->Array._DrawArrays = vbo->exec.vtx.inputs;
+         break;
+      case DRAW_DISPLAY_LIST:
+         ctx->Array._DrawArrays = vbo->save.inputs;
+         break;
+      default:
+         ASSERT(0); /* DRAW_NONE ("initial value only") is not expected here */
+      }
+
+      ctx->NewDriverState |= ctx->DriverFlags.NewArray;
+      vbo->last_draw_method = method;
+   }
+}
+
 #endif
diff --git a/src/mesa/vbo/vbo_exec.h b/src/mesa/vbo/vbo_exec.h
index be9f3d78d2b..4ac7d168dc7 100644
--- a/src/mesa/vbo/vbo_exec.h
+++ b/src/mesa/vbo/vbo_exec.h
@@ -78,26 +78,12 @@ struct vbo_exec_copied_vtx {
 };
 
 
-/** Used to signal when transitioning from one kind of drawing method
- * to another.
- */
-enum draw_method
-{
-   DRAW_NONE,          /**< Initial value only */
-   DRAW_BEGIN_END,
-   DRAW_DISPLAY_LIST,
-   DRAW_ARRAYS
-};
-
-
 struct vbo_exec_context
 {
    struct gl_context *ctx;   
    GLvertexformat vtxfmt;
    GLvertexformat vtxfmt_noop;
 
-   enum draw_method last_draw_method;
-
    struct {
       struct gl_buffer_object *bufferobj;
 
@@ -174,28 +160,6 @@ void vbo_exec_vtx_init( struct vbo_exec_context *exec );
 void vbo_exec_vtx_destroy( struct vbo_exec_context *exec );
 
 
-/**
- * This is called by glBegin, glDrawArrays and glDrawElements (and
- * variations of those calls).  When we transition from immediate mode
- * drawing to array drawing we need to invalidate the array state.
- *
- * glBegin/End builds vertex arrays.  Those arrays may look identical
- * to glDrawArrays arrays except that the position of the elements may
- * be different.  For example, arrays of (position3v, normal3f) vs. arrays
- * of (normal3f, position3f).  So we need to make sure we notify drivers
- * that arrays may be changing.
- */
-static inline void
-vbo_draw_method(struct vbo_exec_context *exec, enum draw_method method)
-{
-   if (exec->last_draw_method != method) {
-      struct gl_context *ctx = exec->ctx;
-      ctx->Driver.UpdateState(ctx, _NEW_ARRAY);
-      exec->last_draw_method = method;
-   }
-}
-
-
 #if FEATURE_beginend
 
 void vbo_exec_vtx_flush( struct vbo_exec_context *exec, GLboolean unmap );
diff --git a/src/mesa/vbo/vbo_exec_api.c b/src/mesa/vbo/vbo_exec_api.c
index 3f95410892e..b87da18f3eb 100644
--- a/src/mesa/vbo/vbo_exec_api.c
+++ b/src/mesa/vbo/vbo_exec_api.c
@@ -700,7 +700,7 @@ static void GLAPIENTRY vbo_exec_Begin( GLenum mode )
          return;
       }
 
-      vbo_draw_method(exec, DRAW_BEGIN_END);
+      vbo_draw_method(vbo_context(ctx), DRAW_BEGIN_END);
 
       if (ctx->Driver.PrepareExecBegin)
 	 ctx->Driver.PrepareExecBegin(ctx);
diff --git a/src/mesa/vbo/vbo_exec_array.c b/src/mesa/vbo/vbo_exec_array.c
index 2dcfb8e5b8c..cc94e761bc1 100644
--- a/src/mesa/vbo/vbo_exec_array.c
+++ b/src/mesa/vbo/vbo_exec_array.c
@@ -506,7 +506,7 @@ recalculate_input_bindings(struct gl_context *ctx)
    }
 
    _mesa_set_varying_vp_inputs( ctx, VERT_BIT_ALL & (~const_inputs) );
-   ctx->Driver.UpdateState(ctx, _NEW_ARRAY);
+   ctx->NewDriverState |= ctx->DriverFlags.NewArray;
 }
 
 
@@ -523,7 +523,7 @@ vbo_bind_arrays(struct gl_context *ctx)
    struct vbo_context *vbo = vbo_context(ctx);
    struct vbo_exec_context *exec = &vbo->exec;
 
-   vbo_draw_method(exec, DRAW_ARRAYS);
+   vbo_draw_method(vbo, DRAW_ARRAYS);
 
    if (exec->array.recalculate_inputs) {
       recalculate_input_bindings(ctx);
@@ -600,7 +600,7 @@ vbo_draw_arrays(struct gl_context *ctx, GLenum mode, GLint start,
       if (primCount > 0) {
          /* draw one or two prims */
          check_buffers_are_unmapped(exec->array.inputs);
-         vbo->draw_prims(ctx, exec->array.inputs, prim, primCount, NULL,
+         vbo->draw_prims(ctx, prim, primCount, NULL,
                          GL_TRUE, start, start + count - 1, NULL);
       }
    }
@@ -610,7 +610,7 @@ vbo_draw_arrays(struct gl_context *ctx, GLenum mode, GLint start,
       prim[0].count = count;
 
       check_buffers_are_unmapped(exec->array.inputs);
-      vbo->draw_prims(ctx, exec->array.inputs, prim, 1, NULL,
+      vbo->draw_prims(ctx, prim, 1, NULL,
                       GL_TRUE, start, start + count - 1,
                       NULL);
    }
@@ -801,7 +801,7 @@ vbo_validated_drawrangeelements(struct gl_context *ctx, GLenum mode,
     */
 
    check_buffers_are_unmapped(exec->array.inputs);
-   vbo->draw_prims( ctx, exec->array.inputs, prim, 1, &ib,
+   vbo->draw_prims( ctx, prim, 1, &ib,
 		    index_bounds_valid, start, end, NULL );
 }
 
@@ -1096,7 +1096,7 @@ vbo_validated_multidrawelements(struct gl_context *ctx, GLenum mode,
       }
 
       check_buffers_are_unmapped(exec->array.inputs);
-      vbo->draw_prims(ctx, exec->array.inputs, prim, primcount, &ib,
+      vbo->draw_prims(ctx, prim, primcount, &ib,
 		      GL_FALSE, ~0, ~0, NULL);
    } else {
       /* render one prim at a time */
@@ -1121,7 +1121,7 @@ vbo_validated_multidrawelements(struct gl_context *ctx, GLenum mode,
 	    prim[0].basevertex = 0;
 
          check_buffers_are_unmapped(exec->array.inputs);
-         vbo->draw_prims(ctx, exec->array.inputs, prim, 1, &ib,
+         vbo->draw_prims(ctx, prim, 1, &ib,
                          GL_FALSE, ~0, ~0, NULL);
       }
    }
@@ -1199,7 +1199,7 @@ vbo_draw_transform_feedback(struct gl_context *ctx, GLenum mode,
     * will be rendered. */
 
    check_buffers_are_unmapped(exec->array.inputs);
-   vbo->draw_prims(ctx, exec->array.inputs, prim, 1, NULL,
+   vbo->draw_prims(ctx, prim, 1, NULL,
                    GL_TRUE, 0, 0, obj);
 }
 
diff --git a/src/mesa/vbo/vbo_exec_draw.c b/src/mesa/vbo/vbo_exec_draw.c
index da5ca695eaf..77db8ec7f3e 100644
--- a/src/mesa/vbo/vbo_exec_draw.c
+++ b/src/mesa/vbo/vbo_exec_draw.c
@@ -257,7 +257,7 @@ vbo_exec_bind_arrays( struct gl_context *ctx )
    }
 
    _mesa_set_varying_vp_inputs( ctx, varying_inputs );
-   ctx->Driver.UpdateState(ctx, _NEW_ARRAY);
+   ctx->NewDriverState |= ctx->DriverFlags.NewArray;
 }
 
 
@@ -407,8 +407,7 @@ vbo_exec_vtx_flush(struct vbo_exec_context *exec, GLboolean keepUnmapped)
 		   exec->vtx.vert_count);
 
 	 vbo_context(ctx)->draw_prims( ctx, 
-				       exec->vtx.inputs, 
-				       exec->vtx.prim, 
+				       exec->vtx.prim,
 				       exec->vtx.prim_count,
 				       NULL,
 				       GL_TRUE,
diff --git a/src/mesa/vbo/vbo_rebase.c b/src/mesa/vbo/vbo_rebase.c
index 597a8f46994..fff9df0c29d 100644
--- a/src/mesa/vbo/vbo_rebase.c
+++ b/src/mesa/vbo/vbo_rebase.c
@@ -129,6 +129,7 @@ void vbo_rebase_prims( struct gl_context *ctx,
 
    struct _mesa_index_buffer tmp_ib;
    struct _mesa_prim *tmp_prims = NULL;
+   const struct gl_client_array **saved_arrays = ctx->Array._DrawArrays;
    void *tmp_indices = NULL;
    GLuint i;
 
@@ -226,15 +227,20 @@ void vbo_rebase_prims( struct gl_context *ctx,
    
    /* Re-issue the draw call.
     */
+   ctx->Array._DrawArrays = tmp_array_pointers;
+   ctx->NewDriverState |= ctx->DriverFlags.NewArray;
+
    draw( ctx, 
-	 tmp_array_pointers, 
-	 prim, 
+	 prim,
 	 nr_prims, 
 	 ib, 
 	 GL_TRUE,
 	 0, 
 	 max_index - min_index,
 	 NULL );
+
+   ctx->Array._DrawArrays = saved_arrays;
+   ctx->NewDriverState |= ctx->DriverFlags.NewArray;
    
    if (tmp_indices)
       free(tmp_indices);
diff --git a/src/mesa/vbo/vbo_save.c b/src/mesa/vbo/vbo_save.c
index 040c9b7a95a..7a97d3c6567 100644
--- a/src/mesa/vbo/vbo_save.c
+++ b/src/mesa/vbo/vbo_save.c
@@ -49,6 +49,9 @@ static void vbo_save_callback_init( struct gl_context *ctx )
 
 
 
+/**
+ * Called at context creation time.
+ */
 void vbo_save_init( struct gl_context *ctx )
 {
    struct vbo_context *vbo = vbo_context(ctx);
diff --git a/src/mesa/vbo/vbo_save_api.c b/src/mesa/vbo/vbo_save_api.c
index 13604333e95..74f5dc9ce68 100644
--- a/src/mesa/vbo/vbo_save_api.c
+++ b/src/mesa/vbo/vbo_save_api.c
@@ -1506,6 +1506,9 @@ vbo_print_vertex_list(struct gl_context *ctx, void *data)
 }
 
 
+/**
+ * Called during context creation/init.
+ */
 static void
 _save_current_init(struct gl_context *ctx)
 {
@@ -1529,7 +1532,7 @@ _save_current_init(struct gl_context *ctx)
 
 
 /**
- * Initialize the display list compiler
+ * Initialize the display list compiler.  Called during context creation.
  */
 void
 vbo_save_api_init(struct vbo_save_context *save)
diff --git a/src/mesa/vbo/vbo_save_draw.c b/src/mesa/vbo/vbo_save_draw.c
index 88a9a7e3462..c6425ab1b79 100644
--- a/src/mesa/vbo/vbo_save_draw.c
+++ b/src/mesa/vbo/vbo_save_draw.c
@@ -213,7 +213,7 @@ static void vbo_bind_vertex_list(struct gl_context *ctx,
    }
 
    _mesa_set_varying_vp_inputs( ctx, varying_inputs );
-   ctx->Driver.UpdateState(ctx, _NEW_ARRAY);
+   ctx->NewDriverState |= ctx->DriverFlags.NewArray;
 }
 
 
@@ -250,7 +250,6 @@ vbo_save_playback_vertex_list(struct gl_context *ctx, void *data)
    const struct vbo_save_vertex_list *node =
       (const struct vbo_save_vertex_list *) data;
    struct vbo_save_context *save = &vbo_context(ctx)->save;
-   struct vbo_exec_context *exec = &vbo_context(ctx)->exec;
    GLboolean remap_vertex_store = GL_FALSE;
 
    if (save->vertex_store->buffer) {
@@ -304,7 +303,7 @@ vbo_save_playback_vertex_list(struct gl_context *ctx, void *data)
 
       vbo_bind_vertex_list( ctx, node );
 
-      vbo_draw_method(exec, DRAW_DISPLAY_LIST);
+      vbo_draw_method(vbo_context(ctx), DRAW_DISPLAY_LIST);
 
       /* Again...
        */
@@ -313,8 +312,7 @@ vbo_save_playback_vertex_list(struct gl_context *ctx, void *data)
 
       if (node->count > 0) {
          vbo_context(ctx)->draw_prims(ctx, 
-                                      save->inputs, 
-                                      node->prim, 
+                                      node->prim,
                                       node->prim_count,
                                       NULL,
                                       GL_TRUE,
diff --git a/src/mesa/vbo/vbo_split_copy.c b/src/mesa/vbo/vbo_split_copy.c
index b53293c3120..528fcfd7f80 100644
--- a/src/mesa/vbo/vbo_split_copy.c
+++ b/src/mesa/vbo/vbo_split_copy.c
@@ -171,6 +171,8 @@ dump_draw_info(struct gl_context *ctx,
 static void
 flush( struct copy_context *copy )
 {
+   struct gl_context *ctx = copy->ctx;
+   const struct gl_client_array **saved_arrays = ctx->Array._DrawArrays;
    GLuint i;
 
    /* Set some counters: 
@@ -189,8 +191,10 @@ flush( struct copy_context *copy )
    (void) dump_draw_info;
 #endif
 
-   copy->draw( copy->ctx,
-	       copy->dstarray_ptr,
+   ctx->Array._DrawArrays = copy->dstarray_ptr;
+   ctx->NewDriverState |= ctx->DriverFlags.NewArray;
+
+   copy->draw( ctx,
 	       copy->dstprim,
 	       copy->dstprim_nr,
 	       &copy->dstib,
@@ -199,6 +203,9 @@ flush( struct copy_context *copy )
 	       copy->dstbuf_nr - 1,
 	       NULL );
 
+   ctx->Array._DrawArrays = saved_arrays;
+   ctx->NewDriverState |= ctx->DriverFlags.NewArray;
+
    /* Reset all pointers: 
     */
    copy->dstprim_nr = 0;
diff --git a/src/mesa/vbo/vbo_split_inplace.c b/src/mesa/vbo/vbo_split_inplace.c
index 9e596f66891..00464049ddd 100644
--- a/src/mesa/vbo/vbo_split_inplace.c
+++ b/src/mesa/vbo/vbo_split_inplace.c
@@ -62,6 +62,8 @@ struct split_context {
 
 static void flush_vertex( struct split_context *split )
 {
+   struct gl_context *ctx = split->ctx;
+   const struct gl_client_array **saved_arrays = ctx->Array._DrawArrays;
    struct _mesa_index_buffer ib;
    GLuint i;
 
@@ -82,8 +84,10 @@ static void flush_vertex( struct split_context *split )
 
    assert(split->max_index >= split->min_index);
 
-   split->draw(split->ctx,
-	       split->array,
+   ctx->Array._DrawArrays = split->array;
+   ctx->NewDriverState |= ctx->DriverFlags.NewArray;
+
+   split->draw(ctx,
 	       split->dstprim,
 	       split->dstprim_nr,
 	       split->ib ? &ib : NULL,
@@ -92,6 +96,9 @@ static void flush_vertex( struct split_context *split )
 	       split->max_index,
 	       NULL);
 
+   ctx->Array._DrawArrays = saved_arrays;
+   ctx->NewDriverState |= ctx->DriverFlags.NewArray;
+
    split->dstprim_nr = 0;
    split->min_index = ~0;
    split->max_index = 0;