gallium/tests/trivial: Import compute unit tests.
authorFrancisco Jerez <currojerez@riseup.net>
Tue, 20 Mar 2012 22:41:09 +0000 (23:41 +0100)
committerFrancisco Jerez <currojerez@riseup.net>
Fri, 11 May 2012 10:39:44 +0000 (12:39 +0200)
Add a test program that tries to exercise some of the language
features commonly used by compute programs at the Gallium API level:

   - Correctness of the values returned by the grid parameters.
   - Proper functioning of resource LOADs and STOREs.
   - Subroutine calls.
   - Argument passing to the compute parameter through the INPUT
     memory space.
   - Mapping of buffer objects to the GLOBAL memory space.
   - Proper functioning of the PRIVATE and LOCAL memory spaces.
   - Texture sampling and constant buffers.
   - Support for multiple kernels in the same program.
   - Indirect resource indexing.
   - Formatted resource loads and stores (i.e. with channel conversion
     and scaling) using several different formats.
   - Proper functioning of work-group barriers.
   - Atomicity and semantics of the atomic opcodes.

As of now all of them seem to pass on my nvA8.

src/gallium/tests/trivial/Makefile
src/gallium/tests/trivial/compute.c [new file with mode: 0644]

index bfe186b1607ac94d4e0d1f091afb03a503b5dbe1..8c0320165388fa5aa7cecb154df9539a2c0fc774 100644 (file)
@@ -18,7 +18,8 @@ LINKS += \
 
 SOURCES = \
        tri.c \
-       quad-tex.c
+       quad-tex.c \
+       compute.c
 
 OBJECTS = $(SOURCES:.c=.o)
 
diff --git a/src/gallium/tests/trivial/compute.c b/src/gallium/tests/trivial/compute.c
new file mode 100644 (file)
index 0000000..1812090
--- /dev/null
@@ -0,0 +1,1592 @@
+/*
+ * Copyright (C) 2011 Francisco Jerez.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <sys/stat.h>
+#include <inttypes.h>
+#include "pipe/p_state.h"
+#include "pipe/p_context.h"
+#include "pipe/p_screen.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_shader_tokens.h"
+#include "util/u_memory.h"
+#include "util/u_inlines.h"
+#include "util/u_sampler.h"
+#include "util/u_format.h"
+#include "tgsi/tgsi_text.h"
+#include "pipe-loader/pipe_loader.h"
+
+#define MAX_RESOURCES 4
+
+struct context {
+        struct pipe_loader_device *dev;
+        struct pipe_screen *screen;
+        struct pipe_context *pipe;
+        void *hwcs;
+        void *hwsmp[MAX_RESOURCES];
+        struct pipe_resource *tex[MAX_RESOURCES];
+        bool tex_rw[MAX_RESOURCES];
+        struct pipe_sampler_view *view[MAX_RESOURCES];
+        struct pipe_surface *surf[MAX_RESOURCES];
+};
+
+#define DUMP_COMPUTE_PARAM(p, c) do {                                   \
+                uint64_t __v[4];                                        \
+                int __i, __n;                                           \
+                                                                        \
+                __n = ctx->screen->get_compute_param(ctx->screen, c, __v); \
+                printf("%s: {", #c);                                    \
+                                                                        \
+                for (__i = 0; __i < __n / sizeof(*__v); ++__i)          \
+                        printf(" %"PRIu64, __v[__i]);                   \
+                                                                        \
+                printf(" }\n");                                         \
+        } while (0)
+
+static void init_ctx(struct context *ctx)
+{
+        int ret;
+
+        ret = pipe_loader_probe(&ctx->dev, 1);
+        assert(ret);
+
+        ctx->screen = pipe_loader_create_screen(ctx->dev, PIPE_SEARCH_DIR);
+        assert(ctx->screen);
+
+        ctx->pipe = ctx->screen->context_create(ctx->screen, NULL);
+        assert(ctx->pipe);
+
+        DUMP_COMPUTE_PARAM(p, PIPE_COMPUTE_CAP_GRID_DIMENSION);
+        DUMP_COMPUTE_PARAM(p, PIPE_COMPUTE_CAP_MAX_GRID_SIZE);
+        DUMP_COMPUTE_PARAM(p, PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE);
+}
+
+static void destroy_ctx(struct context *ctx)
+{
+        ctx->pipe->destroy(ctx->pipe);
+        ctx->screen->destroy(ctx->screen);
+        pipe_loader_release(&ctx->dev, 1);
+        FREE(ctx);
+}
+
+static char *
+preprocess_prog(struct context *ctx, const char *src, const char *defs)
+{
+        const char header[] =
+                "#define RGLOBAL        RES[32767]\n"
+                "#define RLOCAL         RES[32766]\n"
+                "#define RPRIVATE       RES[32765]\n"
+                "#define RINPUT         RES[32764]\n";
+        char cmd[512];
+        char tmp[] = "/tmp/test-compute.tgsi-XXXXXX";
+        char *buf;
+        int fd, ret;
+        struct stat st;
+        FILE *p;
+
+        /* Open a temporary file */
+        fd = mkstemp(tmp);
+        assert(fd >= 0);
+        snprintf(cmd, sizeof(cmd), "cpp -P -nostdinc -undef %s > %s",
+                 defs ? defs : "", tmp);
+
+        /* Preprocess */
+        p = popen(cmd, "w");
+        fwrite(header, strlen(header), 1, p);
+        fwrite(src, strlen(src), 1, p);
+        ret = pclose(p);
+        assert(!ret);
+
+        /* Read back */
+        ret = fstat(fd, &st);
+        assert(!ret);
+
+        buf = malloc(st.st_size + 1);
+        ret = read(fd, buf, st.st_size);
+        assert(ret == st.st_size);
+        buf[ret] = 0;
+
+        /* Clean up */
+        close(fd);
+        unlink(tmp);
+
+        return buf;
+}
+
+static void init_prog(struct context *ctx, unsigned local_sz,
+                      unsigned private_sz, unsigned input_sz,
+                      const char *src, const char *defs)
+{
+        struct pipe_context *pipe = ctx->pipe;
+        struct tgsi_token prog[1024];
+        struct pipe_compute_state cs = {
+                .prog = prog,
+                .req_local_mem = local_sz,
+                .req_private_mem = private_sz,
+                .req_input_mem = input_sz
+        };
+        char *psrc = preprocess_prog(ctx, src, defs);
+        int ret;
+
+        ret = tgsi_text_translate(psrc, prog, Elements(prog));
+        assert(ret);
+        free(psrc);
+
+        ctx->hwcs = pipe->create_compute_state(pipe, &cs);
+        assert(ctx->hwcs);
+
+        pipe->bind_compute_state(pipe, ctx->hwcs);
+}
+
+static void destroy_prog(struct context *ctx)
+{
+        struct pipe_context *pipe = ctx->pipe;
+
+        pipe->delete_compute_state(pipe, ctx->hwcs);
+        ctx->hwcs = NULL;
+}
+
+static void init_tex(struct context *ctx, int slot,
+                     enum pipe_texture_target target, bool rw,
+                     enum pipe_format format, int w, int h,
+                     void (*init)(void *, int, int, int))
+{
+        struct pipe_context *pipe = ctx->pipe;
+        struct pipe_resource **tex = &ctx->tex[slot];
+        struct pipe_resource ttex = {
+                .target = target,
+                .format = format,
+                .width0 = w,
+                .height0 = h,
+                .depth0 = 1,
+                .array_size = 1,
+                .bind = (PIPE_BIND_SAMPLER_VIEW |
+                         PIPE_BIND_COMPUTE_RESOURCE |
+                         PIPE_BIND_GLOBAL)
+        };
+        int dx = util_format_get_blocksize(format);
+        int dy = util_format_get_stride(format, w);
+        int nx = (target == PIPE_BUFFER ? (w / dx) :
+                  util_format_get_nblocksx(format, w));
+        int ny = (target == PIPE_BUFFER ? 1 :
+                  util_format_get_nblocksy(format, h));
+        struct pipe_transfer *xfer;
+        char *map;
+        int x, y;
+
+        *tex = ctx->screen->resource_create(ctx->screen, &ttex);
+        assert(*tex);
+
+        xfer = pipe->get_transfer(pipe, *tex, 0, PIPE_TRANSFER_WRITE,
+                                  &(struct pipe_box) { .width = w,
+                                                  .height = h,
+                                                  .depth = 1 });
+        assert(xfer);
+
+        map = pipe->transfer_map(pipe, xfer);
+        assert(map);
+
+        for (y = 0; y < ny; ++y) {
+                for (x = 0; x < nx; ++x) {
+                        init(map + y * dy + x * dx, slot, x, y);
+                }
+        }
+
+        pipe->transfer_unmap(pipe, xfer);
+        pipe->transfer_destroy(pipe, xfer);
+
+        ctx->tex_rw[slot] = rw;
+}
+
+static bool default_check(void *x, void *y, int sz) {
+        return !memcmp(x, y, sz);
+}
+
+static void check_tex(struct context *ctx, int slot,
+                      void (*expect)(void *, int, int, int),
+                      bool (*check)(void *, void *, int))
+{
+        struct pipe_context *pipe = ctx->pipe;
+        struct pipe_resource *tex = ctx->tex[slot];
+        int dx = util_format_get_blocksize(tex->format);
+        int dy = util_format_get_stride(tex->format, tex->width0);
+        int nx = (tex->target == PIPE_BUFFER ? (tex->width0 / dx) :
+                  util_format_get_nblocksx(tex->format, tex->width0));
+        int ny = (tex->target == PIPE_BUFFER ? 1 :
+                  util_format_get_nblocksy(tex->format, tex->height0));
+        struct pipe_transfer *xfer;
+        char *map;
+        int x, y, i;
+        int err = 0;
+
+        if (!check)
+                check = default_check;
+
+        xfer = pipe->get_transfer(pipe, tex, 0, PIPE_TRANSFER_READ,
+                                  &(struct pipe_box) { .width = tex->width0,
+                                        .height = tex->height0,
+                                        .depth = 1 });
+        assert(xfer);
+
+        map = pipe->transfer_map(pipe, xfer);
+        assert(map);
+
+        for (y = 0; y < ny; ++y) {
+                for (x = 0; x < nx; ++x) {
+                        uint32_t exp[4];
+                        uint32_t *res = (uint32_t *)(map + y * dy + x * dx);
+
+                        expect(exp, slot, x, y);
+                        if (check(res, exp, dx) || (++err) > 20)
+                                continue;
+
+                        if (dx < 4) {
+                                uint32_t u = 0, v = 0;
+
+                                for (i = 0; i < dx; i++) {
+                                        u |= ((uint8_t *)exp)[i] << (8 * i);
+                                        v |= ((uint8_t *)res)[i] << (8 * i);
+                                }
+                                printf("(%d, %d): got 0x%x, expected 0x%x\n",
+                                       x, y, v, u);
+                        } else {
+                                for (i = 0; i < dx / 4; i++) {
+                                        printf("(%d, %d)[%d]: got 0x%x/%f,"
+                                               " expected 0x%x/%f\n", x, y, i,
+                                               res[i], ((float *)res)[i],
+                                               exp[i], ((float *)exp)[i]);
+                                }
+                        }
+                }
+        }
+
+        pipe->transfer_unmap(pipe, xfer);
+        pipe->transfer_destroy(pipe, xfer);
+
+        if (err)
+                printf("(%d, %d): \x1b[31mFAIL\x1b[0m (%d)\n", x, y, err);
+        else
+                printf("(%d, %d): \x1b[32mOK\x1b[0m\n", x, y);
+}
+
+static void destroy_tex(struct context *ctx)
+{
+        int i;
+
+        for (i = 0; i < MAX_RESOURCES; ++i) {
+                if (ctx->tex[i])
+                        pipe_resource_reference(&ctx->tex[i], NULL);
+        }
+}
+
+static void init_sampler_views(struct context *ctx, const int *slots)
+{
+        struct pipe_context *pipe = ctx->pipe;
+        struct pipe_sampler_view tview;
+        int i;
+
+        for (i = 0; *slots >= 0; ++i, ++slots) {
+                u_sampler_view_default_template(&tview, ctx->tex[*slots],
+                                                ctx->tex[*slots]->format);
+
+                ctx->view[i] = pipe->create_sampler_view(pipe, ctx->tex[*slots],
+                                                         &tview);
+                assert(ctx->view[i]);
+        }
+
+        pipe->set_compute_sampler_views(pipe, 0, i, ctx->view);
+}
+
+static void destroy_sampler_views(struct context *ctx)
+{
+        struct pipe_context *pipe = ctx->pipe;
+        int i;
+
+        pipe->set_compute_sampler_views(pipe, 0, MAX_RESOURCES, NULL);
+
+        for (i = 0; i < MAX_RESOURCES; ++i) {
+                if (ctx->view[i]) {
+                        pipe->sampler_view_destroy(pipe, ctx->view[i]);
+                        ctx->view[i] = NULL;
+                }
+        }
+}
+
+static void init_compute_resources(struct context *ctx, const int *slots)
+{
+        struct pipe_context *pipe = ctx->pipe;
+        int i;
+
+        for (i = 0; *slots >= 0; ++i, ++slots) {
+                struct pipe_surface tsurf = {
+                        .format = ctx->tex[*slots]->format,
+                        .usage = ctx->tex[*slots]->bind,
+                        .writable = ctx->tex_rw[*slots]
+                };
+
+                if (ctx->tex[*slots]->target == PIPE_BUFFER)
+                        tsurf.u.buf.last_element = ctx->tex[*slots]->width0 - 1;
+
+                ctx->surf[i] = pipe->create_surface(pipe, ctx->tex[*slots],
+                                                    &tsurf);
+                assert(ctx->surf[i]);
+        }
+
+        pipe->set_compute_resources(pipe, 0, i, ctx->surf);
+}
+
+static void destroy_compute_resources(struct context *ctx)
+{
+        struct pipe_context *pipe = ctx->pipe;
+        int i;
+
+        pipe->set_compute_resources(pipe, 0, MAX_RESOURCES, NULL);
+
+        for (i = 0; i < MAX_RESOURCES; ++i) {
+                if (ctx->surf[i]) {
+                        pipe->surface_destroy(pipe, ctx->surf[i]);
+                        ctx->surf[i] = NULL;
+                }
+        }
+}
+
+static void init_sampler_states(struct context *ctx, int n)
+{
+        struct pipe_context *pipe = ctx->pipe;
+        struct pipe_sampler_state smp = {
+                .normalized_coords = 1,
+        };
+        int i;
+
+        for (i = 0; i < n; ++i) {
+                ctx->hwsmp[i] = pipe->create_sampler_state(pipe, &smp);
+                assert(ctx->hwsmp[i]);
+        }
+
+        pipe->bind_compute_sampler_states(pipe, 0, i, ctx->hwsmp);
+}
+
+static void destroy_sampler_states(struct context *ctx)
+{
+        struct pipe_context *pipe = ctx->pipe;
+        int i;
+
+        pipe->bind_compute_sampler_states(pipe, 0, MAX_RESOURCES, NULL);
+
+        for (i = 0; i < MAX_RESOURCES; ++i) {
+                if (ctx->hwsmp[i]) {
+                        pipe->delete_sampler_state(pipe, ctx->hwsmp[i]);
+                        ctx->hwsmp[i] = NULL;
+                }
+        }
+}
+
+static void init_globals(struct context *ctx, const int *slots,
+                         uint32_t **handles)
+{
+        struct pipe_context *pipe = ctx->pipe;
+        struct pipe_resource *res[MAX_RESOURCES];
+        int i;
+
+        for (i = 0; *slots >= 0; ++i, ++slots)
+                res[i] = ctx->tex[*slots];
+
+        pipe->set_global_binding(pipe, 0, i, res, handles);
+}
+
+static void destroy_globals(struct context *ctx)
+{
+        struct pipe_context *pipe = ctx->pipe;
+
+        pipe->set_global_binding(pipe, 0, MAX_RESOURCES, NULL, NULL);
+}
+
+static void launch_grid(struct context *ctx, const uint *block_layout,
+                        const uint *grid_layout, uint32_t pc,
+                        const void *input)
+{
+        struct pipe_context *pipe = ctx->pipe;
+
+        pipe->launch_grid(pipe, block_layout, grid_layout, pc, input);
+}
+
+static void test_system_values(struct context *ctx)
+{
+        const char *src = "COMP\n"
+                "DCL RES[0], BUFFER, RAW, WR\n"
+                "DCL SV[0], BLOCK_ID[0]\n"
+                "DCL SV[1], BLOCK_SIZE[0]\n"
+                "DCL SV[2], GRID_SIZE[0]\n"
+                "DCL SV[3], THREAD_ID[0]\n"
+                "DCL TEMP[0], LOCAL\n"
+                "DCL TEMP[1], LOCAL\n"
+                "IMM UINT32 { 64, 0, 0, 0 }\n"
+                "IMM UINT32 { 16, 0, 0, 0 }\n"
+                "IMM UINT32 { 0, 0, 0, 0 }\n"
+                "\n"
+                "BGNSUB"
+                "  UMUL TEMP[0], SV[0], SV[1]\n"
+                "  UADD TEMP[0], TEMP[0], SV[3]\n"
+                "  UMUL TEMP[1], SV[1], SV[2]\n"
+                "  UMUL TEMP[0].w, TEMP[0], TEMP[1].zzzz\n"
+                "  UMUL TEMP[0].zw, TEMP[0], TEMP[1].yyyy\n"
+                "  UMUL TEMP[0].yzw, TEMP[0], TEMP[1].xxxx\n"
+                "  UADD TEMP[0].xy, TEMP[0].xyxy, TEMP[0].zwzw\n"
+                "  UADD TEMP[0].x, TEMP[0].xxxx, TEMP[0].yyyy\n"
+                "  UMUL TEMP[0].x, TEMP[0], IMM[0]\n"
+                "  STORE RES[0].xyzw, TEMP[0], SV[0]\n"
+                "  UADD TEMP[0].x, TEMP[0], IMM[1]\n"
+                "  STORE RES[0].xyzw, TEMP[0], SV[1]\n"
+                "  UADD TEMP[0].x, TEMP[0], IMM[1]\n"
+                "  STORE RES[0].xyzw, TEMP[0], SV[2]\n"
+                "  UADD TEMP[0].x, TEMP[0], IMM[1]\n"
+                "  STORE RES[0].xyzw, TEMP[0], SV[3]\n"
+                "  RET\n"
+                "ENDSUB\n";
+        void init(void *p, int s, int x, int y) {
+                *(uint32_t *)p = 0xdeadbeef;
+        }
+        void expect(void *p, int s, int x, int y) {
+                int id = x / 16, sv = (x % 16) / 4, c = x % 4;
+                int tid[] = { id % 20, (id % 240) / 20, id / 240, 0 };
+                int bsz[] = { 4, 3, 5, 1};
+                int gsz[] = { 5, 4, 1, 1};
+
+                switch (sv) {
+                case 0:
+                        *(uint32_t *)p = tid[c] / bsz[c];
+                        break;
+                case 1:
+                        *(uint32_t *)p = bsz[c];
+                        break;
+                case 2:
+                        *(uint32_t *)p = gsz[c];
+                        break;
+                case 3:
+                        *(uint32_t *)p = tid[c] % bsz[c];
+                        break;
+                }
+        }
+
+        printf("- %s\n", __func__);
+
+        init_prog(ctx, 0, 0, 0, src, NULL);
+        init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT,
+                 76800, 0, init);
+        init_compute_resources(ctx, (int []) { 0, -1 });
+        launch_grid(ctx, (uint []){4, 3, 5}, (uint []){5, 4, 1}, 0, NULL);
+        check_tex(ctx, 0, expect, NULL);
+        destroy_compute_resources(ctx);
+        destroy_tex(ctx);
+        destroy_prog(ctx);
+}
+
+static void test_resource_access(struct context *ctx)
+{
+        const char *src = "COMP\n"
+                "DCL RES[0], BUFFER, RAW, WR\n"
+                "DCL RES[1], 2D, RAW, WR\n"
+                "DCL SV[0], BLOCK_ID[0]\n"
+                "DCL TEMP[0], LOCAL\n"
+                "DCL TEMP[1], LOCAL\n"
+                "IMM UINT32 { 15, 0, 0, 0 }\n"
+                "IMM UINT32 { 16, 1, 0, 0 }\n"
+                "\n"
+                "    BGNSUB\n"
+                "       UADD TEMP[0].x, SV[0].xxxx, SV[0].yyyy\n"
+                "       AND TEMP[0].x, TEMP[0], IMM[0]\n"
+                "       UMUL TEMP[0].x, TEMP[0], IMM[1]\n"
+                "       LOAD TEMP[0].xyzw, RES[0], TEMP[0]\n"
+                "       UMUL TEMP[1], SV[0], IMM[1]\n"
+                "       STORE RES[1].xyzw, TEMP[1], TEMP[0]\n"
+                "       RET\n"
+                "    ENDSUB\n";
+        void init0(void *p, int s, int x, int y) {
+                *(float *)p = 8.0 - (float)x;
+        }
+        void init1(void *p, int s, int x, int y) {
+                *(uint32_t *)p = 0xdeadbeef;
+        }
+        void expect(void *p, int s, int x, int y) {
+                *(float *)p = 8.0 - (float)((x + 4*y) & 0x3f);
+        }
+
+        printf("- %s\n", __func__);
+
+        init_prog(ctx, 0, 0, 0, src, NULL);
+        init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT,
+                 256, 0, init0);
+        init_tex(ctx, 1, PIPE_TEXTURE_2D, true, PIPE_FORMAT_R32_FLOAT,
+                 60, 12, init1);
+        init_compute_resources(ctx, (int []) { 0, 1, -1 });
+        launch_grid(ctx, (uint []){1, 1, 1}, (uint []){15, 12, 1}, 0, NULL);
+        check_tex(ctx, 1, expect, NULL);
+        destroy_compute_resources(ctx);
+        destroy_tex(ctx);
+        destroy_prog(ctx);
+}
+
+static void test_function_calls(struct context *ctx)
+{
+        const char *src = "COMP\n"
+                "DCL RES[0], 2D, RAW, WR\n"
+                "DCL SV[0], BLOCK_ID[0]\n"
+                "DCL SV[1], BLOCK_SIZE[0]\n"
+                "DCL SV[2], GRID_SIZE[0]\n"
+                "DCL SV[3], THREAD_ID[0]\n"
+                "DCL TEMP[0]\n"
+                "DCL TEMP[1]\n"
+                "DCL TEMP[2], LOCAL\n"
+                "IMM UINT32 { 0, 11, 22, 33 }\n"
+                "IMM FLT32 { 11, 33, 55, 99 }\n"
+                "IMM UINT32 { 4, 1, 0, 0 }\n"
+                "IMM UINT32 { 12, 0, 0, 0 }\n"
+                "\n"
+                "00: BGNSUB\n"
+                "01:  UMUL TEMP[0].x, TEMP[0], TEMP[0]\n"
+                "02:  UADD TEMP[1].x, TEMP[1], IMM[2].yyyy\n"
+                "03:  USLT TEMP[0].x, TEMP[0], IMM[0]\n"
+                "04:  RET\n"
+                "05: ENDSUB\n"
+                "06: BGNSUB\n"
+                "07:  UMUL TEMP[0].x, TEMP[0], TEMP[0]\n"
+                "08:  UADD TEMP[1].x, TEMP[1], IMM[2].yyyy\n"
+                "09:  USLT TEMP[0].x, TEMP[0], IMM[0].yyyy\n"
+                "10:  IF TEMP[0].xxxx\n"
+                "11:   CAL :0\n"
+                "12:  ENDIF\n"
+                "13:  RET\n"
+                "14: ENDSUB\n"
+                "15: BGNSUB\n"
+                "16:  UMUL TEMP[2], SV[0], SV[1]\n"
+                "17:  UADD TEMP[2], TEMP[2], SV[3]\n"
+                "18:  UMUL TEMP[2], TEMP[2], IMM[2]\n"
+                "00:  MOV TEMP[1].x, IMM[2].wwww\n"
+                "19:  LOAD TEMP[0].x, RES[0].xxxx, TEMP[2]\n"
+                "20:  CAL :6\n"
+                "21:  STORE RES[0].x, TEMP[2], TEMP[1].xxxx\n"
+                "22:  RET\n"
+                "23: ENDSUB\n";
+        void init(void *p, int s, int x, int y) {
+                *(uint32_t *)p = 15 * y + x;
+        }
+        void expect(void *p, int s, int x, int y) {
+                *(uint32_t *)p = (15 * y + x) < 4 ? 2 : 1 ;
+        }
+
+        printf("- %s\n", __func__);
+
+        init_prog(ctx, 0, 0, 0, src, NULL);
+        init_tex(ctx, 0, PIPE_TEXTURE_2D, true, PIPE_FORMAT_R32_FLOAT,
+                 15, 12, init);
+        init_compute_resources(ctx, (int []) { 0, -1 });
+        launch_grid(ctx, (uint []){3, 3, 3}, (uint []){5, 4, 1}, 15, NULL);
+        check_tex(ctx, 0, expect, NULL);
+        destroy_compute_resources(ctx);
+        destroy_tex(ctx);
+        destroy_prog(ctx);
+}
+
+static void test_input_global(struct context *ctx)
+{
+        const char *src = "COMP\n"
+                "DCL SV[0], THREAD_ID[0]\n"
+                "DCL TEMP[0], LOCAL\n"
+                "DCL TEMP[1], LOCAL\n"
+                "IMM UINT32 { 8, 0, 0, 0 }\n"
+                "\n"
+                "    BGNSUB\n"
+                "       UMUL TEMP[0], SV[0], IMM[0]\n"
+                "       LOAD TEMP[1].xy, RINPUT, TEMP[0]\n"
+                "       LOAD TEMP[0].x, RGLOBAL, TEMP[1].yyyy\n"
+                "       UADD TEMP[1].x, TEMP[0], -TEMP[1]\n"
+                "       STORE RGLOBAL.x, TEMP[1].yyyy, TEMP[1]\n"
+                "       RET\n"
+                "    ENDSUB\n";
+        void init(void *p, int s, int x, int y) {
+                *(uint32_t *)p = 0xdeadbeef;
+        }
+        void expect(void *p, int s, int x, int y) {
+                *(uint32_t *)p = 0xdeadbeef - (x == 0 ? 0x10001 + 2 * s : 0);
+        }
+        uint32_t input[8] = { 0x10001, 0x10002, 0x10003, 0x10004,
+                              0x10005, 0x10006, 0x10007, 0x10008 };
+
+        printf("- %s\n", __func__);
+
+        init_prog(ctx, 0, 0, 32, src, NULL);
+        init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT, 32, 0, init);
+        init_tex(ctx, 1, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT, 32, 0, init);
+        init_tex(ctx, 2, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT, 32, 0, init);
+        init_tex(ctx, 3, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT, 32, 0, init);
+        init_globals(ctx, (int []){ 0, 1, 2, 3, -1 },
+                     (uint32_t *[]){ &input[1], &input[3],
+                                     &input[5], &input[7] });
+        launch_grid(ctx, (uint []){4, 1, 1}, (uint []){1, 1, 1}, 0, input);
+        check_tex(ctx, 0, expect, NULL);
+        check_tex(ctx, 1, expect, NULL);
+        check_tex(ctx, 2, expect, NULL);
+        check_tex(ctx, 3, expect, NULL);
+        destroy_globals(ctx);
+        destroy_tex(ctx);
+        destroy_prog(ctx);
+}
+
+static void test_private(struct context *ctx)
+{
+        const char *src = "COMP\n"
+                "DCL RES[0], BUFFER, RAW, WR\n"
+                "DCL SV[0], BLOCK_ID[0]\n"
+                "DCL SV[1], BLOCK_SIZE[0]\n"
+                "DCL SV[2], THREAD_ID[0]\n"
+                "DCL TEMP[0], LOCAL\n"
+                "DCL TEMP[1], LOCAL\n"
+                "DCL TEMP[2], LOCAL\n"
+                "IMM UINT32 { 128, 0, 0, 0 }\n"
+                "IMM UINT32 { 4, 0, 0, 0 }\n"
+                "\n"
+                "    BGNSUB\n"
+                "       UMUL TEMP[0].x, SV[0], SV[1]\n"
+                "       UADD TEMP[0].x, TEMP[0], SV[2]\n"
+                "       MOV TEMP[1].x, IMM[0].wwww\n"
+                "       BGNLOOP\n"
+                "               USEQ TEMP[2].x, TEMP[1], IMM[0]\n"
+                "               IF TEMP[2]\n"
+                "                       BRK\n"
+                "               ENDIF\n"
+                "               UDIV TEMP[2].x, TEMP[1], IMM[1]\n"
+                "               UADD TEMP[2].x, TEMP[2], TEMP[0]\n"
+                "               STORE RPRIVATE.x, TEMP[1], TEMP[2]\n"
+                "               UADD TEMP[1].x, TEMP[1], IMM[1]\n"
+                "       ENDLOOP\n"
+                "       MOV TEMP[1].x, IMM[0].wwww\n"
+                "       UMUL TEMP[0].x, TEMP[0], IMM[0]\n"
+                "       BGNLOOP\n"
+                "               USEQ TEMP[2].x, TEMP[1], IMM[0]\n"
+                "               IF TEMP[2]\n"
+                "                       BRK\n"
+                "               ENDIF\n"
+                "               LOAD TEMP[2].x, RPRIVATE, TEMP[1]\n"
+                "               STORE RES[0].x, TEMP[0], TEMP[2]\n"
+                "               UADD TEMP[0].x, TEMP[0], IMM[1]\n"
+                "               UADD TEMP[1].x, TEMP[1], IMM[1]\n"
+                "       ENDLOOP\n"
+                "       RET\n"
+                "    ENDSUB\n";
+        void init(void *p, int s, int x, int y) {
+                *(uint32_t *)p = 0xdeadbeef;
+        }
+        void expect(void *p, int s, int x, int y) {
+                *(uint32_t *)p = (x / 32) + x % 32;
+        }
+
+        printf("- %s\n", __func__);
+
+        init_prog(ctx, 0, 128, 0, src, NULL);
+        init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT,
+                 32768, 0, init);
+        init_compute_resources(ctx, (int []) { 0, -1 });
+        launch_grid(ctx, (uint []){16, 1, 1}, (uint []){16, 1, 1}, 0, NULL);
+        check_tex(ctx, 0, expect, NULL);
+        destroy_compute_resources(ctx);
+        destroy_tex(ctx);
+        destroy_prog(ctx);
+}
+
+static void test_local(struct context *ctx)
+{
+        const char *src = "COMP\n"
+                "DCL RES[0], BUFFER, RAW, WR\n"
+                "DCL SV[0], BLOCK_ID[0]\n"
+                "DCL SV[1], BLOCK_SIZE[0]\n"
+                "DCL SV[2], THREAD_ID[0]\n"
+                "DCL TEMP[0], LOCAL\n"
+                "DCL TEMP[1], LOCAL\n"
+                "DCL TEMP[2], LOCAL\n"
+                "IMM UINT32 { 1, 0, 0, 0 }\n"
+                "IMM UINT32 { 2, 0, 0, 0 }\n"
+                "IMM UINT32 { 4, 0, 0, 0 }\n"
+                "IMM UINT32 { 32, 0, 0, 0 }\n"
+                "IMM UINT32 { 128, 0, 0, 0 }\n"
+                "\n"
+                "    BGNSUB\n"
+                "       UMUL TEMP[0].x, SV[2], IMM[2]\n"
+                "       STORE RLOCAL.x, TEMP[0], IMM[0].wwww\n"
+                "       MFENCE RLOCAL\n"
+                "       USLT TEMP[1].x, SV[2], IMM[3]\n"
+                "       IF TEMP[1]\n"
+                "               UADD TEMP[1].x, TEMP[0], IMM[4]\n"
+                "               BGNLOOP\n"
+                "                       LOAD TEMP[2].x, RLOCAL, TEMP[1]\n"
+                "                       USEQ TEMP[2].x, TEMP[2], IMM[0]\n"
+                "                       IF TEMP[2]\n"
+                "                               BRK\n"
+                "                       ENDIF\n"
+                "               ENDLOOP\n"
+                "               STORE RLOCAL.x, TEMP[0], IMM[0]\n"
+                "               MFENCE RLOCAL\n"
+                "               BGNLOOP\n"
+                "                       LOAD TEMP[2].x, RLOCAL, TEMP[1]\n"
+                "                       USEQ TEMP[2].x, TEMP[2], IMM[1]\n"
+                "                       IF TEMP[2]\n"
+                "                               BRK\n"
+                "                       ENDIF\n"
+                "               ENDLOOP\n"
+                "       ELSE\n"
+                "               UADD TEMP[1].x, TEMP[0], -IMM[4]\n"
+                "               BGNLOOP\n"
+                "                       LOAD TEMP[2].x, RLOCAL, TEMP[1]\n"
+                "                       USEQ TEMP[2].x, TEMP[2], IMM[0].wwww\n"
+                "                       IF TEMP[2]\n"
+                "                               BRK\n"
+                "                       ENDIF\n"
+                "               ENDLOOP\n"
+                "               STORE RLOCAL.x, TEMP[0], IMM[0]\n"
+                "               MFENCE RLOCAL\n"
+                "               BGNLOOP\n"
+                "                       LOAD TEMP[2].x, RLOCAL, TEMP[1]\n"
+                "                       USEQ TEMP[2].x, TEMP[2], IMM[0]\n"
+                "                       IF TEMP[2]\n"
+                "                               BRK\n"
+                "                       ENDIF\n"
+                "               ENDLOOP\n"
+                "               STORE RLOCAL.x, TEMP[0], IMM[1]\n"
+                "               MFENCE RLOCAL\n"
+                "       ENDIF\n"
+                "       UMUL TEMP[1].x, SV[0], SV[1]\n"
+                "       UMUL TEMP[1].x, TEMP[1], IMM[2]\n"
+                "       UADD TEMP[1].x, TEMP[1], TEMP[0]\n"
+                "       LOAD TEMP[0].x, RLOCAL, TEMP[0]\n"
+                "       STORE RES[0].x, TEMP[1], TEMP[0]\n"
+                "       RET\n"
+                "    ENDSUB\n";
+        void init(void *p, int s, int x, int y) {
+                *(uint32_t *)p = 0xdeadbeef;
+        }
+        void expect(void *p, int s, int x, int y) {
+                *(uint32_t *)p = x & 0x20 ? 2 : 1;
+        }
+
+        printf("- %s\n", __func__);
+
+        init_prog(ctx, 256, 0, 0, src, NULL);
+        init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT,
+                 4096, 0, init);
+        init_compute_resources(ctx, (int []) { 0, -1 });
+        launch_grid(ctx, (uint []){64, 1, 1}, (uint []){16, 1, 1}, 0, NULL);
+        check_tex(ctx, 0, expect, NULL);
+        destroy_compute_resources(ctx);
+        destroy_tex(ctx);
+        destroy_prog(ctx);
+}
+
+static void test_sample(struct context *ctx)
+{
+        const char *src = "COMP\n"
+                "DCL SVIEW[0], 2D, FLOAT\n"
+                "DCL RES[0], 2D, RAW, WR\n"
+                "DCL SAMP[0]\n"
+                "DCL SV[0], BLOCK_ID[0]\n"
+                "DCL TEMP[0], LOCAL\n"
+                "DCL TEMP[1], LOCAL\n"
+                "IMM UINT32 { 16, 1, 0, 0 }\n"
+                "IMM FLT32 { 128, 32, 0, 0 }\n"
+                "\n"
+                "    BGNSUB\n"
+                "       I2F TEMP[1], SV[0]\n"
+                "       DIV TEMP[1], TEMP[1], IMM[1]\n"
+                "       SAMPLE TEMP[1], TEMP[1], SVIEW[0], SAMP[0]\n"
+                "       UMUL TEMP[0], SV[0], IMM[0]\n"
+                "       STORE RES[0].xyzw, TEMP[0], TEMP[1]\n"
+                "       RET\n"
+                "    ENDSUB\n";
+        void init(void *p, int s, int x, int y) {
+                *(float *)p = s ? 1 : x * y;
+        }
+        void expect(void *p, int s, int x, int y) {
+                switch (x % 4) {
+                case 0:
+                        *(float *)p = x / 4 * y;
+                        break;
+                case 1:
+                case 2:
+                        *(float *)p = 0;
+                        break;
+                case 3:
+                        *(float *)p = 1;
+                        break;
+                }
+        }
+
+        printf("- %s\n", __func__);
+
+        init_prog(ctx, 0, 0, 0, src, NULL);
+        init_tex(ctx, 0, PIPE_TEXTURE_2D, true, PIPE_FORMAT_R32_FLOAT,
+                 128, 32, init);
+        init_tex(ctx, 1, PIPE_TEXTURE_2D, true, PIPE_FORMAT_R32_FLOAT,
+                 512, 32, init);
+        init_compute_resources(ctx, (int []) { 1, -1 });
+        init_sampler_views(ctx, (int []) { 0, -1 });
+        init_sampler_states(ctx, 2);
+        launch_grid(ctx, (uint []){1, 1, 1}, (uint []){128, 32, 1}, 0, NULL);
+        check_tex(ctx, 1, expect, NULL);
+        destroy_sampler_states(ctx);
+        destroy_sampler_views(ctx);
+        destroy_compute_resources(ctx);
+        destroy_tex(ctx);
+        destroy_prog(ctx);
+}
+
+static void test_many_kern(struct context *ctx)
+{
+        const char *src = "COMP\n"
+                "DCL RES[0], BUFFER, RAW, WR\n"
+                "DCL TEMP[0], LOCAL\n"
+                "IMM UINT32 { 0, 1, 2, 3 }\n"
+                "IMM UINT32 { 4, 0, 0, 0 }\n"
+                "\n"
+                "    BGNSUB\n"
+                "       UMUL TEMP[0].x, IMM[0].xxxx, IMM[1].xxxx\n"
+                "       STORE RES[0].x, TEMP[0], IMM[0].xxxx\n"
+                "       RET\n"
+                "    ENDSUB\n"
+                "    BGNSUB\n"
+                "       UMUL TEMP[0].x, IMM[0].yyyy, IMM[1].xxxx\n"
+                "       STORE RES[0].x, TEMP[0], IMM[0].yyyy\n"
+                "       RET\n"
+                "    ENDSUB\n"
+                "    BGNSUB\n"
+                "       UMUL TEMP[0].x, IMM[0].zzzz, IMM[1].xxxx\n"
+                "       STORE RES[0].x, TEMP[0], IMM[0].zzzz\n"
+                "       RET\n"
+                "    ENDSUB\n"
+                "    BGNSUB\n"
+                "       UMUL TEMP[0].x, IMM[0].wwww, IMM[1].xxxx\n"
+                "       STORE RES[0].x, TEMP[0], IMM[0].wwww\n"
+                "       RET\n"
+                "    ENDSUB\n";
+        void init(void *p, int s, int x, int y) {
+                *(uint32_t *)p = 0xdeadbeef;
+        }
+        void expect(void *p, int s, int x, int y) {
+                *(uint32_t *)p = x;
+        }
+
+        printf("- %s\n", __func__);
+
+        init_prog(ctx, 0, 0, 0, src, NULL);
+        init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT,
+                 16, 0, init);
+        init_compute_resources(ctx, (int []) { 0, -1 });
+        launch_grid(ctx, (uint []){1, 1, 1}, (uint []){1, 1, 1}, 0, NULL);
+        launch_grid(ctx, (uint []){1, 1, 1}, (uint []){1, 1, 1}, 5, NULL);
+        launch_grid(ctx, (uint []){1, 1, 1}, (uint []){1, 1, 1}, 10, NULL);
+        launch_grid(ctx, (uint []){1, 1, 1}, (uint []){1, 1, 1}, 15, NULL);
+        check_tex(ctx, 0, expect, NULL);
+        destroy_compute_resources(ctx);
+        destroy_tex(ctx);
+        destroy_prog(ctx);
+}
+
+static void test_constant(struct context *ctx)
+{
+        const char *src = "COMP\n"
+                "DCL RES[0], BUFFER, RAW\n"
+                "DCL RES[1], BUFFER, RAW, WR\n"
+                "DCL SV[0], BLOCK_ID[0]\n"
+                "DCL TEMP[0], LOCAL\n"
+                "DCL TEMP[1], LOCAL\n"
+                "IMM UINT32 { 4, 0, 0, 0 }\n"
+                "\n"
+                "    BGNSUB\n"
+                "       UMUL TEMP[0].x, SV[0], IMM[0]\n"
+                "       LOAD TEMP[1].x, RES[0], TEMP[0]\n"
+                "       STORE RES[1].x, TEMP[0], TEMP[1]\n"
+                "       RET\n"
+                "    ENDSUB\n";
+        void init(void *p, int s, int x, int y) {
+                *(float *)p = s ? 0xdeadbeef : 8.0 - (float)x;
+        }
+        void expect(void *p, int s, int x, int y) {
+                *(float *)p = 8.0 - (float)x;
+        }
+
+        printf("- %s\n", __func__);
+
+        init_prog(ctx, 0, 0, 0, src, NULL);
+        init_tex(ctx, 0, PIPE_BUFFER, false, PIPE_FORMAT_R32_FLOAT,
+                 256, 0, init);
+        init_tex(ctx, 1, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT,
+                 256, 0, init);
+        init_compute_resources(ctx, (int []) { 0, 1, -1 });
+        launch_grid(ctx, (uint []){1, 1, 1}, (uint []){64, 1, 1}, 0, NULL);
+        check_tex(ctx, 1, expect, NULL);
+        destroy_compute_resources(ctx);
+        destroy_tex(ctx);
+        destroy_prog(ctx);
+}
+
+static void test_resource_indirect(struct context *ctx)
+{
+        const char *src = "COMP\n"
+                "DCL RES[0], BUFFER, RAW, WR\n"
+                "DCL RES[1..3], BUFFER, RAW\n"
+                "DCL SV[0], BLOCK_ID[0]\n"
+                "DCL TEMP[0], LOCAL\n"
+                "DCL TEMP[1], LOCAL\n"
+                "IMM UINT32 { 4, 0, 0, 0 }\n"
+                "\n"
+                "    BGNSUB\n"
+                "       UMUL TEMP[0].x, SV[0], IMM[0]\n"
+                "       LOAD TEMP[1].x, RES[1], TEMP[0]\n"
+                "       LOAD TEMP[1].x, RES[TEMP[1].x+2], TEMP[0]\n"
+                "       STORE RES[0].x, TEMP[0], TEMP[1]\n"
+                "       RET\n"
+                "    ENDSUB\n";
+        void init(void *p, int s, int x, int y) {
+                *(uint32_t *)p = s == 0 ? 0xdeadbeef :
+                   s == 1 ? x % 2 :
+                   s == 2 ? 2 * x :
+                   2 * x + 1;
+        }
+        void expect(void *p, int s, int x, int y) {
+           *(uint32_t *)p = 2 * x + (x % 2 ? 1 : 0);
+        }
+
+        printf("- %s\n", __func__);
+
+        init_prog(ctx, 0, 0, 0, src, NULL);
+        init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT,
+                 256, 0, init);
+        init_tex(ctx, 1, PIPE_BUFFER, false, PIPE_FORMAT_R32_FLOAT,
+                 256, 0, init);
+        init_tex(ctx, 2, PIPE_BUFFER, false, PIPE_FORMAT_R32_FLOAT,
+                 256, 0, init);
+        init_tex(ctx, 3, PIPE_BUFFER, false, PIPE_FORMAT_R32_FLOAT,
+                 256, 0, init);
+        init_compute_resources(ctx, (int []) { 0, 1, 2, 3, -1 });
+        launch_grid(ctx, (uint []){1, 1, 1}, (uint []){64, 1, 1}, 0, NULL);
+        check_tex(ctx, 0, expect, NULL);
+        destroy_compute_resources(ctx);
+        destroy_tex(ctx);
+        destroy_prog(ctx);
+}
+
+enum pipe_format surface_fmts[] = {
+        PIPE_FORMAT_B8G8R8A8_UNORM,
+        PIPE_FORMAT_B8G8R8X8_UNORM,
+        PIPE_FORMAT_A8R8G8B8_UNORM,
+        PIPE_FORMAT_X8R8G8B8_UNORM,
+        PIPE_FORMAT_X8R8G8B8_UNORM,
+        PIPE_FORMAT_L8_UNORM,
+        PIPE_FORMAT_A8_UNORM,
+        PIPE_FORMAT_I8_UNORM,
+        PIPE_FORMAT_L8A8_UNORM,
+        PIPE_FORMAT_R32_FLOAT,
+        PIPE_FORMAT_R32G32_FLOAT,
+        PIPE_FORMAT_R32G32B32A32_FLOAT,
+        PIPE_FORMAT_R32_UNORM,
+        PIPE_FORMAT_R32G32_UNORM,
+        PIPE_FORMAT_R32G32B32A32_UNORM,
+        PIPE_FORMAT_R32_SNORM,
+        PIPE_FORMAT_R32G32_SNORM,
+        PIPE_FORMAT_R32G32B32A32_SNORM,
+        PIPE_FORMAT_R8_UINT,
+        PIPE_FORMAT_R8G8_UINT,
+        PIPE_FORMAT_R8G8B8A8_UINT,
+        PIPE_FORMAT_R8_SINT,
+        PIPE_FORMAT_R8G8_SINT,
+        PIPE_FORMAT_R8G8B8A8_SINT,
+        PIPE_FORMAT_R32_UINT,
+        PIPE_FORMAT_R32G32_UINT,
+        PIPE_FORMAT_R32G32B32A32_UINT,
+        PIPE_FORMAT_R32_SINT,
+        PIPE_FORMAT_R32G32_SINT,
+        PIPE_FORMAT_R32G32B32A32_SINT
+};
+
+static void test_surface_ld(struct context *ctx)
+{
+        const char *src = "COMP\n"
+                "DCL RES[0], 2D\n"
+                "DCL RES[1], 2D, RAW, WR\n"
+                "DCL SV[0], BLOCK_ID[0]\n"
+                "DCL TEMP[0], LOCAL\n"
+                "DCL TEMP[1], LOCAL\n"
+                "IMM UINT32 { 16, 1, 0, 0 }\n"
+                "\n"
+                "    BGNSUB\n"
+                "       LOAD TEMP[1], RES[0], SV[0]\n"
+                "       UMUL TEMP[0], SV[0], IMM[0]\n"
+                "       STORE RES[1].xyzw, TEMP[0], TEMP[1]\n"
+                "       RET\n"
+                "    ENDSUB\n";
+        int i = 0;
+        void init0f(void *p, int s, int x, int y) {
+                float v[] = { 1.0, -.75, .50, -.25 };
+                util_format_write_4f(surface_fmts[i], v, 0,
+                                     p, 0, 0, 0, 1, 1);
+        }
+        void init0i(void *p, int s, int x, int y) {
+                int v[] = { 0xffffffff, 0xffff, 0xff, 0xf };
+                util_format_write_4i(surface_fmts[i], v, 0,
+                                     p, 0, 0, 0, 1, 1);
+        }
+        void init1(void *p, int s, int x, int y) {
+                *(uint32_t *)p = 0xdeadbeef;
+        }
+        void expectf(void *p, int s, int x, int y) {
+                float v[4], w[4];
+                init0f(v, s, x / 4, y);
+                util_format_read_4f(surface_fmts[i], w, 0,
+                                    v, 0, 0, 0, 1, 1);
+                *(float *)p = w[x % 4];
+        }
+        void expecti(void *p, int s, int x, int y) {
+                int32_t v[4], w[4];
+                init0i(v, s, x / 4, y);
+                util_format_read_4i(surface_fmts[i], w, 0,
+                                    v, 0, 0, 0, 1, 1);
+                *(uint32_t *)p = w[x % 4];
+        }
+
+        printf("- %s\n", __func__);
+
+        init_prog(ctx, 0, 0, 0, src, NULL);
+
+        for (i = 0; i < Elements(surface_fmts); i++) {
+                bool is_int = util_format_is_pure_integer(surface_fmts[i]);
+
+                printf("   - %s\n", util_format_name(surface_fmts[i]));
+
+                init_tex(ctx, 0, PIPE_TEXTURE_2D, true, surface_fmts[i],
+                         128, 32, (is_int ? init0i : init0f));
+                init_tex(ctx, 1, PIPE_TEXTURE_2D, true, PIPE_FORMAT_R32_FLOAT,
+                         512, 32, init1);
+                init_compute_resources(ctx, (int []) { 0, 1, -1 });
+                init_sampler_states(ctx, 2);
+                launch_grid(ctx, (uint []){1, 1, 1}, (uint []){128, 32, 1}, 0,
+                            NULL);
+                check_tex(ctx, 1, (is_int ? expecti : expectf), NULL);
+                destroy_sampler_states(ctx);
+                destroy_compute_resources(ctx);
+                destroy_tex(ctx);
+        }
+
+        destroy_prog(ctx);
+}
+
+static void test_surface_st(struct context *ctx)
+{
+        const char *src = "COMP\n"
+                "DCL RES[0], 2D, RAW\n"
+                "DCL RES[1], 2D, WR\n"
+                "DCL SV[0], BLOCK_ID[0]\n"
+                "DCL TEMP[0], LOCAL\n"
+                "DCL TEMP[1], LOCAL\n"
+                "IMM UINT32 { 16, 1, 0, 0 }\n"
+                "\n"
+                "    BGNSUB\n"
+                "       UMUL TEMP[0], SV[0], IMM[0]\n"
+                "       LOAD TEMP[1], RES[0], TEMP[0]\n"
+                "       STORE RES[1], SV[0], TEMP[1]\n"
+                "       RET\n"
+                "    ENDSUB\n";
+        int i = 0;
+        void init0f(void *p, int s, int x, int y) {
+                float v[] = { 1.0, -.75, 0.5, -.25 };
+                *(float *)p = v[x % 4];
+        }
+        void init0i(void *p, int s, int x, int y) {
+                int v[] = { 0xffffffff, 0xffff, 0xff, 0xf };
+                *(int32_t *)p = v[x % 4];
+        }
+        void init1(void *p, int s, int x, int y) {
+                memset(p, 1, util_format_get_blocksize(surface_fmts[i]));
+        }
+        void expectf(void *p, int s, int x, int y) {
+                float vf[4];
+                int j;
+
+                for (j = 0; j < 4; j++)
+                        init0f(&vf[j], s, 4 * x + j, y);
+                util_format_write_4f(surface_fmts[i], vf, 0,
+                                     p, 0, 0, 0, 1, 1);
+        }
+        void expects(void *p, int s, int x, int y) {
+                int32_t v[4];
+                int j;
+
+                for (j = 0; j < 4; j++)
+                        init0i(&v[j], s, 4 * x + j, y);
+                util_format_write_4i(surface_fmts[i], v, 0,
+                                     p, 0, 0, 0, 1, 1);
+        }
+        void expectu(void *p, int s, int x, int y) {
+                uint32_t v[4];
+                int j;
+
+                for (j = 0; j < 4; j++)
+                        init0i(&v[j], s, 4 * x + j, y);
+                util_format_write_4ui(surface_fmts[i], v, 0,
+                                      p, 0, 0, 0, 1, 1);
+        }
+        bool check(void *x, void *y, int sz) {
+                int j;
+
+                if (util_format_is_float(surface_fmts[i])) {
+                        return fabs(*(float *)x - *(float *)y) < 3.92156863e-3;
+
+                } else if ((sz % 4) == 0) {
+                        for (j = 0; j < sz / 4; j++)
+                                if (abs(((uint32_t *)x)[j] -
+                                        ((uint32_t *)y)[j]) > 1)
+                                        return false;
+                        return true;
+                } else {
+                        return !memcmp(x, y, sz);
+                }
+        }
+
+        printf("- %s\n", __func__);
+
+        init_prog(ctx, 0, 0, 0, src, NULL);
+
+        for (i = 0; i < Elements(surface_fmts); i++) {
+                bool is_signed = (util_format_description(surface_fmts[i])
+                                  ->channel[0].type == UTIL_FORMAT_TYPE_SIGNED);
+                bool is_int = util_format_is_pure_integer(surface_fmts[i]);
+
+                printf("   - %s\n", util_format_name(surface_fmts[i]));
+
+                init_tex(ctx, 0, PIPE_TEXTURE_2D, true, PIPE_FORMAT_R32_FLOAT,
+                         512, 32, (is_int ? init0i : init0f));
+                init_tex(ctx, 1, PIPE_TEXTURE_2D, true, surface_fmts[i],
+                         128, 32, init1);
+                init_compute_resources(ctx, (int []) { 0, 1, -1 });
+                init_sampler_states(ctx, 2);
+                launch_grid(ctx, (uint []){1, 1, 1}, (uint []){128, 32, 1}, 0,
+                            NULL);
+                check_tex(ctx, 1, (is_int && is_signed ? expects :
+                                   is_int && !is_signed ? expectu :
+                                   expectf), check);
+                destroy_sampler_states(ctx);
+                destroy_compute_resources(ctx);
+                destroy_tex(ctx);
+        }
+
+        destroy_prog(ctx);
+}
+
+static void test_barrier(struct context *ctx)
+{
+        const char *src = "COMP\n"
+                "DCL RES[0], BUFFER, RAW, WR\n"
+                "DCL SV[0], BLOCK_ID[0]\n"
+                "DCL SV[1], BLOCK_SIZE[0]\n"
+                "DCL SV[2], THREAD_ID[0]\n"
+                "DCL TEMP[0], LOCAL\n"
+                "DCL TEMP[1], LOCAL\n"
+                "DCL TEMP[2], LOCAL\n"
+                "DCL TEMP[3], LOCAL\n"
+                "IMM UINT32 { 1, 0, 0, 0 }\n"
+                "IMM UINT32 { 4, 0, 0, 0 }\n"
+                "IMM UINT32 { 32, 0, 0, 0 }\n"
+                "\n"
+                "    BGNSUB\n"
+                "       UMUL TEMP[0].x, SV[2], IMM[1]\n"
+                "       MOV TEMP[1].x, IMM[0].wwww\n"
+                "       BGNLOOP\n"
+                "               BARRIER\n"
+                "               STORE RLOCAL.x, TEMP[0], TEMP[1]\n"
+                "               BARRIER\n"
+                "               MOV TEMP[2].x, IMM[0].wwww\n"
+                "               BGNLOOP\n"
+                "                       UMUL TEMP[3].x, TEMP[2], IMM[1]\n"
+                "                       LOAD TEMP[3].x, RLOCAL, TEMP[3]\n"
+                "                       USNE TEMP[3].x, TEMP[3], TEMP[1]\n"
+                "                       IF TEMP[3]\n"
+                "                               END\n"
+                "                       ENDIF\n"
+                "                       UADD TEMP[2].x, TEMP[2], IMM[0]\n"
+                "                       USEQ TEMP[3].x, TEMP[2], SV[1]\n"
+                "                       IF TEMP[3]\n"
+                "                               BRK\n"
+                "                       ENDIF\n"
+                "               ENDLOOP\n"
+                "               UADD TEMP[1].x, TEMP[1], IMM[0]\n"
+                "               USEQ TEMP[2].x, TEMP[1], IMM[2]\n"
+                "               IF TEMP[2]\n"
+                "                       BRK\n"
+                "               ENDIF\n"
+                "       ENDLOOP\n"
+                "       UMUL TEMP[1].x, SV[0], SV[1]\n"
+                "       UMUL TEMP[1].x, TEMP[1], IMM[1]\n"
+                "       UADD TEMP[1].x, TEMP[1], TEMP[0]\n"
+                "       LOAD TEMP[0].x, RLOCAL, TEMP[0]\n"
+                "       STORE RES[0].x, TEMP[1], TEMP[0]\n"
+                "       RET\n"
+                "    ENDSUB\n";
+        void init(void *p, int s, int x, int y) {
+                *(uint32_t *)p = 0xdeadbeef;
+        }
+        void expect(void *p, int s, int x, int y) {
+                *(uint32_t *)p = 31;
+        }
+
+        printf("- %s\n", __func__);
+
+        init_prog(ctx, 256, 0, 0, src, NULL);
+        init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT,
+                 4096, 0, init);
+        init_compute_resources(ctx, (int []) { 0, -1 });
+        launch_grid(ctx, (uint []){64, 1, 1}, (uint []){16, 1, 1}, 0, NULL);
+        check_tex(ctx, 0, expect, NULL);
+        destroy_compute_resources(ctx);
+        destroy_tex(ctx);
+        destroy_prog(ctx);
+}
+
+static void test_atom_ops(struct context *ctx, bool global)
+{
+        const char *src = "COMP\n"
+                "#ifdef TARGET_GLOBAL\n"
+                "#define target RES[0]\n"
+                "#else\n"
+                "#define target RLOCAL\n"
+                "#endif\n"
+                ""
+                "DCL RES[0], BUFFER, RAW, WR\n"
+                "#define threadid SV[0]\n"
+                "DCL threadid, THREAD_ID[0]\n"
+                ""
+                "#define offset TEMP[0]\n"
+                "DCL offset, LOCAL\n"
+                "#define tmp TEMP[1]\n"
+                "DCL tmp, LOCAL\n"
+                ""
+                "#define k0 IMM[0]\n"
+                "IMM UINT32 { 0, 0, 0, 0 }\n"
+                "#define k1 IMM[1]\n"
+                "IMM UINT32 { 1, 0, 0, 0 }\n"
+                "#define k2 IMM[2]\n"
+                "IMM UINT32 { 2, 0, 0, 0 }\n"
+                "#define k3 IMM[3]\n"
+                "IMM UINT32 { 3, 0, 0, 0 }\n"
+                "#define k4 IMM[4]\n"
+                "IMM UINT32 { 4, 0, 0, 0 }\n"
+                "#define k5 IMM[5]\n"
+                "IMM UINT32 { 5, 0, 0, 0 }\n"
+                "#define k6 IMM[6]\n"
+                "IMM UINT32 { 6, 0, 0, 0 }\n"
+                "#define k7 IMM[7]\n"
+                "IMM UINT32 { 7, 0, 0, 0 }\n"
+                "#define k8 IMM[8]\n"
+                "IMM UINT32 { 8, 0, 0, 0 }\n"
+                "#define k9 IMM[9]\n"
+                "IMM UINT32 { 9, 0, 0, 0 }\n"
+                "#define korig IMM[10].xxxx\n"
+                "#define karg IMM[10].yyyy\n"
+                "IMM UINT32 { 3735928559, 286331153, 0, 0 }\n"
+                "\n"
+                "    BGNSUB\n"
+                "       UMUL offset.x, threadid, k4\n"
+                "       STORE target.x, offset, korig\n"
+                "       USEQ tmp.x, threadid, k0\n"
+                "       IF tmp\n"
+                "               ATOMUADD tmp.x, target, offset, karg\n"
+                "               ATOMUADD tmp.x, target, offset, tmp\n"
+                "       ENDIF\n"
+                "       USEQ tmp.x, threadid, k1\n"
+                "       IF tmp\n"
+                "               ATOMXCHG tmp.x, target, offset, karg\n"
+                "               ATOMXCHG tmp.x, target, offset, tmp\n"
+                "       ENDIF\n"
+                "       USEQ tmp.x, threadid, k2\n"
+                "       IF tmp\n"
+                "               ATOMCAS tmp.x, target, offset, korig, karg\n"
+                "               ATOMCAS tmp.x, target, offset, tmp, k0\n"
+                "       ENDIF\n"
+                "       USEQ tmp.x, threadid, k3\n"
+                "       IF tmp\n"
+                "               ATOMAND tmp.x, target, offset, karg\n"
+                "               ATOMAND tmp.x, target, offset, tmp\n"
+                "       ENDIF\n"
+                "       USEQ tmp.x, threadid, k4\n"
+                "       IF tmp\n"
+                "               ATOMOR tmp.x, target, offset, karg\n"
+                "               ATOMOR tmp.x, target, offset, tmp\n"
+                "       ENDIF\n"
+                "       USEQ tmp.x, threadid, k5\n"
+                "       IF tmp\n"
+                "               ATOMXOR tmp.x, target, offset, karg\n"
+                "               ATOMXOR tmp.x, target, offset, tmp\n"
+                "       ENDIF\n"
+                "       USEQ tmp.x, threadid, k6\n"
+                "       IF tmp\n"
+                "               ATOMUMIN tmp.x, target, offset, karg\n"
+                "               ATOMUMIN tmp.x, target, offset, tmp\n"
+                "       ENDIF\n"
+                "       USEQ tmp.x, threadid, k7\n"
+                "       IF tmp\n"
+                "               ATOMUMAX tmp.x, target, offset, karg\n"
+                "               ATOMUMAX tmp.x, target, offset, tmp\n"
+                "       ENDIF\n"
+                "       USEQ tmp.x, threadid, k8\n"
+                "       IF tmp\n"
+                "               ATOMIMIN tmp.x, target, offset, karg\n"
+                "               ATOMIMIN tmp.x, target, offset, tmp\n"
+                "       ENDIF\n"
+                "       USEQ tmp.x, threadid, k9\n"
+                "       IF tmp\n"
+                "               ATOMIMAX tmp.x, target, offset, karg\n"
+                "               ATOMIMAX tmp.x, target, offset, tmp\n"
+                "       ENDIF\n"
+                "#ifdef TARGET_LOCAL\n"
+                "       LOAD tmp.x, RLOCAL, offset\n"
+                "       STORE RES[0].x, offset, tmp\n"
+                "#endif\n"
+                "       RET\n"
+                "    ENDSUB\n";
+
+        void init(void *p, int s, int x, int y) {
+                *(uint32_t *)p = 0xbad;
+        }
+        void expect(void *p, int s, int x, int y) {
+                switch (x) {
+                case 0:
+                        *(uint32_t *)p = 0xce6c8eef;
+                        break;
+                case 1:
+                        *(uint32_t *)p = 0xdeadbeef;
+                        break;
+                case 2:
+                        *(uint32_t *)p = 0x11111111;
+                        break;
+                case 3:
+                        *(uint32_t *)p = 0x10011001;
+                        break;
+                case 4:
+                        *(uint32_t *)p = 0xdfbdbfff;
+                        break;
+                case 5:
+                        *(uint32_t *)p = 0x11111111;
+                        break;
+                case 6:
+                        *(uint32_t *)p = 0x11111111;
+                        break;
+                case 7:
+                        *(uint32_t *)p = 0xdeadbeef;
+                        break;
+                case 8:
+                        *(uint32_t *)p = 0xdeadbeef;
+                        break;
+                case 9:
+                        *(uint32_t *)p = 0x11111111;
+                        break;
+                }
+        }
+
+        printf("- %s (%s)\n", __func__, global ? "global" : "local");
+
+        init_prog(ctx, 40, 0, 0, src,
+                  (global ? "-DTARGET_GLOBAL" : "-DTARGET_LOCAL"));
+        init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT,
+                 40, 0, init);
+        init_compute_resources(ctx, (int []) { 0, -1 });
+        launch_grid(ctx, (uint []){10, 1, 1}, (uint []){1, 1, 1}, 0, NULL);
+        check_tex(ctx, 0, expect, NULL);
+        destroy_compute_resources(ctx);
+        destroy_tex(ctx);
+        destroy_prog(ctx);
+}
+
+static void test_atom_race(struct context *ctx, bool global)
+{
+        const char *src = "COMP\n"
+                "#ifdef TARGET_GLOBAL\n"
+                "#define target RES[0]\n"
+                "#else\n"
+                "#define target RLOCAL\n"
+                "#endif\n"
+                ""
+                "DCL RES[0], BUFFER, RAW, WR\n"
+                ""
+                "#define blockid SV[0]\n"
+                "DCL blockid, BLOCK_ID[0]\n"
+                "#define blocksz SV[1]\n"
+                "DCL blocksz, BLOCK_SIZE[0]\n"
+                "#define threadid SV[2]\n"
+                "DCL threadid, THREAD_ID[0]\n"
+                ""
+                "#define offset TEMP[0]\n"
+                "DCL offset, LOCAL\n"
+                "#define arg TEMP[1]\n"
+                "DCL arg, LOCAL\n"
+                "#define count TEMP[2]\n"
+                "DCL count, LOCAL\n"
+                "#define vlocal TEMP[3]\n"
+                "DCL vlocal, LOCAL\n"
+                "#define vshared TEMP[4]\n"
+                "DCL vshared, LOCAL\n"
+                "#define last TEMP[5]\n"
+                "DCL last, LOCAL\n"
+                "#define tmp0 TEMP[6]\n"
+                "DCL tmp0, LOCAL\n"
+                "#define tmp1 TEMP[7]\n"
+                "DCL tmp1, LOCAL\n"
+                ""
+                "#define k0 IMM[0]\n"
+                "IMM UINT32 { 0, 0, 0, 0 }\n"
+                "#define k1 IMM[1]\n"
+                "IMM UINT32 { 1, 0, 0, 0 }\n"
+                "#define k4 IMM[2]\n"
+                "IMM UINT32 { 4, 0, 0, 0 }\n"
+                "#define k32 IMM[3]\n"
+                "IMM UINT32 { 32, 0, 0, 0 }\n"
+                "#define k128 IMM[4]\n"
+                "IMM UINT32 { 128, 0, 0, 0 }\n"
+                "#define kdeadcafe IMM[5]\n"
+                "IMM UINT32 { 3735931646, 0, 0, 0 }\n"
+                "#define kallowed_set IMM[6]\n"
+                "IMM UINT32 { 559035650, 0, 0, 0 }\n"
+                "#define k11111111 IMM[7]\n"
+                "IMM UINT32 { 286331153, 0, 0, 0 }\n"
+                "\n"
+                "    BGNSUB\n"
+                "       MOV offset.x, threadid\n"
+                "#ifdef TARGET_GLOBAL\n"
+                "       UMUL tmp0.x, blockid, blocksz\n"
+                "       UADD offset.x, offset, tmp0\n"
+                "#endif\n"
+                "       UMUL offset.x, offset, k4\n"
+                "       USLT tmp0.x, threadid, k32\n"
+                "       STORE target.x, offset, k0\n"
+                "       BARRIER\n"
+                "       IF tmp0\n"
+                "               MOV vlocal.x, k0\n"
+                "               MOV arg.x, kdeadcafe\n"
+                "               BGNLOOP\n"
+                "                       INEG arg.x, arg\n"
+                "                       ATOMUADD vshared.x, target, offset, arg\n"
+                "                       SFENCE target\n"
+                "                       USNE tmp0.x, vshared, vlocal\n"
+                "                       IF tmp0\n"
+                "                               BRK\n"
+                "                       ENDIF\n"
+                "                       UADD vlocal.x, vlocal, arg\n"
+                "               ENDLOOP\n"
+                "               UADD vlocal.x, vshared, arg\n"
+                "               LOAD vshared.x, target, offset\n"
+                "               USEQ tmp0.x, vshared, vlocal\n"
+                "               STORE target.x, offset, tmp0\n"
+                "       ELSE\n"
+                "               UADD offset.x, offset, -k128\n"
+                "               MOV count.x, k0\n"
+                "               MOV last.x, k0\n"
+                "               BGNLOOP\n"
+                "                       LOAD vshared.x, target, offset\n"
+                "                       USEQ tmp0.x, vshared, kallowed_set.xxxx\n"
+                "                       USEQ tmp1.x, vshared, kallowed_set.yyyy\n"
+                "                       OR tmp0.x, tmp0, tmp1\n"
+                "                       IF tmp0\n"
+                "                               USEQ tmp0.x, vshared, last\n"
+                "                               IF tmp0\n"
+                "                                       CONT\n"
+                "                               ENDIF\n"
+                "                               MOV last.x, vshared\n"
+                "                       ELSE\n"
+                "                               END\n"
+                "                       ENDIF\n"
+                "                       UADD count.x, count, k1\n"
+                "                       USEQ tmp0.x, count, k128\n"
+                "                       IF tmp0\n"
+                "                               BRK\n"
+                "                       ENDIF\n"
+                "               ENDLOOP\n"
+                "               ATOMXCHG tmp0.x, target, offset, k11111111\n"
+                "               UADD offset.x, offset, k128\n"
+                "               ATOMXCHG tmp0.x, target, offset, k11111111\n"
+                "               SFENCE target\n"
+                "       ENDIF\n"
+                "#ifdef TARGET_LOCAL\n"
+                "       LOAD tmp0.x, RLOCAL, offset\n"
+                "       UMUL tmp1.x, blockid, blocksz\n"
+                "       UMUL tmp1.x, tmp1, k4\n"
+                "       UADD offset.x, offset, tmp1\n"
+                "       STORE RES[0].x, offset, tmp0\n"
+                "#endif\n"
+                "       RET\n"
+                "    ENDSUB\n";
+
+        void init(void *p, int s, int x, int y) {
+                *(uint32_t *)p = 0xdeadbeef;
+        }
+        void expect(void *p, int s, int x, int y) {
+                *(uint32_t *)p = x & 0x20 ? 0x11111111 : 0xffffffff;
+        }
+
+        printf("- %s (%s)\n", __func__, global ? "global" : "local");
+
+        init_prog(ctx, 256, 0, 0, src,
+                  (global ? "-DTARGET_GLOBAL" : "-DTARGET_LOCAL"));
+        init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT,
+                 4096, 0, init);
+        init_compute_resources(ctx, (int []) { 0, -1 });
+        launch_grid(ctx, (uint []){64, 1, 1}, (uint []){16, 1, 1}, 0, NULL);
+        check_tex(ctx, 0, expect, NULL);
+        destroy_compute_resources(ctx);
+        destroy_tex(ctx);
+        destroy_prog(ctx);
+}
+
+int main(int argc, char *argv[])
+{
+        struct context *ctx = CALLOC_STRUCT(context);
+
+        init_ctx(ctx);
+        test_system_values(ctx);
+        test_resource_access(ctx);
+        test_function_calls(ctx);
+        test_input_global(ctx);
+        test_private(ctx);
+        test_local(ctx);
+        test_sample(ctx);
+        test_many_kern(ctx);
+        test_constant(ctx);
+        test_resource_indirect(ctx);
+        test_surface_ld(ctx);
+        test_surface_st(ctx);
+        test_barrier(ctx);
+        test_atom_ops(ctx, true);
+        test_atom_race(ctx, true);
+        test_atom_ops(ctx, false);
+        test_atom_race(ctx, false);
+        destroy_ctx(ctx);
+
+        return 0;
+}