panfrost: split index cache into shared part

[mesa.git] / src / gallium / drivers / panfrost / pan_context.c
diff --git a/src/gallium/drivers/panfrost/pan_context.c b/src/gallium/drivers/panfrost/pan_context.c

index 556909613cb11ca9c9bc3c981205cf7b5c05eb90..630f6753fd413f149005768309c1a996041b7823 100644 (file)
--- a/src/gallium/drivers/panfrost/pan_context.c
+++ b/src/gallium/drivers/panfrost/pan_context.c
@@ -29,7 +29,8 @@
  
  #include "pan_bo.h"
  #include "pan_context.h"
-#include "pan_format.h"
+#include "pan_minmax_cache.h"
+#include "panfrost-quirks.h"
  
  #include "util/macros.h"
  #include "util/format/u_format.h"
@@ -51,31 +52,31 @@
  #include "pan_blending.h"
  #include "pan_blend_shaders.h"
  #include "pan_util.h"
+#include "pandecode/decode.h"
  
-/* Framebuffer descriptor */
-
-static struct midgard_tiler_descriptor
+struct midgard_tiler_descriptor
  panfrost_emit_midg_tiler(struct panfrost_batch *batch, unsigned vertex_count)
  {
          struct panfrost_screen *screen = pan_screen(batch->ctx->base.screen);
-        struct midgard_tiler_descriptor t = {};
+        bool hierarchy = !(screen->quirks & MIDGARD_NO_HIER_TILING);
+        struct midgard_tiler_descriptor t = {0};
          unsigned height = batch->key.height;
          unsigned width = batch->key.width;
  
          t.hierarchy_mask =
-                panfrost_choose_hierarchy_mask(width, height, vertex_count);
+                panfrost_choose_hierarchy_mask(width, height, vertex_count, hierarchy);
  
          /* Compute the polygon header size and use that to offset the body */
  
          unsigned header_size = panfrost_tiler_header_size(
-                                       width, height, t.hierarchy_mask);
+                                       width, height, t.hierarchy_mask, hierarchy);
  
          t.polygon_list_size = panfrost_tiler_full_size(
-                                     width, height, t.hierarchy_mask);
+                                     width, height, t.hierarchy_mask, hierarchy);
  
          /* Sanity check */
  
-        if (t.hierarchy_mask) {
+        if (vertex_count) {
                  struct panfrost_bo *tiler_heap;
  
                  tiler_heap = panfrost_batch_get_tiler_heap(batch);
@@ -91,6 +92,7 @@ panfrost_emit_midg_tiler(struct panfrost_batch *batch, unsigned vertex_count)
                  struct panfrost_bo *tiler_dummy;
  
                  tiler_dummy = panfrost_batch_get_tiler_dummy(batch);
+                header_size = MALI_TILER_MINIMUM_HEADER_SIZE;
  
                  /* The tiler is disabled, so don't allow the tiler heap */
                  t.heap_start = tiler_dummy->gpu;
@@ -100,13 +102,13 @@ panfrost_emit_midg_tiler(struct panfrost_batch *batch, unsigned vertex_count)
                  t.polygon_list = tiler_dummy->gpu;
  
                  /* Disable the tiler */
-                t.hierarchy_mask |= MALI_TILER_DISABLED;
-
-                if (screen->require_sfbd) {
-                        t.hierarchy_mask = 0xFFF; /* TODO: What's this? */
-                        t.polygon_list_size = 0x200;
+                if (hierarchy)
+                        t.hierarchy_mask |= MALI_TILER_DISABLED;
+                else {
+                        t.hierarchy_mask = MALI_TILER_USER;
+                        t.polygon_list_size = MALI_TILER_MINIMUM_HEADER_SIZE + 4;
  
-                        /* We don't have a SET_VALUE job, so write the polygon list manually */
+                        /* We don't have a WRITE_VALUE job, so write the polygon list manually */
                          uint32_t *polygon_list_body = (uint32_t *) (tiler_dummy->cpu + header_size);
                          polygon_list_body[0] = 0xa0000000; /* TODO: Just that? */
                  }
@@ -118,54 +120,6 @@ panfrost_emit_midg_tiler(struct panfrost_batch *batch, unsigned vertex_count)
          return t;
  }
  
-struct mali_single_framebuffer
-panfrost_emit_sfbd(struct panfrost_batch *batch, unsigned vertex_count)
-{
-        unsigned width = batch->key.width;
-        unsigned height = batch->key.height;
-
-        struct mali_single_framebuffer framebuffer = {
-                .width = MALI_POSITIVE(width),
-                .height = MALI_POSITIVE(height),
-                .unknown2 = 0x1f,
-                .format = {
-                        .unk3 = 0x3,
-                },
-                .clear_flags = 0x1000,
-                .unknown_address_0 = panfrost_batch_get_scratchpad(batch)->gpu,
-                .tiler = panfrost_emit_midg_tiler(batch, vertex_count),
-        };
-
-        return framebuffer;
-}
-
-struct bifrost_framebuffer
-panfrost_emit_mfbd(struct panfrost_batch *batch, unsigned vertex_count)
-{
-        unsigned width = batch->key.width;
-        unsigned height = batch->key.height;
-
-        struct bifrost_framebuffer framebuffer = {
-                .unk0 = 0x1e5, /* 1e4 if no spill */
-                .width1 = MALI_POSITIVE(width),
-                .height1 = MALI_POSITIVE(height),
-                .width2 = MALI_POSITIVE(width),
-                .height2 = MALI_POSITIVE(height),
-
-                .unk1 = 0x1080,
-
-                .rt_count_1 = MALI_POSITIVE(batch->key.nr_cbufs),
-                .rt_count_2 = 4,
-
-                .unknown2 = 0x1f,
-
-                .scratchpad = panfrost_batch_get_scratchpad(batch)->gpu,
-                .tiler = panfrost_emit_midg_tiler(batch, vertex_count)
-        };
-
-        return framebuffer;
-}
-
  static void
  panfrost_clear(
          struct pipe_context *pipe,
@@ -179,7 +133,7 @@ panfrost_clear(
           * the existing batch targeting this FBO has draws. We could probably
           * avoid that by replacing plain clears by quad-draws with a specific
           * color/depth/stencil value, thus avoiding the generation of extra
-         * fragment/set_value jobs.
+         * fragment jobs.
           */
          struct panfrost_batch *batch = panfrost_get_fresh_batch_for_fbo(ctx);
  
@@ -187,42 +141,31 @@ panfrost_clear(
          panfrost_batch_clear(batch, buffers, color, depth, stencil);
  }
  
-static mali_ptr
-panfrost_attach_vt_mfbd(struct panfrost_batch *batch)
-{
-        struct bifrost_framebuffer mfbd = panfrost_emit_mfbd(batch, ~0);
-
-        return panfrost_upload_transient(batch, &mfbd, sizeof(mfbd)) | MALI_MFBD;
-}
-
-static mali_ptr
-panfrost_attach_vt_sfbd(struct panfrost_batch *batch)
-{
-        struct mali_single_framebuffer sfbd = panfrost_emit_sfbd(batch, ~0);
-
-        return panfrost_upload_transient(batch, &sfbd, sizeof(sfbd)) | MALI_SFBD;
-}
+/* TODO: Bifrost requires just a mali_shared_memory, without the rest of the
+ * framebuffer */
  
  static void
  panfrost_attach_vt_framebuffer(struct panfrost_context *ctx)
  {
-        /* Skip the attach if we can */
-
-        if (ctx->payloads[PIPE_SHADER_VERTEX].postfix.framebuffer) {
-                assert(ctx->payloads[PIPE_SHADER_FRAGMENT].postfix.framebuffer);
-                return;
-        }
-
          struct panfrost_screen *screen = pan_screen(ctx->base.screen);
          struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
  
-        if (!batch->framebuffer)
-                batch->framebuffer = screen->require_sfbd ?
-                                     panfrost_attach_vt_sfbd(batch) :
-                                     panfrost_attach_vt_mfbd(batch);
+        /* If we haven't, reserve space for the framebuffer */
+
+        if (!batch->framebuffer.gpu) {
+                unsigned size = (screen->quirks & MIDGARD_SFBD) ?
+                        sizeof(struct mali_single_framebuffer) :
+                        sizeof(struct mali_framebuffer);
+
+                batch->framebuffer = panfrost_allocate_transient(batch, size);
+
+                /* Tag the pointer */
+                if (!(screen->quirks & MIDGARD_SFBD))
+                        batch->framebuffer.gpu |= MALI_MFBD;
+        }
  
          for (unsigned i = 0; i < PIPE_SHADER_TYPES; ++i)
-                ctx->payloads[i].postfix.framebuffer = batch->framebuffer;
+                ctx->payloads[i].postfix.shared_memory = batch->framebuffer.gpu;
  }
  
  /* Reset per-frame context, called on context initialisation as well as after
@@ -232,13 +175,7 @@ void
  panfrost_invalidate_frame(struct panfrost_context *ctx)
  {
          for (unsigned i = 0; i < PIPE_SHADER_TYPES; ++i)
-                ctx->payloads[i].postfix.framebuffer = 0;
-
-        if (ctx->rasterizer)
-                ctx->dirty |= PAN_DIRTY_RASTERIZER;
-
-        /* XXX */
-        ctx->dirty |= PAN_DIRTY_SAMPLERS | PAN_DIRTY_TEXTURES;
+                ctx->payloads[i].postfix.shared_memory = 0;
  
          /* TODO: When does this need to be handled? */
          ctx->active_queries = true;
@@ -263,18 +200,6 @@ panfrost_emit_vertex_payload(struct panfrost_context *ctx)
          memcpy(&ctx->payloads[PIPE_SHADER_COMPUTE], &payload, sizeof(payload));
  }
  
-static void
-panfrost_emit_tiler_payload(struct panfrost_context *ctx)
-{
-        struct midgard_payload_vertex_tiler payload = {
-                .prefix = {
-                        .zero1 = 0xffff, /* Why is this only seen on test-quad-textured? */
-                },
-        };
-
-        memcpy(&ctx->payloads[PIPE_SHADER_FRAGMENT], &payload, sizeof(payload));
-}
-
  static unsigned
  translate_tex_wrap(enum pipe_tex_wrap w)
  {
@@ -282,8 +207,9 @@ translate_tex_wrap(enum pipe_tex_wrap w)
          case PIPE_TEX_WRAP_REPEAT:
                  return MALI_WRAP_REPEAT;
  
-                /* TODO: lower GL_CLAMP? */
          case PIPE_TEX_WRAP_CLAMP:
+                return MALI_WRAP_CLAMP;
+
          case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
                  return MALI_WRAP_CLAMP_TO_EDGE;
  
@@ -293,6 +219,15 @@ translate_tex_wrap(enum pipe_tex_wrap w)
          case PIPE_TEX_WRAP_MIRROR_REPEAT:
                  return MALI_WRAP_MIRRORED_REPEAT;
  
+        case PIPE_TEX_WRAP_MIRROR_CLAMP:
+                return MALI_WRAP_MIRRORED_CLAMP;
+
+        case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+                return MALI_WRAP_MIRRORED_CLAMP_TO_EDGE;
+
+        case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+                return MALI_WRAP_MIRRORED_CLAMP_TO_BORDER;
+
          default:
                  unreachable("Invalid wrap");
          }
@@ -331,39 +266,6 @@ panfrost_translate_compare_func(enum pipe_compare_func in)
          }
  }
  
-static unsigned
-panfrost_translate_alt_compare_func(enum pipe_compare_func in)
-{
-        switch (in) {
-        case PIPE_FUNC_NEVER:
-                return MALI_ALT_FUNC_NEVER;
-
-        case PIPE_FUNC_LESS:
-                return MALI_ALT_FUNC_LESS;
-
-        case PIPE_FUNC_EQUAL:
-                return MALI_ALT_FUNC_EQUAL;
-
-        case PIPE_FUNC_LEQUAL:
-                return MALI_ALT_FUNC_LEQUAL;
-
-        case PIPE_FUNC_GREATER:
-                return MALI_ALT_FUNC_GREATER;
-
-        case PIPE_FUNC_NOTEQUAL:
-                return MALI_ALT_FUNC_NOTEQUAL;
-
-        case PIPE_FUNC_GEQUAL:
-                return MALI_ALT_FUNC_GEQUAL;
-
-        case PIPE_FUNC_ALWAYS:
-                return MALI_ALT_FUNC_ALWAYS;
-
-        default:
-                unreachable("Invalid alt func");
-        }
-}
-
  static unsigned
  panfrost_translate_stencil_op(enum pipe_stencil_op in)
  {
@@ -426,9 +328,8 @@ panfrost_default_shader_backend(struct panfrost_context *ctx)
           * these earlier chips (perhaps this is a chicken bit of some kind).
           * More investigation is needed. */
  
-       if (screen->require_sfbd) {
+       if (screen->quirks & MIDGARD_SFBD)
                 shader.unknown2_4 |= 0x10;
-       }
  
          struct pipe_stencil_state default_stencil = {
                  .enabled = 0,
@@ -452,38 +353,6 @@ panfrost_default_shader_backend(struct panfrost_context *ctx)
          memcpy(&ctx->fragment_shader_core, &shader, sizeof(shader));
  }
  
-/* Generates a vertex/tiler job. This is, in some sense, the heart of the
- * graphics command stream. It should be called once per draw, accordding to
- * presentations. Set is_tiler for "tiler" jobs (fragment shader jobs, but in
- * Mali parlance, "fragment" refers to framebuffer writeout). Clear it for
- * vertex jobs. */
-
-struct panfrost_transfer
-panfrost_vertex_tiler_job(struct panfrost_context *ctx, bool is_tiler)
-{
-        struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
-        struct mali_job_descriptor_header job = {
-                .job_type = is_tiler ? JOB_TYPE_TILER : JOB_TYPE_VERTEX,
-                .job_descriptor_size = 1,
-        };
-
-        struct midgard_payload_vertex_tiler *payload = is_tiler ? &ctx->payloads[PIPE_SHADER_FRAGMENT] : &ctx->payloads[PIPE_SHADER_VERTEX];
-
-        struct panfrost_transfer transfer = panfrost_allocate_transient(batch, sizeof(job) + sizeof(*payload));
-        memcpy(transfer.cpu, &job, sizeof(job));
-        memcpy(transfer.cpu + sizeof(job), payload, sizeof(*payload));
-        return transfer;
-}
-
-mali_ptr
-panfrost_vertex_buffer_address(struct panfrost_context *ctx, unsigned i)
-{
-        struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[i];
-        struct panfrost_resource *rsrc = (struct panfrost_resource *) (buf->buffer.resource);
-
-        return rsrc->bo->gpu + buf->buffer_offset;
-}
-
  static bool
  panfrost_writes_point_size(struct panfrost_context *ctx)
  {
@@ -502,7 +371,7 @@ panfrost_stage_attributes(struct panfrost_context *ctx)
          struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
          struct panfrost_vertex_state *so = ctx->vertex;
  
-        size_t sz = sizeof(struct mali_attr_meta) * so->num_elements;
+        size_t sz = sizeof(struct mali_attr_meta) * PAN_MAX_ATTRIBUTE;
          struct panfrost_transfer transfer = panfrost_allocate_transient(batch, sz);
          struct mali_attr_meta *target = (struct mali_attr_meta *) transfer.cpu;
  
@@ -535,21 +404,40 @@ panfrost_stage_attributes(struct panfrost_context *ctx)
          for (unsigned i = 0; i < so->num_elements; ++i) {
                  unsigned vbi = so->pipe[i].vertex_buffer_index;
                  struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
-                mali_ptr addr = panfrost_vertex_buffer_address(ctx, vbi);
+                struct panfrost_resource *rsrc = (struct panfrost_resource *) (buf->buffer.resource);
+                mali_ptr addr = rsrc->bo->gpu + buf->buffer_offset;
+
+                /* Adjust by the masked off bits of the offset. Make sure we
+                 * read src_offset from so->hw (which is not GPU visible)
+                 * rather than target (which is) due to caching effects */
  
-                /* Adjust by the masked off bits of the offset */
-                target[i].src_offset += (addr & 63);
+                unsigned src_offset = so->hw[i].src_offset;
+                src_offset += (addr & 63);
  
                  /* Also, somewhat obscurely per-instance data needs to be
                   * offset in response to a delayed start in an indexed draw */
  
-                if (so->pipe[i].instance_divisor && ctx->instance_count > 1 && start) {
-                        target[i].src_offset -= buf->stride * start;
-                }
-
+                if (so->pipe[i].instance_divisor && ctx->instance_count > 1 && start)
+                        src_offset -= buf->stride * start;
  
+                target[i].src_offset = src_offset;
          }
  
+        /* Let's also include vertex builtins */
+
+        struct mali_attr_meta builtin = {
+                .format = MALI_R32UI,
+                .swizzle = panfrost_get_default_swizzle(1)
+        };
+
+        /* See mali_attr_meta specification for the magic number */
+
+        builtin.index = so->vertexid_index;
+        memcpy(&target[PAN_VERTEX_ID], &builtin, 4);
+
+        builtin.index = so->vertexid_index + 1;
+        memcpy(&target[PAN_INSTANCE_ID], &builtin, 4);
+
          ctx->payloads[PIPE_SHADER_VERTEX].postfix.attribute_meta = transfer.gpu;
  }
  
@@ -562,7 +450,7 @@ panfrost_upload_sampler_descriptors(struct panfrost_context *ctx)
          for (int t = 0; t <= PIPE_SHADER_FRAGMENT; ++t) {
                  mali_ptr upload = 0;
  
-                if (ctx->sampler_count[t] && ctx->sampler_view_count[t]) {
+                if (ctx->sampler_count[t]) {
                          size_t transfer_size = desc_size * ctx->sampler_count[t];
  
                          struct panfrost_transfer transfer =
@@ -581,25 +469,6 @@ panfrost_upload_sampler_descriptors(struct panfrost_context *ctx)
          }
  }
  
-static enum mali_texture_layout
-panfrost_layout_for_texture(struct panfrost_resource *rsrc)
-{
-        /* TODO: other linear depth textures */
-        bool is_depth = rsrc->base.format == PIPE_FORMAT_Z32_UNORM;
-
-        switch (rsrc->layout) {
-        case PAN_AFBC:
-                return MALI_TEXTURE_AFBC;
-        case PAN_TILED:
-                assert(!is_depth);
-                return MALI_TEXTURE_TILED;
-        case PAN_LINEAR:
-                return is_depth ? MALI_TEXTURE_TILED : MALI_TEXTURE_LINEAR;
-        default:
-                unreachable("Invalid texture layout");
-        }
-}
-
  static mali_ptr
  panfrost_upload_tex(
          struct panfrost_context *ctx,
@@ -612,55 +481,18 @@ panfrost_upload_tex(
          struct pipe_sampler_view *pview = &view->base;
          struct panfrost_resource *rsrc = pan_resource(pview->texture);
  
-        /* Do we interleave an explicit stride with every element? */
-
-        bool has_manual_stride = view->manual_stride;
-
-        /* For easy access */
-
-        bool is_buffer = pview->target == PIPE_BUFFER;
-        unsigned first_level = is_buffer ? 0 : pview->u.tex.first_level;
-        unsigned last_level  = is_buffer ? 0 : pview->u.tex.last_level;
-        unsigned first_layer = is_buffer ? 0 : pview->u.tex.first_layer;
-        unsigned last_layer  = is_buffer ? 0 : pview->u.tex.last_layer;
-
-        /* Lower-bit is set when sampling from colour AFBC */
-        bool is_afbc = rsrc->layout == PAN_AFBC;
-        bool is_zs = rsrc->base.bind & PIPE_BIND_DEPTH_STENCIL;
-        unsigned afbc_bit = (is_afbc && !is_zs) ? 1 : 0;
-
          /* Add the BO to the job so it's retained until the job is done. */
          struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
+
          panfrost_batch_add_bo(batch, rsrc->bo,
                                PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
                                panfrost_bo_access_for_stage(st));
  
-        /* Add the usage flags in, since they can change across the CSO
-         * lifetime due to layout switches */
-
-        view->hw.format.layout = panfrost_layout_for_texture(rsrc);
-        view->hw.format.manual_stride = has_manual_stride;
-
-        /* Inject the addresses in, interleaving mip levels, cube faces, and
-         * strides in that order */
-
-        unsigned idx = 0;
-
-        for (unsigned l = first_level; l <= last_level; ++l) {
-                for (unsigned f = first_layer; f <= last_layer; ++f) {
-
-                        view->hw.payload[idx++] =
-                                panfrost_get_texture_address(rsrc, l, f) + afbc_bit;
-
-                        if (has_manual_stride) {
-                                view->hw.payload[idx++] =
-                                        rsrc->slices[l].stride;
-                        }
-                }
-        }
+        panfrost_batch_add_bo(batch, view->bo,
+                              PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
+                              panfrost_bo_access_for_stage(st));
  
-        return panfrost_upload_transient(batch, &view->hw,
-                                         sizeof(struct mali_texture_descriptor));
+        return view->bo->gpu;
  }
  
  static void
@@ -761,6 +593,29 @@ static void panfrost_upload_ssbo_sysval(
          uniform->u[2] = sb.buffer_size;
  }
  
+static void
+panfrost_upload_sampler_sysval(
+                struct panfrost_context *ctx,
+                enum pipe_shader_type st,
+                unsigned sampler_index,
+                struct sysval_uniform *uniform)
+{
+        struct pipe_sampler_state *sampl =
+                &ctx->samplers[st][sampler_index]->base;
+
+        uniform->f[0] = sampl->min_lod;
+        uniform->f[1] = sampl->max_lod;
+        uniform->f[2] = sampl->lod_bias;
+
+        /* Even without any errata, Midgard represents "no mipmapping" as
+         * fixing the LOD with the clamps; keep behaviour consistent. c.f.
+         * panfrost_create_sampler_state which also explains our choice of
+         * epsilon value (again to keep behaviour consistent) */
+
+        if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
+                uniform->f[1] = uniform->f[0] + (1.0/256.0);
+}
+
  static void panfrost_upload_num_work_groups_sysval(struct panfrost_context *ctx,
                  struct sysval_uniform *uniform)
  {
@@ -796,7 +651,10 @@ static void panfrost_upload_sysvals(struct panfrost_context *ctx, void *buf,
                  case PAN_SYSVAL_NUM_WORK_GROUPS:
                          panfrost_upload_num_work_groups_sysval(ctx, &uniforms[i]);
                          break;
-
+                case PAN_SYSVAL_SAMPLER:
+                        panfrost_upload_sampler_sysval(ctx, st, PAN_SYSVAL_ID(sysval),
+                                                    &uniforms[i]);
+                        break;
                  default:
                          assert(0);
                  }
@@ -833,9 +691,11 @@ panfrost_map_constant_buffer_gpu(
                                        PAN_BO_ACCESS_SHARED |
                                        PAN_BO_ACCESS_READ |
                                        panfrost_bo_access_for_stage(st));
-                return rsrc->bo->gpu;
+
+                /* Alignment gauranteed by PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
+                return rsrc->bo->gpu + cb->buffer_offset;
         } else if (cb->user_buffer) {
-                return panfrost_upload_transient(batch, cb->user_buffer, cb->buffer_size);
+                return panfrost_upload_transient(batch, cb->user_buffer + cb->buffer_offset, cb->buffer_size);
         } else {
                  unreachable("No constant buffer");
          }
@@ -853,20 +713,25 @@ panfrost_ubo_count(struct panfrost_context *ctx, enum pipe_shader_type stage)
          return 32 - __builtin_clz(mask);
  }
  
-/* Fixes up a shader state with current state, returning a GPU address to the
- * patched shader */
+/* Fixes up a shader state with current state */
  
-static mali_ptr
-panfrost_patch_shader_state(
-        struct panfrost_context *ctx,
-        struct panfrost_shader_state *ss,
-        enum pipe_shader_type stage,
-        bool should_upload)
+static void
+panfrost_patch_shader_state(struct panfrost_context *ctx,
+                            enum pipe_shader_type stage)
  {
+        struct panfrost_shader_variants *all = ctx->shader[stage];
+
+        if (!all) {
+                ctx->payloads[stage].postfix.shader = 0;
+                return;
+        }
+
+        struct panfrost_shader_state *ss = &all->variants[all->active_variant];
+
          ss->tripipe->texture_count = ctx->sampler_view_count[stage];
          ss->tripipe->sampler_count = ctx->sampler_count[stage];
  
-        ss->tripipe->midgard1.flags = 0x220;
+        ss->tripipe->midgard1.flags_lo = 0x220;
  
          unsigned ubo_count = panfrost_ubo_count(ctx, stage);
          ss->tripipe->midgard1.uniform_buffer_count = ubo_count;
@@ -879,36 +744,9 @@ panfrost_patch_shader_state(
                                PAN_BO_ACCESS_READ |
                               panfrost_bo_access_for_stage(stage));
  
-        /* We can't reuse over frames; that's not safe. The descriptor must be
-         * transient uploaded */
-
-        if (should_upload) {
-                return panfrost_upload_transient(batch, ss->tripipe,
-                                                 sizeof(struct mali_shader_meta));
-        }
-
-        /* If we don't need an upload, don't bother */
-        return 0;
-
-}
-
-static void
-panfrost_patch_shader_state_compute(
-        struct panfrost_context *ctx,
-        enum pipe_shader_type stage,
-        bool should_upload)
-{
-        struct panfrost_shader_variants *all = ctx->shader[stage];
-
-        if (!all) {
-                ctx->payloads[stage].postfix.shader = 0;
-                return;
-        }
-
-        struct panfrost_shader_state *s = &all->variants[all->active_variant];
-
-        ctx->payloads[stage].postfix.shader =
-                panfrost_patch_shader_state(ctx, s, stage, should_upload);
+        ctx->payloads[stage].postfix.shader = panfrost_upload_transient(batch,
+                                        ss->tripipe,
+                                        sizeof(struct mali_shader_meta));
  }
  
  /* Go through dirty flags and actualise them in the cmdstream. */
@@ -930,9 +768,9 @@ panfrost_emit_for_draw(struct panfrost_context *ctx, bool with_vertex_data)
                  panfrost_emit_varying_descriptor(ctx, total_count);
          }
  
-        bool msaa = ctx->rasterizer->base.multisample;
  
-        if (ctx->dirty & PAN_DIRTY_RASTERIZER) {
+        if (ctx->rasterizer) {
+                bool msaa = ctx->rasterizer->base.multisample;
                  ctx->payloads[PIPE_SHADER_FRAGMENT].gl_enables = ctx->rasterizer->tiler_gl_enables;
  
                  /* TODO: Sample size */
@@ -947,29 +785,28 @@ panfrost_emit_for_draw(struct panfrost_context *ctx, bool with_vertex_data)
                  ctx->payloads[PIPE_SHADER_FRAGMENT].postfix.occlusion_counter = ctx->occlusion_query->bo->gpu;
          }
  
-        panfrost_patch_shader_state_compute(ctx, PIPE_SHADER_VERTEX, true);
-        panfrost_patch_shader_state_compute(ctx, PIPE_SHADER_COMPUTE, true);
+        panfrost_patch_shader_state(ctx, PIPE_SHADER_VERTEX);
+        panfrost_patch_shader_state(ctx, PIPE_SHADER_COMPUTE);
  
-        if (ctx->dirty & (PAN_DIRTY_RASTERIZER | PAN_DIRTY_VS)) {
+        if (ctx->shader[PIPE_SHADER_VERTEX] && ctx->shader[PIPE_SHADER_FRAGMENT]) {
                  /* Check if we need to link the gl_PointSize varying */
                  if (!panfrost_writes_point_size(ctx)) {
                          /* If the size is constant, write it out. Otherwise,
                           * don't touch primitive_size (since we would clobber
                           * the pointer there) */
  
-                        ctx->payloads[PIPE_SHADER_FRAGMENT].primitive_size.constant = ctx->rasterizer->base.line_width;
+                        bool points = ctx->payloads[PIPE_SHADER_FRAGMENT].prefix.draw_mode == MALI_POINTS;
+
+                        ctx->payloads[PIPE_SHADER_FRAGMENT].primitive_size.constant = points ?
+                                ctx->rasterizer->base.point_size :
+                                ctx->rasterizer->base.line_width;
                  }
          }
  
-        /* TODO: Maybe dirty track FS, maybe not. For now, it's transient. */
-        if (ctx->shader[PIPE_SHADER_FRAGMENT])
-                ctx->dirty |= PAN_DIRTY_FS;
-
-        if (ctx->dirty & PAN_DIRTY_FS) {
-                assert(ctx->shader[PIPE_SHADER_FRAGMENT]);
+        if (ctx->shader[PIPE_SHADER_FRAGMENT]) {
                  struct panfrost_shader_state *variant = &ctx->shader[PIPE_SHADER_FRAGMENT]->variants[ctx->shader[PIPE_SHADER_FRAGMENT]->active_variant];
  
-                panfrost_patch_shader_state(ctx, variant, PIPE_SHADER_FRAGMENT, false);
+                panfrost_patch_shader_state(ctx, PIPE_SHADER_FRAGMENT);
  
  #define COPY(name) ctx->fragment_shader_core.name = variant->tripipe->name
  
@@ -981,8 +818,8 @@ panfrost_emit_for_draw(struct panfrost_context *ctx, bool with_vertex_data)
                  COPY(midgard1.uniform_count);
                  COPY(midgard1.uniform_buffer_count);
                  COPY(midgard1.work_count);
-                COPY(midgard1.flags);
-                COPY(midgard1.unknown2);
+                COPY(midgard1.flags_lo);
+                COPY(midgard1.flags_hi);
  
  #undef COPY
  
@@ -990,9 +827,12 @@ panfrost_emit_for_draw(struct panfrost_context *ctx, bool with_vertex_data)
                  unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
  
                  struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
+                unsigned shader_offset = 0;
+                struct panfrost_bo *shader_bo = NULL;
  
-                for (unsigned c = 0; c < rt_count; ++c)
-                        blend[c] = panfrost_get_blend_for_context(ctx, c);
+                for (unsigned c = 0; c < rt_count; ++c) {
+                        blend[c] = panfrost_get_blend_for_context(ctx, c, &shader_bo, &shader_offset);
+                }
  
                  /* If there is a blend shader, work registers are shared. XXX: opt */
  
@@ -1001,22 +841,22 @@ panfrost_emit_for_draw(struct panfrost_context *ctx, bool with_vertex_data)
                                  ctx->fragment_shader_core.midgard1.work_count = 16;
                  }
  
-                /* Set late due to depending on render state */
-                unsigned flags = ctx->fragment_shader_core.midgard1.flags;
-
                  /* Depending on whether it's legal to in the given shader, we
                   * try to enable early-z testing (or forward-pixel kill?) */
  
-                if (!variant->can_discard)
-                        flags |= MALI_EARLY_Z;
+                SET_BIT(ctx->fragment_shader_core.midgard1.flags_lo, MALI_EARLY_Z,
+                        !variant->can_discard && !variant->writes_depth);
+
+                /* Add the writes Z/S flags if needed. */
+                SET_BIT(ctx->fragment_shader_core.midgard1.flags_lo,
+                        MALI_WRITES_Z, variant->writes_depth);
+                SET_BIT(ctx->fragment_shader_core.midgard1.flags_hi,
+                        MALI_WRITES_S, variant->writes_stencil);
  
                  /* Any time texturing is used, derivatives are implicitly
                   * calculated, so we need to enable helper invocations */
  
-                if (variant->helper_invocations)
-                        flags |= MALI_HELPER_INVOCATIONS;
-
-                ctx->fragment_shader_core.midgard1.flags = flags;
+                SET_BIT(ctx->fragment_shader_core.midgard1.flags_lo, MALI_HELPER_INVOCATIONS, variant->helper_invocations);
  
                  /* Assign the stencil refs late */
  
@@ -1034,41 +874,41 @@ panfrost_emit_for_draw(struct panfrost_context *ctx, bool with_vertex_data)
                   * thing?" by Peter Harris
                   */
  
-                if (variant->can_discard) {
-                        ctx->fragment_shader_core.unknown2_3 |= MALI_CAN_DISCARD;
-                        ctx->fragment_shader_core.midgard1.flags |= 0x400;
-                }
+                SET_BIT(ctx->fragment_shader_core.unknown2_3, MALI_CAN_DISCARD, variant->can_discard);
+                SET_BIT(ctx->fragment_shader_core.midgard1.flags_lo, 0x400, variant->can_discard);
  
                  /* Even on MFBD, the shader descriptor gets blend shaders. It's
                   * *also* copied to the blend_meta appended (by convention),
                   * but this is the field actually read by the hardware. (Or
-                 * maybe both are read...?) */
+                 * maybe both are read...?). Specify the last RTi with a blend
+                 * shader. */
  
-                if (blend[0].is_shader) {
-                        ctx->fragment_shader_core.blend.shader =
-                                blend[0].shader.bo->gpu | blend[0].shader.first_tag;
-                } else {
-                        ctx->fragment_shader_core.blend.shader = 0;
+                ctx->fragment_shader_core.blend.shader = 0;
+
+                for (signed rt = (rt_count - 1); rt >= 0; --rt) {
+                        if (blend[rt].is_shader) {
+                                ctx->fragment_shader_core.blend.shader =
+                                        blend[rt].shader.gpu | blend[rt].shader.first_tag;
+                                break;
+                        }
                  }
  
-                if (screen->require_sfbd) {
+                if (screen->quirks & MIDGARD_SFBD) {
                          /* When only a single render target platform is used, the blend
                           * information is inside the shader meta itself. We
                           * additionally need to signal CAN_DISCARD for nontrivial blend
                           * modes (so we're able to read back the destination buffer) */
  
-                        if (blend[0].is_shader) {
-                                ctx->fragment_shader_core.unknown2_3 |= MALI_HAS_BLEND_SHADER;
-                        } else {
+                        SET_BIT(ctx->fragment_shader_core.unknown2_3, MALI_HAS_BLEND_SHADER, blend[0].is_shader);
+
+                        if (!blend[0].is_shader) {
                                  ctx->fragment_shader_core.blend.equation =
                                          *blend[0].equation.equation;
                                  ctx->fragment_shader_core.blend.constant =
                                          blend[0].equation.constant;
                          }
  
-                        if (!blend[0].no_blending) {
-                                ctx->fragment_shader_core.unknown2_3 |= MALI_CAN_DISCARD;
-                        }
+                        SET_BIT(ctx->fragment_shader_core.unknown2_3, MALI_CAN_DISCARD, !blend[0].no_blending);
                  }
  
                  size_t size = sizeof(struct mali_shader_meta) + (sizeof(struct midgard_blend_rt) * rt_count);
@@ -1077,7 +917,7 @@ panfrost_emit_for_draw(struct panfrost_context *ctx, bool with_vertex_data)
  
                  ctx->payloads[PIPE_SHADER_FRAGMENT].postfix.shader = transfer.gpu;
  
-                if (!screen->require_sfbd) {
+                if (!(screen->quirks & MIDGARD_SFBD)) {
                          /* Additional blend descriptor tacked on for jobs using MFBD */
  
                          struct midgard_blend_rt rts[4];
@@ -1095,16 +935,8 @@ panfrost_emit_for_draw(struct panfrost_context *ctx, bool with_vertex_data)
                                  SET_BIT(rts[i].flags, MALI_BLEND_SRGB, is_srgb);
                                  SET_BIT(rts[i].flags, MALI_BLEND_NO_DITHER, !ctx->blend->base.dither);
  
-                                /* TODO: sRGB in blend shaders is currently
-                                 * unimplemented. Contact me (Alyssa) if you're
-                                 * interested in working on this. We have
-                                 * native Midgard ops for helping here, but
-                                 * they're not well-understood yet. */
-
-                                assert(!(is_srgb && blend[i].is_shader));
-
                                  if (blend[i].is_shader) {
-                                        rts[i].blend.shader = blend[i].shader.bo->gpu | blend[i].shader.first_tag;
+                                        rts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
                                  } else {
                                          rts[i].blend.equation = *blend[i].equation.equation;
                                          rts[i].blend.constant = blend[i].equation.constant;
@@ -1119,11 +951,8 @@ panfrost_emit_for_draw(struct panfrost_context *ctx, bool with_vertex_data)
          if (ctx->vertex)
                  panfrost_stage_attributes(ctx);
  
-        if (ctx->dirty & PAN_DIRTY_SAMPLERS)
-                panfrost_upload_sampler_descriptors(ctx);
-
-        if (ctx->dirty & PAN_DIRTY_TEXTURES)
-                panfrost_upload_texture_descriptors(ctx);
+        panfrost_upload_sampler_descriptors(ctx);
+        panfrost_upload_texture_descriptors(ctx);
  
          const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
  
@@ -1150,7 +979,7 @@ panfrost_emit_for_draw(struct panfrost_context *ctx, bool with_vertex_data)
                  panfrost_upload_sysvals(ctx, transfer.cpu, ss, i);
  
                  /* Upload uniforms */
-                if (has_uniforms) {
+                if (has_uniforms && uniform_size) {
                          const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
                          memcpy(transfer.cpu + sys_size, cpu, uniform_size);
                  }
@@ -1167,12 +996,11 @@ panfrost_emit_for_draw(struct panfrost_context *ctx, bool with_vertex_data)
                  unsigned ubo_count = panfrost_ubo_count(ctx, i);
                  assert(ubo_count >= 1);
  
-                size_t sz = sizeof(struct mali_uniform_buffer_meta) * ubo_count;
-                struct mali_uniform_buffer_meta ubos[PAN_MAX_CONST_BUFFERS];
+                size_t sz = sizeof(uint64_t) * ubo_count;
+                uint64_t ubos[PAN_MAX_CONST_BUFFERS];
  
                  /* Upload uniforms as a UBO */
-                ubos[0].size = MALI_POSITIVE((2 + uniform_count));
-                ubos[0].ptr = transfer.gpu >> 2;
+                ubos[0] = MALI_MAKE_UBO(2 + uniform_count, transfer.gpu);
  
                  /* The rest are honest-to-goodness UBOs */
  
@@ -1184,9 +1012,7 @@ panfrost_emit_for_draw(struct panfrost_context *ctx, bool with_vertex_data)
  
                          if (!enabled || empty) {
                                  /* Stub out disabled UBOs to catch accesses */
-
-                                ubos[ubo].size = 0;
-                                ubos[ubo].ptr = 0xDEAD0000;
+                                ubos[ubo] = MALI_MAKE_UBO(0, 0xDEAD0000);
                                  continue;
                          }
  
@@ -1194,10 +1020,7 @@ panfrost_emit_for_draw(struct panfrost_context *ctx, bool with_vertex_data)
  
                          unsigned bytes_per_field = 16;
                          unsigned aligned = ALIGN_POT(usz, bytes_per_field);
-                        unsigned fields = aligned / bytes_per_field;
-
-                        ubos[ubo].size = MALI_POSITIVE(fields);
-                        ubos[ubo].ptr = gpu >> 2;
+                        ubos[ubo] = MALI_MAKE_UBO(aligned / bytes_per_field, gpu);
                  }
  
                  mali_ptr ubufs = panfrost_upload_transient(batch, ubos, sz);
@@ -1303,8 +1126,6 @@ panfrost_emit_for_draw(struct panfrost_context *ctx, bool with_vertex_data)
                  panfrost_upload_transient(batch,
                                            &view,
                                            sizeof(struct mali_viewport));
-
-        ctx->dirty = 0;
  }
  
  /* Corresponds to exactly one draw, but does not submit anything */
@@ -1320,20 +1141,33 @@ panfrost_queue_draw(struct panfrost_context *ctx)
          bool rasterizer_discard = ctx->rasterizer
                                    && ctx->rasterizer->base.rasterizer_discard;
  
-        struct panfrost_transfer vertex = panfrost_vertex_tiler_job(ctx, false);
-        struct panfrost_transfer tiler;
  
-        if (!rasterizer_discard)
-                tiler = panfrost_vertex_tiler_job(ctx, true);
+        struct midgard_payload_vertex_tiler *vertex_payload = &ctx->payloads[PIPE_SHADER_VERTEX];
+        struct midgard_payload_vertex_tiler *tiler_payload = &ctx->payloads[PIPE_SHADER_FRAGMENT];
  
          struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
+        bool wallpapering = ctx->wallpaper_batch && batch->tiler_dep;
  
-        if (rasterizer_discard)
-                panfrost_scoreboard_queue_vertex_job(batch, vertex, FALSE);
-        else if (ctx->wallpaper_batch && batch->first_tiler.gpu)
-                panfrost_scoreboard_queue_fused_job_prepend(batch, vertex, tiler);
-        else
-                panfrost_scoreboard_queue_fused_job(batch, vertex, tiler);
+        if (wallpapering) {
+                /* Inject in reverse order, with "predicted" job indices. THIS IS A HACK XXX */
+                panfrost_new_job(batch, JOB_TYPE_TILER, false, batch->job_index + 2, tiler_payload, sizeof(*tiler_payload), true);
+                panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0, vertex_payload, sizeof(*vertex_payload), true);
+        } else  {
+                unsigned vertex = panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0, vertex_payload, sizeof(*vertex_payload), false);
+
+                if (!rasterizer_discard)
+                        panfrost_new_job(batch, JOB_TYPE_TILER, false, vertex, tiler_payload, sizeof(*tiler_payload), false);
+        }
+
+        for (unsigned i = 0; i < PIPE_SHADER_TYPES; ++i) {
+                struct panfrost_shader_variants *all = ctx->shader[i];
+
+                if (!all)
+                        continue;
+
+                struct panfrost_shader_state *ss = &all->variants[all->active_variant];
+                batch->stack_size = MAX2(batch->stack_size, ss->stack_size);
+        }
  }
  
  /* The entire frame is in memory -- send it off to the kernel! */
@@ -1375,6 +1209,9 @@ panfrost_flush(
  
                  util_dynarray_fini(&fences);
          }
+
+        if (pan_debug & PAN_DBG_TRACE)
+                pandecode_next_frame();
  }
  
  #define DEFINE_CASE(c) case PIPE_PRIM_##c: return MALI_##c;
@@ -1420,15 +1257,27 @@ panfrost_translate_index_size(unsigned size)
  }
  
  /* Gets a GPU address for the associated index buffer. Only gauranteed to be
- * good for the duration of the draw (transient), could last longer */
+ * good for the duration of the draw (transient), could last longer. Also get
+ * the bounds on the index buffer for the range accessed by the draw. We do
+ * these operations together because there are natural optimizations which
+ * require them to be together. */
  
  static mali_ptr
-panfrost_get_index_buffer_mapped(struct panfrost_context *ctx, const struct pipe_draw_info *info)
+panfrost_get_index_buffer_bounded(struct panfrost_context *ctx, const struct pipe_draw_info *info, unsigned *min_index, unsigned *max_index)
  {
          struct panfrost_resource *rsrc = (struct panfrost_resource *) (info->index.resource);
  
          off_t offset = info->start * info->index_size;
          struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
+        mali_ptr out = 0;
+
+        bool needs_indices = true;
+
+        if (info->max_index != ~0u) {
+                *min_index = info->min_index;
+                *max_index = info->max_index;
+                needs_indices = false;
+        }
  
          if (!info->has_user_indices) {
                  /* Only resources can be directly mapped */
@@ -1436,12 +1285,29 @@ panfrost_get_index_buffer_mapped(struct panfrost_context *ctx, const struct pipe
                                        PAN_BO_ACCESS_SHARED |
                                        PAN_BO_ACCESS_READ |
                                        PAN_BO_ACCESS_VERTEX_TILER);
-                return rsrc->bo->gpu + offset;
+                out = rsrc->bo->gpu + offset;
+
+                /* Check the cache */
+                needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache, info->start, info->count,
+                                                           min_index, max_index);
          } else {
                  /* Otherwise, we need to upload to transient memory */
                  const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
-                return panfrost_upload_transient(batch, ibuf8 + offset, info->count * info->index_size);
+                out = panfrost_upload_transient(batch, ibuf8 + offset, info->count * info->index_size);
          }
+
+        if (needs_indices) {
+                /* Fallback */
+                u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
+
+                if (!info->has_user_indices) {
+                        panfrost_minmax_cache_add(rsrc->index_cache, info->start, info->count,
+                                                  *min_index, *max_index);
+                }
+        }
+
+
+        return out;
  }
  
  static bool
@@ -1549,38 +1415,19 @@ panfrost_draw_vbo(
          if (info->primitive_restart)
                  draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
  
-        /* For higher amounts of vertices (greater than what fits in a 16-bit
-         * short), the other value is needed, otherwise there will be bizarre
-         * rendering artefacts. It's not clear what these values mean yet. This
-         * change is also needed for instancing and sometimes points (perhaps
-         * related to dynamically setting gl_PointSize) */
+        /* These doesn't make much sense */
  
-        bool is_points = mode == PIPE_PRIM_POINTS;
-        bool many_verts = ctx->vertex_count > 0xFFFF;
-        bool instanced = ctx->instance_count > 1;
+        draw_flags |= 0x3000;
  
-        draw_flags |= (is_points || many_verts || instanced) ? 0x3000 : 0x18000;
-
-        /* This doesn't make much sense */
-        if (mode == PIPE_PRIM_LINE_STRIP) {
-                draw_flags |= 0x800;
-        }
+        if (ctx->rasterizer && ctx->rasterizer->base.flatshade_first)
+                draw_flags |= MALI_DRAW_FLATSHADE_FIRST;
  
          panfrost_statistics_record(ctx, info);
  
          if (info->index_size) {
-                /* Calculate the min/max index used so we can figure out how
-                 * many times to invoke the vertex shader */
-
-                /* Fetch / calculate index bounds */
                  unsigned min_index = 0, max_index = 0;
-
-                if (info->max_index == ~0u) {
-                        u_vbuf_get_minmax_index(pipe, info, &min_index, &max_index);
-                } else {
-                        min_index = info->min_index;
-                        max_index = info->max_index;
-                }
+                ctx->payloads[PIPE_SHADER_FRAGMENT].prefix.indices =
+                        panfrost_get_index_buffer_bounded(ctx, info, &min_index, &max_index);
  
                  /* Use the corresponding values */
                  vertex_count = max_index - min_index + 1;
@@ -1590,10 +1437,7 @@ panfrost_draw_vbo(
                  ctx->payloads[PIPE_SHADER_FRAGMENT].prefix.offset_bias_correction = -min_index;
                  ctx->payloads[PIPE_SHADER_FRAGMENT].prefix.index_count = MALI_POSITIVE(info->count);
  
-                //assert(!info->restart_index); /* TODO: Research */
-
                  draw_flags |= panfrost_translate_index_size(info->index_size);
-                ctx->payloads[PIPE_SHADER_FRAGMENT].prefix.indices = panfrost_get_index_buffer_mapped(ctx, info);
          } else {
                  /* Index count == vertex count, if no indexing is applied, as
                   * if it is internally indexed in the expected order */
@@ -1602,7 +1446,7 @@ panfrost_draw_vbo(
                  ctx->payloads[PIPE_SHADER_FRAGMENT].prefix.index_count = MALI_POSITIVE(ctx->vertex_count);
  
                  /* Reverse index state */
-                ctx->payloads[PIPE_SHADER_FRAGMENT].prefix.indices = (u64) NULL;
+                ctx->payloads[PIPE_SHADER_FRAGMENT].prefix.indices = (mali_ptr) 0;
          }
  
          /* Dispatch "compute jobs" for the vertex/tiler pair as (1,
@@ -1619,24 +1463,16 @@ panfrost_draw_vbo(
          /* Encode the padded vertex count */
  
          if (info->instance_count > 1) {
-                /* Triangles have non-even vertex counts so they change how
-                 * padding works internally */
-
-                bool is_triangle =
-                        mode == PIPE_PRIM_TRIANGLES ||
-                        mode == PIPE_PRIM_TRIANGLE_STRIP ||
-                        mode == PIPE_PRIM_TRIANGLE_FAN;
-
-                struct pan_shift_odd so =
-                        panfrost_padded_vertex_count(vertex_count, !is_triangle);
+                ctx->padded_count = panfrost_padded_vertex_count(vertex_count);
  
-                ctx->payloads[PIPE_SHADER_VERTEX].instance_shift = so.shift;
-                ctx->payloads[PIPE_SHADER_FRAGMENT].instance_shift = so.shift;
+                unsigned shift = __builtin_ctz(ctx->padded_count);
+                unsigned k = ctx->padded_count >> (shift + 1);
  
-                ctx->payloads[PIPE_SHADER_VERTEX].instance_odd = so.odd;
-                ctx->payloads[PIPE_SHADER_FRAGMENT].instance_odd = so.odd;
+                ctx->payloads[PIPE_SHADER_VERTEX].instance_shift = shift;
+                ctx->payloads[PIPE_SHADER_FRAGMENT].instance_shift = shift;
  
-                ctx->padded_count = pan_expand_shift_odd(so);
+                ctx->payloads[PIPE_SHADER_VERTEX].instance_odd = k;
+                ctx->payloads[PIPE_SHADER_FRAGMENT].instance_odd = k;
          } else {
                  ctx->padded_count = vertex_count;
  
@@ -1699,13 +1535,11 @@ panfrost_bind_rasterizer_state(
  {
          struct panfrost_context *ctx = pan_context(pctx);
  
-        /* TODO: Why can't rasterizer be NULL ever? Other drivers are fine.. */
+        ctx->rasterizer = hwcso;
+
          if (!hwcso)
                  return;
  
-        ctx->rasterizer = hwcso;
-        ctx->dirty |= PAN_DIRTY_RASTERIZER;
-
          ctx->fragment_shader_core.depth_units = ctx->rasterizer->base.offset_units * 2.0f;
          ctx->fragment_shader_core.depth_factor = ctx->rasterizer->base.offset_scale;
  
@@ -1760,15 +1594,14 @@ panfrost_bind_vertex_elements_state(
          void *hwcso)
  {
          struct panfrost_context *ctx = pan_context(pctx);
-
          ctx->vertex = hwcso;
-        ctx->dirty |= PAN_DIRTY_VERTEX;
  }
  
  static void *
  panfrost_create_shader_state(
          struct pipe_context *pctx,
-        const struct pipe_shader_state *cso)
+        const struct pipe_shader_state *cso,
+        enum pipe_shader_type stage)
  {
          struct panfrost_shader_variants *so = CALLOC_STRUCT(panfrost_shader_variants);
          so->base = *cso;
@@ -1778,6 +1611,21 @@ panfrost_create_shader_state(
          if (cso->type == PIPE_SHADER_IR_TGSI)
                  so->base.tokens = tgsi_dup_tokens(so->base.tokens);
  
+        /* Precompile for shader-db if we need to */
+        if (unlikely((pan_debug & PAN_DBG_PRECOMPILE) && cso->type == PIPE_SHADER_IR_NIR)) {
+                struct panfrost_context *ctx = pan_context(pctx);
+
+                struct mali_shader_meta meta;
+                struct panfrost_shader_state state;
+                uint64_t outputs_written;
+
+                panfrost_shader_compile(ctx, &meta,
+                              PIPE_SHADER_IR_NIR,
+                                      so->base.ir.nir,
+                                        tgsi_processor_to_shader_stage(stage), &state,
+                                        &outputs_written);
+        }
+
          return so;
  }
  
@@ -1797,6 +1645,7 @@ panfrost_delete_shader_state(
                  panfrost_bo_unreference(shader_state->bo);
                  shader_state->bo = NULL;
          }
+        free(cso->variants);
  
          free(so);
  }
@@ -1826,15 +1675,18 @@ panfrost_create_sampler_state(
                  .wrap_s = translate_tex_wrap(cso->wrap_s),
                  .wrap_t = translate_tex_wrap(cso->wrap_t),
                  .wrap_r = translate_tex_wrap(cso->wrap_r),
-                .compare_func = panfrost_translate_alt_compare_func(cso->compare_func),
+                .compare_func = panfrost_flip_compare_func(
+                                panfrost_translate_compare_func(
+                                        cso->compare_func)),
                  .border_color = {
                          cso->border_color.f[0],
                          cso->border_color.f[1],
                          cso->border_color.f[2],
                          cso->border_color.f[3]
                  },
-                .min_lod = FIXED_16(cso->min_lod),
-                .max_lod = FIXED_16(cso->max_lod),
+                .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
+                .max_lod = FIXED_16(cso->max_lod, false),
+                .lod_bias = FIXED_16(cso->lod_bias, true), /* can be negative */
                  .seamless_cube_map = cso->seamless_cube_map,
          };
  
@@ -1843,16 +1695,17 @@ panfrost_create_sampler_state(
           * essentially -- remember these are fixed point numbers, so
           * epsilon=1/256) */
  
-        if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
+        if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) {
                  sampler_descriptor.max_lod = sampler_descriptor.min_lod;
  
-        /* Enforce that there is something in the middle by adding epsilon*/
+                /* Enforce that there is something in the middle by adding epsilon*/
  
-        if (sampler_descriptor.min_lod == sampler_descriptor.max_lod)
-                sampler_descriptor.max_lod++;
+                if (sampler_descriptor.min_lod == sampler_descriptor.max_lod)
+                        sampler_descriptor.max_lod++;
  
-        /* Sanity check */
-        assert(sampler_descriptor.max_lod > sampler_descriptor.min_lod);
+                /* Sanity check */
+                assert(sampler_descriptor.max_lod > sampler_descriptor.min_lod);
+        }
  
          so->hw = sampler_descriptor;
  
@@ -1873,8 +1726,6 @@ panfrost_bind_sampler_states(
          /* XXX: Should upload, not just copy? */
          ctx->sampler_count[shader] = num_sampler;
          memcpy(ctx->samplers[shader], sampler, num_sampler * sizeof (void *));
-
-        ctx->dirty |= PAN_DIRTY_SAMPLERS;
  }
  
  static bool
@@ -1943,7 +1794,7 @@ update_so_info(struct pipe_stream_output_info *so_info,
                 uint64_t outputs_written)
  {
         uint64_t so_outputs = 0;
-       uint8_t reverse_map[64] = {};
+       uint8_t reverse_map[64] = {0};
         unsigned slot = 0;
  
         while (outputs_written)
@@ -1968,14 +1819,8 @@ panfrost_bind_shader_state(
          enum pipe_shader_type type)
  {
          struct panfrost_context *ctx = pan_context(pctx);
-
          ctx->shader[type] = hwcso;
  
-        if (type == PIPE_SHADER_FRAGMENT)
-                ctx->dirty |= PAN_DIRTY_FS;
-        else
-                ctx->dirty |= PAN_DIRTY_VS;
-
          if (!hwcso) return;
  
          /* Match the appropriate variant */
@@ -1993,7 +1838,25 @@ panfrost_bind_shader_state(
          if (variant == -1) {
                  /* No variant matched, so create a new one */
                  variant = variants->variant_count++;
-                assert(variants->variant_count < MAX_SHADER_VARIANTS);
+
+                if (variants->variant_count > variants->variant_space) {
+                        unsigned old_space = variants->variant_space;
+
+                        variants->variant_space *= 2;
+                        if (variants->variant_space == 0)
+                                variants->variant_space = 1;
+
+                        /* Arbitrary limit to stop runaway programs from
+                         * creating an unbounded number of shader variants. */
+                        assert(variants->variant_space < 1024);
+
+                        unsigned msize = sizeof(struct panfrost_shader_state);
+                        variants->variants = realloc(variants->variants,
+                                                     variants->variant_space * msize);
+
+                        memset(&variants->variants[old_space], 0,
+                               (variants->variant_space - old_space) * msize);
+                }
  
                  struct panfrost_shader_state *v =
                                  &variants->variants[variant];
@@ -2043,6 +1906,18 @@ panfrost_bind_shader_state(
          }
  }
  
+static void *
+panfrost_create_vs_state(struct pipe_context *pctx, const struct pipe_shader_state *hwcso)
+{
+        return panfrost_create_shader_state(pctx, hwcso, PIPE_SHADER_VERTEX);
+}
+
+static void *
+panfrost_create_fs_state(struct pipe_context *pctx, const struct pipe_shader_state *hwcso)
+{
+        return panfrost_create_shader_state(pctx, hwcso, PIPE_SHADER_FRAGMENT);
+}
+
  static void
  panfrost_bind_vs_state(struct pipe_context *pctx, void *hwcso)
  {
@@ -2097,9 +1972,6 @@ panfrost_set_stencil_ref(
  {
          struct panfrost_context *ctx = pan_context(pctx);
          ctx->stencil_ref = *ref;
-
-        /* Shader core dirty */
-        ctx->dirty |= PAN_DIRTY_FS;
  }
  
  static enum mali_texture_type
@@ -2134,8 +2006,8 @@ panfrost_create_sampler_view(
          struct pipe_resource *texture,
          const struct pipe_sampler_view *template)
  {
+        struct panfrost_screen *screen = pan_screen(pctx->screen);
          struct panfrost_sampler_view *so = rzalloc(pctx, struct panfrost_sampler_view);
-        int bytes_per_pixel = util_format_get_blocksize(texture->format);
  
          pipe_reference(NULL, &texture->reference);
  
@@ -2147,12 +2019,6 @@ panfrost_create_sampler_view(
          so->base.reference.count = 1;
          so->base.context = pctx;
  
-        /* sampler_views correspond to texture descriptors, minus the texture
-         * (data) itself. So, we serialise the descriptor here and cache it for
-         * later. */
-
-        const struct util_format_description *desc = util_format_description(prsrc->base.format);
-
          unsigned char user_swizzle[4] = {
                  template->swizzle_r,
                  template->swizzle_g,
@@ -2160,29 +2026,6 @@ panfrost_create_sampler_view(
                  template->swizzle_a
          };
  
-        enum mali_format format = panfrost_find_format(desc);
-
-        /* Check if we need to set a custom stride by computing the "expected"
-         * stride and comparing it to what the BO actually wants. Only applies
-         * to linear textures, since tiled/compressed textures have strict
-         * alignment requirements for their strides as it is */
-
-        unsigned first_level = template->u.tex.first_level;
-        unsigned last_level = template->u.tex.last_level;
-
-        if (prsrc->layout == PAN_LINEAR) {
-                for (unsigned l = first_level; l <= last_level; ++l) {
-                        unsigned actual_stride = prsrc->slices[l].stride;
-                        unsigned width = u_minify(texture->width0, l);
-                        unsigned comp_stride = width * bytes_per_pixel;
-
-                        if (comp_stride != actual_stride) {
-                                so->manual_stride = true;
-                                break;
-                        }
-                }
-        }
-
          /* In the hardware, array_size refers specifically to array textures,
           * whereas in Gallium, it also covers cubemaps */
  
@@ -2194,26 +2037,32 @@ panfrost_create_sampler_view(
                  array_size /= 6;
          }
  
-        struct mali_texture_descriptor texture_descriptor = {
-                .width = MALI_POSITIVE(u_minify(texture->width0, first_level)),
-                .height = MALI_POSITIVE(u_minify(texture->height0, first_level)),
-                .depth = MALI_POSITIVE(u_minify(texture->depth0, first_level)),
-                .array_size = MALI_POSITIVE(array_size),
-
-                .format = {
-                        .swizzle = panfrost_translate_swizzle_4(desc->swizzle),
-                        .format = format,
-                        .srgb = desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB,
-                        .type = panfrost_translate_texture_type(template->target),
-                        .unknown2 = 0x1,
-                },
-
-                .swizzle = panfrost_translate_swizzle_4(user_swizzle)
-        };
-
-        texture_descriptor.levels = last_level - first_level;
-
-        so->hw = texture_descriptor;
+        enum mali_texture_type type =
+                panfrost_translate_texture_type(template->target);
+
+        unsigned size = panfrost_estimate_texture_size(
+                        template->u.tex.first_level,
+                        template->u.tex.last_level,
+                        template->u.tex.first_layer,
+                        template->u.tex.last_layer,
+                        type, prsrc->layout);
+
+        so->bo = panfrost_bo_create(screen, size, 0);
+
+        panfrost_new_texture(
+                        so->bo->cpu,
+                        texture->width0, texture->height0,
+                        texture->depth0, array_size,
+                        texture->format,
+                        type, prsrc->layout,
+                        template->u.tex.first_level,
+                        template->u.tex.last_level,
+                        template->u.tex.first_layer,
+                        template->u.tex.last_layer,
+                        prsrc->cubemap_stride,
+                        panfrost_translate_swizzle_4(user_swizzle),
+                        prsrc->bo->gpu,
+                        prsrc->slices);
  
          return (struct pipe_sampler_view *) so;
  }
@@ -2226,27 +2075,34 @@ panfrost_set_sampler_views(
          struct pipe_sampler_view **views)
  {
          struct panfrost_context *ctx = pan_context(pctx);
+        unsigned new_nr = 0;
+        unsigned i;
  
          assert(start_slot == 0);
  
-        unsigned new_nr = 0;
-        for (unsigned i = 0; i < num_views; ++i) {
+        for (i = 0; i < num_views; ++i) {
                  if (views[i])
                          new_nr = i + 1;
+               pipe_sampler_view_reference((struct pipe_sampler_view **)&ctx->sampler_views[shader][i],
+                                           views[i]);
          }
  
+        for (; i < ctx->sampler_view_count[shader]; i++) {
+               pipe_sampler_view_reference((struct pipe_sampler_view **)&ctx->sampler_views[shader][i],
+                                           NULL);
+        }
          ctx->sampler_view_count[shader] = new_nr;
-        memcpy(ctx->sampler_views[shader], views, num_views * sizeof (void *));
-
-        ctx->dirty |= PAN_DIRTY_TEXTURES;
  }
  
  static void
  panfrost_sampler_view_destroy(
          struct pipe_context *pctx,
-        struct pipe_sampler_view *view)
+        struct pipe_sampler_view *pview)
  {
-        pipe_resource_reference(&view->texture, NULL);
+        struct panfrost_sampler_view *view = (struct panfrost_sampler_view *) pview;
+
+        pipe_resource_reference(&pview->texture, NULL);
+        panfrost_bo_unreference(view->bo);
          ralloc_free(view);
  }
  
@@ -2279,14 +2135,14 @@ panfrost_hint_afbc(
          for (unsigned i = 0; i < fb->nr_cbufs; ++i) {
                  struct pipe_surface *surf = fb->cbufs[i];
                  struct panfrost_resource *rsrc = pan_resource(surf->texture);
-                panfrost_resource_hint_layout(screen, rsrc, PAN_AFBC, 1);
+                panfrost_resource_hint_layout(screen, rsrc, MALI_TEXTURE_AFBC, 1);
          }
  
          /* Also hint it to the depth buffer */
  
          if (fb->zsbuf) {
                  struct panfrost_resource *rsrc = pan_resource(fb->zsbuf->texture);
-                panfrost_resource_hint_layout(screen, rsrc, PAN_AFBC, 1);
+                panfrost_resource_hint_layout(screen, rsrc, MALI_TEXTURE_AFBC, 1);
          }
  }
  
@@ -2352,8 +2208,6 @@ panfrost_bind_depth_stencil_state(struct pipe_context *pipe,
  
          /* Bounds test not implemented */
          assert(!depth_stencil->depth.bounds_test);
-
-        ctx->dirty |= PAN_DIRTY_FS;
  }
  
  static void
@@ -2494,7 +2348,7 @@ panfrost_begin_query(struct pipe_context *pipe, struct pipe_query *q)
                  break;
  
          default:
-                fprintf(stderr, "Skipping query %u\n", query->type);
+                DBG("Skipping query %u\n", query->type);
                  break;
          }
  
@@ -2626,7 +2480,6 @@ struct pipe_context *
  panfrost_create_context(struct pipe_screen *screen, void *priv, unsigned flags)
  {
          struct panfrost_context *ctx = rzalloc(screen, struct panfrost_context);
-        struct panfrost_screen *pscreen = pan_screen(screen);
          struct pipe_context *gallium = (struct pipe_context *) ctx;
  
          gallium->screen = screen;
@@ -2657,11 +2510,11 @@ panfrost_create_context(struct pipe_screen *screen, void *priv, unsigned flags)
          gallium->bind_vertex_elements_state = panfrost_bind_vertex_elements_state;
          gallium->delete_vertex_elements_state = panfrost_generic_cso_delete;
  
-        gallium->create_fs_state = panfrost_create_shader_state;
+        gallium->create_fs_state = panfrost_create_fs_state;
          gallium->delete_fs_state = panfrost_delete_shader_state;
          gallium->bind_fs_state = panfrost_bind_fs_state;
  
-        gallium->create_vs_state = panfrost_create_shader_state;
+        gallium->create_vs_state = panfrost_create_vs_state;
          gallium->delete_vs_state = panfrost_delete_shader_state;
          gallium->bind_vs_state = panfrost_bind_vs_state;
  
@@ -2715,7 +2568,6 @@ panfrost_create_context(struct pipe_screen *screen, void *priv, unsigned flags)
  
          panfrost_batch_init(ctx);
          panfrost_emit_vertex_payload(ctx);
-        panfrost_emit_tiler_payload(ctx);
          panfrost_invalidate_frame(ctx);
          panfrost_default_shader_backend(ctx);