#include "pan_screen.h"
#include "pan_blending.h"
#include "pan_blend_shaders.h"
+#include "pan_util.h"
#include "pan_wallpaper.h"
-#ifdef DUMP_PERFORMANCE_COUNTERS
static int performance_counter_number = 0;
-#endif
+extern const char *pan_counters_base;
/* Do not actually send anything to the GPU; merely generate the cmdstream as fast as possible. Disables framebuffer writes */
//#define DRY_RUN
-#define SET_BIT(lval, bit, cond) \
- if (cond) \
- lval |= (bit); \
- else \
- lval &= ~(bit);
-
-/* TODO: Sample size, etc */
-
-static void
-panfrost_set_framebuffer_msaa(struct panfrost_context *ctx, bool enabled)
-{
- SET_BIT(ctx->fragment_shader_core.unknown2_3, MALI_HAS_MSAA, enabled);
- SET_BIT(ctx->fragment_shader_core.unknown2_4, MALI_NO_MSAA, !enabled);
-
-#ifdef SFBD
- SET_BIT(ctx->fragment_fbd.format, MALI_FRAMEBUFFER_MSAA_A | MALI_FRAMEBUFFER_MSAA_B, enabled);
-#else
- SET_BIT(ctx->fragment_rts[0].format, MALI_MFBD_FORMAT_MSAA, enabled);
-
- SET_BIT(ctx->fragment_fbd.unk1, (1 << 4) | (1 << 1), enabled);
-
- /* XXX */
- ctx->fragment_fbd.rt_count_2 = enabled ? 4 : 1;
-#endif
-}
-
/* AFBC is enabled on a per-resource basis (AFBC enabling is theoretically
 * independent between color buffers and depth/stencil). To enable, we allocate
* the AFBC metadata buffer and mark that it is enabled. We do -not- actually
static void
panfrost_enable_afbc(struct panfrost_context *ctx, struct panfrost_resource *rsrc, bool ds)
{
-#ifdef MFBD
+ if (ctx->require_sfbd) {
+ DBG("AFBC not supported yet on SFBD\n");
+ assert(0);
+ }
+
struct pipe_context *gallium = (struct pipe_context *) ctx;
struct panfrost_screen *screen = pan_screen(gallium->screen);
/* AFBC metadata is 16 bytes per tile */
int tile_w = (rsrc->base.width0 + (MALI_TILE_LENGTH - 1)) >> MALI_TILE_SHIFT;
int tile_h = (rsrc->base.height0 + (MALI_TILE_LENGTH - 1)) >> MALI_TILE_SHIFT;
int bytes_per_pixel = util_format_get_blocksize(rsrc->base.format);
- int stride = bytes_per_pixel * rsrc->base.width0; /* TODO: Alignment? */
+ int stride = bytes_per_pixel * ALIGN(rsrc->base.width0, 16);
stride *= 2; /* TODO: Should this be carried over? */
int main_size = stride * rsrc->base.height0;
(rsrc->bo->afbc_metadata_size + main_size + 4095) / 4096,
true, 0, 0, 0);
- rsrc->bo->has_afbc = true;
+ rsrc->bo->layout = PAN_AFBC;
/* Compressed textured reads use a tagged pointer to the metadata */
- rsrc->bo->gpu[0] = rsrc->bo->afbc_slab.gpu | (ds ? 0 : 1);
- rsrc->bo->cpu[0] = rsrc->bo->afbc_slab.cpu;
-#else
- printf("AFBC not supported yet on SFBD\n");
- assert(0);
-#endif
+ rsrc->bo->gpu = rsrc->bo->afbc_slab.gpu | (ds ? 0 : 1);
+ rsrc->bo->cpu = rsrc->bo->afbc_slab.cpu;
}
static void
rsrc->bo->has_checksum = true;
}
-/* ..by contrast, this routine runs for every FRAGMENT job, but does no
- * allocation. AFBC is enabled on a per-surface basis */
-
-static void
-panfrost_set_fragment_afbc(struct panfrost_context *ctx)
-{
- for (int cb = 0; cb < ctx->pipe_framebuffer.nr_cbufs; ++cb) {
- struct panfrost_resource *rsrc = (struct panfrost_resource *) ctx->pipe_framebuffer.cbufs[cb]->texture;
-
- /* Non-AFBC is the default */
- if (!rsrc->bo->has_afbc)
- continue;
-
- /* Enable AFBC for the render target */
- ctx->fragment_rts[0].afbc.metadata = rsrc->bo->afbc_slab.gpu;
- ctx->fragment_rts[0].afbc.stride = 0;
- ctx->fragment_rts[0].afbc.unk = 0x30009;
-
- ctx->fragment_rts[0].format |= MALI_MFBD_FORMAT_AFBC;
-
- /* Point rendering to our special framebuffer */
- ctx->fragment_rts[0].framebuffer = rsrc->bo->afbc_slab.gpu + rsrc->bo->afbc_metadata_size;
-
- /* WAT? Stride is diff from the scanout case */
- ctx->fragment_rts[0].framebuffer_stride = ctx->pipe_framebuffer.width * 2 * 4;
- }
-
- /* Enable depth/stencil AFBC for the framebuffer (not the render target) */
- if (ctx->pipe_framebuffer.zsbuf) {
- struct panfrost_resource *rsrc = (struct panfrost_resource *) ctx->pipe_framebuffer.zsbuf->texture;
-
- if (rsrc->bo->has_afbc) {
- ctx->fragment_fbd.unk3 |= MALI_MFBD_EXTRA;
-
- ctx->fragment_extra.ds_afbc.depth_stencil_afbc_metadata = rsrc->bo->afbc_slab.gpu;
- ctx->fragment_extra.ds_afbc.depth_stencil_afbc_stride = 0;
-
- ctx->fragment_extra.ds_afbc.depth_stencil = rsrc->bo->afbc_slab.gpu + rsrc->bo->afbc_metadata_size;
-
- ctx->fragment_extra.ds_afbc.zero1 = 0x10009;
- ctx->fragment_extra.ds_afbc.padding = 0x1000;
-
- ctx->fragment_extra.unk = 0x435; /* General 0x400 in all unks. 0x5 for depth/stencil. 0x10 for AFBC encoded depth stencil. Unclear where the 0x20 is from */
-
- ctx->fragment_fbd.unk3 |= 0x400;
- }
- }
-
- /* For the special case of a depth-only FBO, we need to attach a dummy render target */
-
- if (ctx->pipe_framebuffer.nr_cbufs == 0) {
- ctx->fragment_rts[0].format = 0x80008000;
- ctx->fragment_rts[0].framebuffer = 0;
- ctx->fragment_rts[0].framebuffer_stride = 0;
- }
-}
-
/* Framebuffer descriptor */
-#ifdef SFBD
static void
panfrost_set_framebuffer_resolution(struct mali_single_framebuffer *fb, int w, int h)
{
fb->resolution_check = ((w + h) / 3) << 4;
}
-#endif
-static PANFROST_FRAMEBUFFER
-panfrost_emit_fbd(struct panfrost_context *ctx)
+struct mali_single_framebuffer
+panfrost_emit_sfbd(struct panfrost_context *ctx)
{
-#ifdef SFBD
struct mali_single_framebuffer framebuffer = {
.unknown2 = 0x1f,
.format = 0x30000000,
.clear_flags = 0x1000,
.unknown_address_0 = ctx->scratchpad.gpu,
- .unknown_address_1 = ctx->scratchpad.gpu + 0x6000,
- .unknown_address_2 = ctx->scratchpad.gpu + 0x6200,
+ .unknown_address_1 = ctx->misc_0.gpu,
+ .unknown_address_2 = ctx->misc_0.gpu + 40960,
.tiler_flags = 0xf0,
.tiler_heap_free = ctx->tiler_heap.gpu,
.tiler_heap_end = ctx->tiler_heap.gpu + ctx->tiler_heap.size,
};
panfrost_set_framebuffer_resolution(&framebuffer, ctx->pipe_framebuffer.width, ctx->pipe_framebuffer.height);
-#else
+
+ return framebuffer;
+}
+
+struct bifrost_framebuffer
+panfrost_emit_mfbd(struct panfrost_context *ctx)
+{
struct bifrost_framebuffer framebuffer = {
- .tiler_meta = 0xf00000c600,
+ /* It is not yet clear what tiler_meta means or how it's
+ * calculated, but we can tell the lower 32-bits are a
+ * (monotonically increasing?) function of tile count and
+ * geometry complexity; I suspect it defines a memory size of
+ * some kind? for the tiler. It's really unclear at the
+ * moment... but to add to the confusion, the hardware is happy
+ * enough to accept a zero in this field, so we don't even have
+ * to worry about it right now.
+ *
+ * The byte (just after the 32-bit mark) is much more
+ * interesting. The higher nibble I've only ever seen as 0xF,
+ * but the lower one I've seen as 0x0 or 0xF, and it's not
+ * obvious what the difference is. But what -is- obvious is
+ * that when the lower nibble is zero, performance is severely
+ * degraded compared to when the lower nibble is set.
+ * Evidently, that nibble enables some sort of fast path,
+ * perhaps relating to caching or tile flush? Regardless, at
+ * this point there's no clear reason not to set it, aside from
+ * substantially increased memory requirements (of the misc_0
+ * buffer) */
+
+ .tiler_meta = ((uint64_t) 0xff << 32) | 0x0,
.width1 = MALI_POSITIVE(ctx->pipe_framebuffer.width),
.height1 = MALI_POSITIVE(ctx->pipe_framebuffer.height),
.unknown2 = 0x1f,
- /* Presumably corresponds to unknown_address_X of SFBD */
+ /* Corresponds to unknown_address_X of SFBD */
.scratchpad = ctx->scratchpad.gpu,
.tiler_scratch_start = ctx->misc_0.gpu,
- .tiler_scratch_middle = ctx->misc_0.gpu + /*ctx->misc_0.size*/40960, /* Size depends on the size of the framebuffer and the number of vertices */
+
+ /* The constant added here is, like the lower word of
+ * tiler_meta, (loosely) another product of framebuffer size
+ * and geometry complexity. It must be sufficiently large for
+ * the tiler_meta fast path to work; if it's too small, there
+ * will be DATA_INVALID_FAULTs. Conversely, it must be less
+ * than the total size of misc_0, or else there's no room. It's
+ * possible this constant configures a partition between two
+ * parts of misc_0? We haven't investigated the functionality,
+ * as these buffers are internally used by the hardware
+	 * (presumably by the tiler) but seemingly not touched by the driver
+ */
+
+ .tiler_scratch_middle = ctx->misc_0.gpu + 0xf0000,
.tiler_heap_start = ctx->tiler_heap.gpu,
.tiler_heap_end = ctx->tiler_heap.gpu + ctx->tiler_heap.size,
};
-#endif
-
return framebuffer;
}
/* Are we currently rendering to the screen (rather than an FBO)? */
-static bool
+bool
panfrost_is_scanout(struct panfrost_context *ctx)
{
/* If there is no color buffer, it's an FBO */
ctx->pipe_framebuffer.cbufs[0]->texture->bind & PIPE_BIND_SHARED;
}
-/* The above function is for generalised fbd emission, used in both fragment as
- * well as vertex/tiler payloads. This payload is specific to fragment
- * payloads. */
-
-static void
-panfrost_new_frag_framebuffer(struct panfrost_context *ctx)
-{
- mali_ptr framebuffer;
- int stride;
-
- if (ctx->pipe_framebuffer.nr_cbufs > 0) {
- framebuffer = ((struct panfrost_resource *) ctx->pipe_framebuffer.cbufs[0]->texture)->bo->gpu[0];
- stride = util_format_get_stride(ctx->pipe_framebuffer.cbufs[0]->format, ctx->pipe_framebuffer.width);
- } else {
- /* Depth-only framebuffer -> dummy RT */
- framebuffer = 0;
- stride = 0;
- }
-
- /* The default is upside down from OpenGL's perspective. */
- if (panfrost_is_scanout(ctx)) {
- framebuffer += stride * (ctx->pipe_framebuffer.height - 1);
- stride = -stride;
- }
-
-#ifdef SFBD
- struct mali_single_framebuffer fb = panfrost_emit_fbd(ctx);
-
- fb.framebuffer = framebuffer;
- fb.stride = stride;
-
- fb.format = 0xb84e0281; /* RGB32, no MSAA */
-#else
- struct bifrost_framebuffer fb = panfrost_emit_fbd(ctx);
-
- /* XXX: MRT case */
- fb.rt_count_2 = 1;
- fb.unk3 = 0x100;
-
- struct bifrost_render_target rt = {
- .unk1 = 0x4000000,
- .format = 0x860a8899, /* RGBA32, no MSAA */
- .framebuffer = framebuffer,
- .framebuffer_stride = (stride / 16) & 0xfffffff,
- };
-
- memcpy(&ctx->fragment_rts[0], &rt, sizeof(rt));
-
- memset(&ctx->fragment_extra, 0, sizeof(ctx->fragment_extra));
-#endif
-
- memcpy(&ctx->fragment_fbd, &fb, sizeof(fb));
-}
-
/* Maps float 0.0-1.0 to int 0x00-0xFF */
static uint8_t
normalised_float_to_u8(float f)
double depth, unsigned stencil)
{
struct panfrost_context *ctx = pan_context(pipe);
+ struct panfrost_job *job = panfrost_get_job_for_fbo(ctx);
- if (!color) {
- printf("Warning: clear color null?\n");
- return;
- }
-
- /* Save settings for FBO switch */
- ctx->last_clear.buffers = buffers;
- ctx->last_clear.color = color;
- ctx->last_clear.depth = depth;
- ctx->last_clear.depth = depth;
-
- bool clear_color = buffers & PIPE_CLEAR_COLOR;
- bool clear_depth = buffers & PIPE_CLEAR_DEPTH;
- bool clear_stencil = buffers & PIPE_CLEAR_STENCIL;
-
- /* Remember that we've done something */
- ctx->frame_cleared = true;
-
- /* Alpha clear only meaningful without alpha channel */
- bool has_alpha = ctx->pipe_framebuffer.nr_cbufs && util_format_has_alpha(ctx->pipe_framebuffer.cbufs[0]->format);
- float clear_alpha = has_alpha ? color->f[3] : 1.0f;
-
- uint32_t packed_color =
- (normalised_float_to_u8(clear_alpha) << 24) |
- (normalised_float_to_u8(color->f[2]) << 16) |
- (normalised_float_to_u8(color->f[1]) << 8) |
- (normalised_float_to_u8(color->f[0]) << 0);
-
-#ifdef MFBD
- struct bifrost_render_target *buffer_color = &ctx->fragment_rts[0];
-#else
- struct mali_single_framebuffer *buffer_color = &ctx->fragment_fbd;
-#endif
-
-#ifdef MFBD
- struct bifrost_framebuffer *buffer_ds = &ctx->fragment_fbd;
-#else
- struct mali_single_framebuffer *buffer_ds = buffer_color;
-#endif
+ if (buffers & PIPE_CLEAR_COLOR) {
+ /* Alpha clear only meaningful without alpha channel, TODO less ad hoc */
+ bool has_alpha = util_format_has_alpha(ctx->pipe_framebuffer.cbufs[0]->format);
+ float clear_alpha = has_alpha ? color->f[3] : 1.0f;
- if (clear_color) {
- /* Fields duplicated 4x for unknown reasons. Same in Utgard,
- * too, which is doubly weird. */
+ uint32_t packed_color =
+ (normalised_float_to_u8(clear_alpha) << 24) |
+ (normalised_float_to_u8(color->f[2]) << 16) |
+ (normalised_float_to_u8(color->f[1]) << 8) |
+ (normalised_float_to_u8(color->f[0]) << 0);
- buffer_color->clear_color_1 = packed_color;
- buffer_color->clear_color_2 = packed_color;
- buffer_color->clear_color_3 = packed_color;
- buffer_color->clear_color_4 = packed_color;
- }
+ job->clear_color = packed_color;
- if (clear_depth) {
-#ifdef SFBD
- buffer_ds->clear_depth_1 = depth;
- buffer_ds->clear_depth_2 = depth;
- buffer_ds->clear_depth_3 = depth;
- buffer_ds->clear_depth_4 = depth;
-#else
- buffer_ds->clear_depth = depth;
-#endif
}
- if (clear_stencil) {
- buffer_ds->clear_stencil = stencil;
+ if (buffers & PIPE_CLEAR_DEPTH) {
+ job->clear_depth = depth;
}
- /* Setup buffers depending on MFBD/SFBD */
-
-#ifdef MFBD
-
- if (clear_depth || clear_stencil) {
- /* Setup combined 24/8 depth/stencil */
- ctx->fragment_fbd.unk3 |= MALI_MFBD_EXTRA;
- //ctx->fragment_extra.unk = /*0x405*/0x404;
- ctx->fragment_extra.unk = 0x405;
- ctx->fragment_extra.ds_linear.depth = ctx->depth_stencil_buffer.gpu;
- ctx->fragment_extra.ds_linear.depth_stride = ctx->pipe_framebuffer.width * 4;
+ if (buffers & PIPE_CLEAR_STENCIL) {
+ job->clear_stencil = stencil;
}
-#else
-
- if (clear_depth) {
- buffer_ds->depth_buffer = ctx->depth_stencil_buffer.gpu;
- buffer_ds->depth_buffer_enable = MALI_DEPTH_STENCIL_ENABLE;
- }
-
- if (clear_stencil) {
- buffer_ds->stencil_buffer = ctx->depth_stencil_buffer.gpu;
- buffer_ds->stencil_buffer_enable = MALI_DEPTH_STENCIL_ENABLE;
- }
-
-#endif
-
-#ifdef SFBD
- /* Set flags based on what has been cleared, for the SFBD case */
- /* XXX: What do these flags mean? */
- int clear_flags = 0x101100;
-
- if (clear_color && clear_depth && clear_stencil) {
- /* On a tiler like this, it's fastest to clear all three buffers at once */
-
- clear_flags |= MALI_CLEAR_FAST;
- } else {
- clear_flags |= MALI_CLEAR_SLOW;
-
- if (clear_stencil)
- clear_flags |= MALI_CLEAR_SLOW_STENCIL;
- }
-
- fbd->clear_flags = clear_flags;
-#endif
+ job->clear |= buffers;
}
-static void
-panfrost_attach_vt_framebuffer(struct panfrost_context *ctx)
+static mali_ptr
+panfrost_attach_vt_mfbd(struct panfrost_context *ctx)
{
-#ifdef MFBD
- /* MFBD needs a sequential semi-render target upload, but this is, is beyond me for now */
+ /* MFBD needs a sequential semi-render target upload, but what exactly this is, is beyond me for now */
struct bifrost_render_target rts_list[] = {
{
.chunknown = {
};
/* Allocate memory for the three components */
- int size = 1024 + sizeof(ctx->vt_framebuffer) + sizeof(rts_list);
+ int size = 1024 + sizeof(ctx->vt_framebuffer_mfbd) + sizeof(rts_list);
struct panfrost_transfer transfer = panfrost_allocate_transient(ctx, size);
/* Opaque 1024-block */
rts_list[0].chunknown.pointer = transfer.gpu;
- mali_ptr framebuffer = (transfer.gpu + 1024) | PANFROST_DEFAULT_FBD;
- memcpy(transfer.cpu + 1024, &ctx->vt_framebuffer, sizeof(ctx->vt_framebuffer));
- memcpy(transfer.cpu + 1024 + sizeof(ctx->vt_framebuffer), rts_list, sizeof(rts_list));
-#else
- mali_ptr framebuffer = panfrost_upload_transient(ctx, &ctx->vt_framebuffer, sizeof(ctx->vt_framebuffer)) | PANFROST_DEFAULT_FBD;
-#endif
- ctx->payload_vertex.postfix.framebuffer = framebuffer;
- ctx->payload_tiler.postfix.framebuffer = framebuffer;
+ memcpy(transfer.cpu + 1024, &ctx->vt_framebuffer_mfbd, sizeof(ctx->vt_framebuffer_mfbd));
+ memcpy(transfer.cpu + 1024 + sizeof(ctx->vt_framebuffer_mfbd), rts_list, sizeof(rts_list));
+
+ return (transfer.gpu + 1024) | MALI_MFBD;
}
-static void
-panfrost_viewport(struct panfrost_context *ctx,
- float depth_range_n,
- float depth_range_f,
- int viewport_x0, int viewport_y0,
- int viewport_x1, int viewport_y1)
+static mali_ptr
+panfrost_attach_vt_sfbd(struct panfrost_context *ctx)
{
- /* Viewport encoding is asymmetric. Purpose of the floats is unknown? */
-
- struct mali_viewport ret = {
- .floats = {
-#if 0
- -inff, -inff,
- inff, inff,
-#endif
- 0.0, 0.0,
- 2048.0, 1600.0,
- },
-
- .depth_range_n = depth_range_n,
- .depth_range_f = depth_range_f,
+ return panfrost_upload_transient(ctx, &ctx->vt_framebuffer_sfbd, sizeof(ctx->vt_framebuffer_sfbd)) | MALI_SFBD;
+}
- .viewport0 = { viewport_x0, viewport_y0 },
- .viewport1 = { MALI_POSITIVE(viewport_x1), MALI_POSITIVE(viewport_y1) },
- };
+static void
+panfrost_attach_vt_framebuffer(struct panfrost_context *ctx)
+{
+ mali_ptr framebuffer = ctx->require_sfbd ?
+ panfrost_attach_vt_sfbd(ctx) :
+ panfrost_attach_vt_mfbd(ctx);
- memcpy(ctx->viewport, &ret, sizeof(ret));
+ ctx->payload_vertex.postfix.framebuffer = framebuffer;
+ ctx->payload_tiler.postfix.framebuffer = framebuffer;
}
/* Reset per-frame context, called on context initialisation as well as after
panfrost_invalidate_frame(struct panfrost_context *ctx)
{
unsigned transient_count = ctx->transient_pools[ctx->cmdstream_i].entry_index*ctx->transient_pools[0].entry_size + ctx->transient_pools[ctx->cmdstream_i].entry_offset;
- printf("Uploaded transient %d bytes\n", transient_count);
+ DBG("Uploaded transient %d bytes\n", transient_count);
/* Rotate cmdstream */
if ((++ctx->cmdstream_i) == (sizeof(ctx->transient_pools) / sizeof(ctx->transient_pools[0])))
ctx->cmdstream_i = 0;
- ctx->vt_framebuffer = panfrost_emit_fbd(ctx);
- panfrost_new_frag_framebuffer(ctx);
+ if (ctx->require_sfbd)
+ ctx->vt_framebuffer_sfbd = panfrost_emit_sfbd(ctx);
+ else
+ ctx->vt_framebuffer_mfbd = panfrost_emit_mfbd(ctx);
/* Reset varyings allocated */
ctx->varying_height = 0;
.workgroups_x_shift_2 = 0x2,
.workgroups_x_shift_3 = 0x5,
},
- .gl_enables = 0x6
+ .gl_enables = 0x4 | (ctx->is_t6xx ? 0 : 0x2),
};
memcpy(&ctx->payload_vertex, &payload, sizeof(payload));
},
};
- /* Reserve the viewport */
- struct panfrost_transfer t = panfrost_allocate_chunk(ctx, sizeof(struct mali_viewport), HEAP_DESCRIPTOR);
- ctx->viewport = (struct mali_viewport *) t.cpu;
- payload.postfix.viewport = t.gpu;
-
memcpy(&ctx->payload_tiler, &payload, sizeof(payload));
}
struct mali_shader_meta shader = {
.alpha_coverage = ~MALI_ALPHA_COVERAGE(0.000000),
- .unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x3010 /*| MALI_CAN_DISCARD*/,
-#ifdef T8XX
+ .unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x3010,
.unknown2_4 = MALI_NO_MSAA | 0x4e0,
-#else
- .unknown2_4 = MALI_NO_MSAA | 0x4f0,
-#endif
};
+ if (ctx->is_t6xx) {
+ shader.unknown2_4 |= 0x10;
+ }
+
struct pipe_stencil_state default_stencil = {
.enabled = 0,
.func = PIPE_FUNC_ALWAYS,
#endif
};
- /* XXX: What is this? */
-#ifdef T6XX
-
- if (is_tiler)
- job.unknown_flags = ctx->draw_count ? 64 : 1;
-
-#endif
-
/* Only non-elided tiler jobs have dependencies which are known at this point */
if (is_tiler && !is_elided_tiler) {
ctx->set_value_job = transfer.gpu;
}
-/* Generate a fragment job. This should be called once per frame. (According to
- * presentations, this is supposed to correspond to eglSwapBuffers) */
+static mali_ptr
+panfrost_emit_varyings(
+ struct panfrost_context *ctx,
+ union mali_attr *slot,
+ unsigned stride,
+ unsigned count)
+{
+ mali_ptr varying_address = ctx->varying_mem.gpu + ctx->varying_height;
+
+ /* Fill out the descriptor */
+ slot->elements = varying_address | MALI_ATTR_LINEAR;
+ slot->stride = stride;
+ slot->size = stride * count;
+
+ ctx->varying_height += ALIGN(slot->size, 64);
+ assert(ctx->varying_height < ctx->varying_mem.size);
+
+ return varying_address;
+}
-mali_ptr
-panfrost_fragment_job(struct panfrost_context *ctx)
+static void
+panfrost_emit_point_coord(union mali_attr *slot)
{
- /* Update fragment FBD */
- panfrost_set_fragment_afbc(ctx);
+ slot->elements = MALI_VARYING_POINT_COORD | MALI_ATTR_LINEAR;
+ slot->stride = slot->size = 0;
+}
- if (ctx->pipe_framebuffer.nr_cbufs == 1) {
- struct panfrost_resource *rsrc = (struct panfrost_resource *) ctx->pipe_framebuffer.cbufs[0]->texture;
- int stride = util_format_get_stride(rsrc->base.format, rsrc->base.width0);
+static void
+panfrost_emit_varying_descriptor(
+ struct panfrost_context *ctx,
+ unsigned invocation_count)
+{
+ /* Load the shaders */
- if (rsrc->bo->has_checksum) {
- //ctx->fragment_fbd.unk3 |= 0xa00000;
- //ctx->fragment_fbd.unk3 = 0xa02100;
- ctx->fragment_fbd.unk3 |= MALI_MFBD_EXTRA;
- ctx->fragment_extra.unk |= 0x420;
- ctx->fragment_extra.checksum_stride = rsrc->bo->checksum_stride;
- ctx->fragment_extra.checksum = rsrc->bo->gpu[0] + stride * rsrc->base.height0;
- }
- }
+ struct panfrost_shader_state *vs = &ctx->vs->variants[ctx->vs->active_variant];
+ struct panfrost_shader_state *fs = &ctx->fs->variants[ctx->fs->active_variant];
- /* The frame is complete and therefore the framebuffer descriptor is
- * ready for linkage and upload */
+ /* Allocate the varying descriptor */
- size_t sz = sizeof(ctx->fragment_fbd) + sizeof(struct bifrost_fb_extra) + sizeof(struct bifrost_render_target) * 1;
- struct panfrost_transfer fbd_t = panfrost_allocate_transient(ctx, sz);
- off_t offset = 0;
+ size_t vs_size = sizeof(struct mali_attr_meta) * vs->tripipe->varying_count;
+ size_t fs_size = sizeof(struct mali_attr_meta) * fs->tripipe->varying_count;
- memcpy(fbd_t.cpu, &ctx->fragment_fbd, sizeof(ctx->fragment_fbd));
- offset += sizeof(ctx->fragment_fbd);
+ struct panfrost_transfer trans = panfrost_allocate_transient(ctx,
+ vs_size + fs_size);
- /* Upload extra framebuffer info if necessary */
- if (ctx->fragment_fbd.unk3 & MALI_MFBD_EXTRA) {
- memcpy(fbd_t.cpu + offset, &ctx->fragment_extra, sizeof(struct bifrost_fb_extra));
- offset += sizeof(struct bifrost_fb_extra);
- }
+ memcpy(trans.cpu, vs->varyings, vs_size);
+ memcpy(trans.cpu + vs_size, fs->varyings, fs_size);
- /* Upload (single) render target */
- memcpy(fbd_t.cpu + offset, &ctx->fragment_rts[0], sizeof(struct bifrost_render_target) * 1);
+ ctx->payload_vertex.postfix.varying_meta = trans.gpu;
+ ctx->payload_tiler.postfix.varying_meta = trans.gpu + vs_size;
- /* Generate the fragment (frame) job */
+ /* Buffer indices must be in this order per our convention */
+ union mali_attr varyings[PIPE_MAX_ATTRIBS];
+ unsigned idx = 0;
- struct mali_job_descriptor_header header = {
- .job_type = JOB_TYPE_FRAGMENT,
- .job_index = 1,
-#ifdef __LP64__
- .job_descriptor_size = 1
-#endif
- };
+ /* General varyings -- use the VS's, since those are more likely to be
+ * accurate on desktop */
+
+ panfrost_emit_varyings(ctx, &varyings[idx++],
+ vs->general_varying_stride, invocation_count);
+
+ /* fp32 vec4 gl_Position */
+ ctx->payload_tiler.postfix.position_varying =
+ panfrost_emit_varyings(ctx, &varyings[idx++],
+ sizeof(float) * 4, invocation_count);
- struct mali_payload_fragment payload = {
- .min_tile_coord = MALI_COORDINATE_TO_TILE_MIN(0, 0),
- .max_tile_coord = MALI_COORDINATE_TO_TILE_MAX(ctx->pipe_framebuffer.width, ctx->pipe_framebuffer.height),
- .framebuffer = fbd_t.gpu | PANFROST_DEFAULT_FBD | (ctx->fragment_fbd.unk3 & MALI_MFBD_EXTRA ? 2 : 0),
- };
- /* Normally, there should be no padding. However, fragment jobs are
- * shared with 64-bit Bifrost systems, and accordingly there is 4-bytes
- * of zero padding in between. */
+ if (vs->writes_point_size || fs->reads_point_coord) {
+ /* fp16 vec1 gl_PointSize */
+ ctx->payload_tiler.primitive_size.pointer =
+ panfrost_emit_varyings(ctx, &varyings[idx++],
+ 2, invocation_count);
+ }
+
+ if (fs->reads_point_coord) {
+ /* Special descriptor */
+ panfrost_emit_point_coord(&varyings[idx++]);
+ }
- struct panfrost_transfer transfer = panfrost_allocate_transient(ctx, sizeof(header) + sizeof(payload));
- memcpy(transfer.cpu, &header, sizeof(header));
- memcpy(transfer.cpu + sizeof(header), &payload, sizeof(payload));
- return transfer.gpu;
+ mali_ptr varyings_p = panfrost_upload_transient(ctx, &varyings, idx * sizeof(union mali_attr));
+ ctx->payload_vertex.postfix.varyings = varyings_p;
+ ctx->payload_tiler.postfix.varyings = varyings_p;
}
/* Emits attributes and varying descriptors, which should be called every draw,
{
/* TODO: Only update the dirtied buffers */
union mali_attr attrs[PIPE_MAX_ATTRIBS];
- union mali_attr varyings[PIPE_MAX_ATTRIBS];
unsigned invocation_count = MALI_NEGATIVE(ctx->payload_tiler.prefix.invocation_count);
* rsrc->gpu. However, attribute buffers must be 64 aligned. If
* it is not, for now we have to duplicate the buffer. */
- mali_ptr effective_address = (rsrc->bo->gpu[0] + buf->buffer_offset);
+ mali_ptr effective_address = (rsrc->bo->gpu + buf->buffer_offset);
if (effective_address & 0x3F) {
- attrs[i].elements = panfrost_upload_transient(ctx, rsrc->bo->cpu[0] + buf->buffer_offset, attrs[i].size) | 1;
+ attrs[i].elements = panfrost_upload_transient(ctx, rsrc->bo->cpu + buf->buffer_offset, attrs[i].size) | 1;
} else {
attrs[i].elements = effective_address | 1;
}
}
- struct panfrost_varyings *vars = &ctx->vs->variants[ctx->vs->active_variant].varyings;
-
- for (int i = 0; i < vars->varying_buffer_count; ++i) {
- mali_ptr varying_address = ctx->varying_mem.gpu + ctx->varying_height;
-
- varyings[i].elements = varying_address | 1;
- varyings[i].stride = vars->varyings_stride[i];
- varyings[i].size = vars->varyings_stride[i] * invocation_count;
-
- /* If this varying has to be linked somewhere, do it now. See
- * pan_assemble.c for the indices. TODO: Use a more generic
- * linking interface */
-
- if (i == 1) {
- /* gl_Position */
- ctx->payload_tiler.postfix.position_varying = varying_address;
- } else if (i == 2) {
- /* gl_PointSize */
- ctx->payload_tiler.primitive_size.pointer = varying_address;
- }
-
- /* Varyings appear to need 64-byte alignment */
- ctx->varying_height += ALIGN(varyings[i].size, 64);
-
- /* Ensure that we fit */
- assert(ctx->varying_height < ctx->varying_mem.size);
- }
-
ctx->payload_vertex.postfix.attributes = panfrost_upload_transient(ctx, attrs, ctx->vertex_buffer_count * sizeof(union mali_attr));
- mali_ptr varyings_p = panfrost_upload_transient(ctx, &varyings, vars->varying_buffer_count * sizeof(union mali_attr));
- ctx->payload_vertex.postfix.varyings = varyings_p;
- ctx->payload_tiler.postfix.varyings = varyings_p;
+ panfrost_emit_varying_descriptor(ctx, invocation_count);
}
/* Go through dirty flags and actualise them in the cmdstream. */
void
panfrost_emit_for_draw(struct panfrost_context *ctx, bool with_vertex_data)
{
+ struct panfrost_job *job = panfrost_get_job_for_fbo(ctx);
+
if (with_vertex_data) {
panfrost_emit_vertex_data(ctx);
}
+ bool msaa = ctx->rasterizer->base.multisample;
+
if (ctx->dirty & PAN_DIRTY_RASTERIZER) {
ctx->payload_tiler.gl_enables = ctx->rasterizer->tiler_gl_enables;
- panfrost_set_framebuffer_msaa(ctx, ctx->rasterizer->base.multisample);
+
+ /* TODO: Sample size */
+ SET_BIT(ctx->fragment_shader_core.unknown2_3, MALI_HAS_MSAA, msaa);
+ SET_BIT(ctx->fragment_shader_core.unknown2_4, MALI_NO_MSAA, !msaa);
}
+ /* Enable job requirements at draw-time */
+
+ if (msaa)
+ job->requirements |= PAN_REQ_MSAA;
+
+ if (ctx->depth_stencil->depth.writemask)
+ job->requirements |= PAN_REQ_DEPTH_WRITE;
+
if (ctx->occlusion_query) {
- ctx->payload_tiler.gl_enables |= MALI_OCCLUSION_BOOLEAN;
+ ctx->payload_tiler.gl_enables |= MALI_OCCLUSION_QUERY | MALI_OCCLUSION_PRECISE;
ctx->payload_tiler.postfix.occlusion_counter = ctx->occlusion_query->transfer.gpu;
}
struct panfrost_shader_state *vs = &ctx->vs->variants[ctx->vs->active_variant];
/* Late shader descriptor assignments */
+
vs->tripipe->texture_count = ctx->sampler_view_count[PIPE_SHADER_VERTEX];
vs->tripipe->sampler_count = ctx->sampler_count[PIPE_SHADER_VERTEX];
vs->tripipe->midgard1.unknown1 = 0x2201;
ctx->payload_vertex.postfix._shader_upper = vs->tripipe_gpu >> 4;
-
- /* Varying descriptor is tied to the vertex shader. Also the
- * fragment shader, I suppose, but it's generated with the
- * vertex shader so */
-
- struct panfrost_varyings *varyings = &ctx->vs->variants[ctx->vs->active_variant].varyings;
-
- ctx->payload_vertex.postfix.varying_meta = varyings->varyings_descriptor;
- ctx->payload_tiler.postfix.varying_meta = varyings->varyings_descriptor_fragment;
}
if (ctx->dirty & (PAN_DIRTY_RASTERIZER | PAN_DIRTY_VS)) {
ctx->fragment_shader_core.stencil_back.ref = ctx->stencil_ref.ref_value[1];
/* CAN_DISCARD should be set if the fragment shader possibly
- * contains a 'discard' instruction, or maybe other
- * circumstances. It is likely this is related to optimizations
- * related to forward-pixel kill, as per "Mali Performance 3:
- * Is EGL_BUFFER_PRESERVED a good thing?" by Peter Harris
+ * contains a 'discard' instruction. It is likely this is
+ * related to optimizations related to forward-pixel kill, as
+ * per "Mali Performance 3: Is EGL_BUFFER_PRESERVED a good
+ * thing?" by Peter Harris
*/
if (variant->can_discard) {
ctx->fragment_shader_core.midgard1.unknown1 = 0x4200;
}
- if (ctx->blend->has_blend_shader)
- ctx->fragment_shader_core.blend_shader = ctx->blend->blend_shader;
+ /* Check if we're using the default blend descriptor (fast path) */
+
+ bool no_blending =
+ !ctx->blend->has_blend_shader &&
+ (ctx->blend->equation.rgb_mode == 0x122) &&
+ (ctx->blend->equation.alpha_mode == 0x122) &&
+ (ctx->blend->equation.color_mask == 0xf);
+
+ if (ctx->require_sfbd) {
+ /* When only a single render target platform is used, the blend
+ * information is inside the shader meta itself. We
+ * additionally need to signal CAN_DISCARD for nontrivial blend
+ * modes (so we're able to read back the destination buffer) */
+
+ if (ctx->blend->has_blend_shader) {
+ ctx->fragment_shader_core.blend_shader = ctx->blend->blend_shader;
+ } else {
+ memcpy(&ctx->fragment_shader_core.blend_equation, &ctx->blend->equation, sizeof(ctx->blend->equation));
+ }
+
+ if (!no_blending) {
+ ctx->fragment_shader_core.unknown2_3 |= MALI_CAN_DISCARD;
+ }
+ }
size_t size = sizeof(struct mali_shader_meta) + sizeof(struct mali_blend_meta);
struct panfrost_transfer transfer = panfrost_allocate_transient(ctx, size);
ctx->payload_tiler.postfix._shader_upper = (transfer.gpu) >> 4;
-#ifdef T8XX
- /* Additional blend descriptor tacked on for newer systems */
+ if (!ctx->require_sfbd) {
+ /* Additional blend descriptor tacked on for jobs using MFBD */
- unsigned blend_count = 0;
+ unsigned blend_count = 0;
- if (ctx->blend->has_blend_shader) {
- /* For a blend shader, the bottom nibble corresponds to
- * the number of work registers used, which signals the
- * -existence- of a blend shader */
+ if (ctx->blend->has_blend_shader) {
+ /* For a blend shader, the bottom nibble corresponds to
+ * the number of work registers used, which signals the
+ * -existence- of a blend shader */
- assert(ctx->blend->blend_work_count >= 2);
- blend_count |= MIN2(ctx->blend->blend_work_count, 3);
- } else {
- /* Otherwise, the bottom bit simply specifies if
- * blending (anything other than REPLACE) is enabled */
+ assert(ctx->blend->blend_work_count >= 2);
+ blend_count |= MIN2(ctx->blend->blend_work_count, 3);
+ } else {
+ /* Otherwise, the bottom bit simply specifies if
+ * blending (anything other than REPLACE) is enabled */
- /* XXX: Less ugly way to do this? */
- bool no_blending =
- (ctx->blend->equation.rgb_mode == 0x122) &&
- (ctx->blend->equation.alpha_mode == 0x122) &&
- (ctx->blend->equation.color_mask == 0xf);
- if (!no_blending)
- blend_count |= 0x1;
- }
+ if (!no_blending)
+ blend_count |= 0x1;
+ }
- /* Second blend equation is always a simple replace */
+ /* Second blend equation is always a simple replace */
- uint64_t replace_magic = 0xf0122122;
- struct mali_blend_equation replace_mode;
- memcpy(&replace_mode, &replace_magic, sizeof(replace_mode));
+ uint64_t replace_magic = 0xf0122122;
+ struct mali_blend_equation replace_mode;
+ memcpy(&replace_mode, &replace_magic, sizeof(replace_mode));
- struct mali_blend_meta blend_meta[] = {
- {
- .unk1 = 0x200 | blend_count,
- .blend_equation_1 = ctx->blend->equation,
- .blend_equation_2 = replace_mode
- },
- };
+ struct mali_blend_meta blend_meta[] = {
+ {
+ .unk1 = 0x200 | blend_count,
+ .blend_equation_1 = ctx->blend->equation,
+ .blend_equation_2 = replace_mode
+ },
+ };
- if (ctx->blend->has_blend_shader)
- memcpy(&blend_meta[0].blend_equation_1, &ctx->blend->blend_shader, sizeof(ctx->blend->blend_shader));
+ if (ctx->blend->has_blend_shader)
+ memcpy(&blend_meta[0].blend_equation_1, &ctx->blend->blend_shader, sizeof(ctx->blend->blend_shader));
- memcpy(transfer.cpu + sizeof(struct mali_shader_meta), blend_meta, sizeof(blend_meta));
-#endif
+ memcpy(transfer.cpu + sizeof(struct mali_shader_meta), blend_meta, sizeof(blend_meta));
+ }
}
if (ctx->dirty & PAN_DIRTY_VERTEX) {
struct panfrost_resource *rsrc = (struct panfrost_resource *) tex_rsrc;
/* Inject the address in. */
- for (int l = 0; l < (tex_rsrc->last_level + 1); ++l)
- ctx->sampler_views[t][i]->hw.swizzled_bitmaps[l] = rsrc->bo->gpu[l];
-
- /* Workaround maybe-errata (?) with non-mipmaps */
- int s = ctx->sampler_views[t][i]->hw.nr_mipmap_levels;
-
- if (!rsrc->bo->is_mipmap) {
-#ifdef T6XX
- /* HW ERRATA, not needed after T6XX */
- ctx->sampler_views[t][i]->hw.swizzled_bitmaps[1] = rsrc->bo->gpu[0];
-
- ctx->sampler_views[t][i]->hw.unknown3A = 1;
-#endif
- ctx->sampler_views[t][i]->hw.nr_mipmap_levels = 0;
+ for (int l = 0; l <= tex_rsrc->last_level; ++l) {
+ ctx->sampler_views[t][i]->hw.swizzled_bitmaps[l] =
+ rsrc->bo->gpu + rsrc->bo->slices[l].offset;
}
trampolines[i] = panfrost_upload_transient(ctx, &ctx->sampler_views[t][i]->hw, sizeof(struct mali_texture_descriptor));
-
- /* Restore */
- ctx->sampler_views[t][i]->hw.nr_mipmap_levels = s;
-
-#ifdef T6XX
- ctx->sampler_views[t][i]->hw.unknown3A = 0;
-#endif
}
mali_ptr trampoline = panfrost_upload_transient(ctx, trampolines, sizeof(uint64_t) * ctx->sampler_view_count[t]);
/* Generate the viewport vector of the form: <width/2, height/2, centerx, centery> */
const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
+ /* For flipped-Y buffers (signaled by negative scale), the translate is
+ * flipped as well */
+
+ bool invert_y = vp->scale[1] < 0.0;
+ float translate_y = vp->translate[1];
+
+ if (invert_y)
+ translate_y = ctx->pipe_framebuffer.height - translate_y;
+
float viewport_vec4[] = {
vp->scale[0],
fabsf(vp->scale[1]),
vp->translate[0],
- /* -1.0 * vp->translate[1] */ fabs(1.0 * vp->scale[1]) /* XXX */
+ translate_y
};
for (int i = 0; i < PIPE_SHADER_TYPES; ++i) {
break;
default:
- printf("Unknown shader stage %d in uniform upload\n", i);
+ DBG("Unknown shader stage %d in uniform upload\n", i);
assert(0);
}
}
}
+ /* TODO: Upload the viewport somewhere more appropriate */
+
+ /* Clip bounds are encoded as floats. The viewport itself is encoded as
+ * (somewhat) asymmetric ints. */
+ const struct pipe_scissor_state *ss = &ctx->scissor;
+
+ struct mali_viewport view = {
+ /* By default, do no viewport clipping, i.e. clip to (-inf,
+ * inf) in each direction. Clipping to the viewport in theory
+ * should work, but in practice causes issues when we're not
+ * explicitly trying to scissor */
+
+ .clip_minx = -inff,
+ .clip_miny = -inff,
+ .clip_maxx = inff,
+ .clip_maxy = inff,
+
+ .clip_minz = 0.0,
+ .clip_maxz = 1.0,
+ };
+
+ /* Always scissor to the viewport by default. */
+ view.viewport0[0] = (int) (vp->translate[0] - vp->scale[0]);
+ view.viewport1[0] = MALI_POSITIVE((int) (vp->translate[0] + vp->scale[0]));
+
+ view.viewport0[1] = (int) (translate_y - fabs(vp->scale[1]));
+ view.viewport1[1] = MALI_POSITIVE((int) (translate_y + fabs(vp->scale[1])));
+
+ if (ss && ctx->rasterizer && ctx->rasterizer->base.scissor) {
+ /* Invert scissor if needed */
+ unsigned miny = invert_y ?
+ ctx->pipe_framebuffer.height - ss->maxy : ss->miny;
+
+ unsigned maxy = invert_y ?
+ ctx->pipe_framebuffer.height - ss->miny : ss->maxy;
+
+ /* Set the actual scissor */
+ view.viewport0[0] = ss->minx;
+ view.viewport0[1] = miny;
+ view.viewport1[0] = MALI_POSITIVE(ss->maxx);
+ view.viewport1[1] = MALI_POSITIVE(maxy);
+ }
+
+ ctx->payload_tiler.postfix.viewport =
+ panfrost_upload_transient(ctx,
+ &view,
+ sizeof(struct mali_viewport));
+
ctx->dirty = 0;
}
{
/* TODO: Expand the array? */
if (ctx->draw_count >= MAX_DRAW_CALLS) {
- printf("Job buffer overflow, ignoring draw\n");
+ DBG("Job buffer overflow, ignoring draw\n");
assert(0);
}
for (int i = 0; i < ctx->vertex_job_count; ++i) {
bool isLast = (i + 1) == ctx->vertex_job_count;
- panfrost_link_job_pair(ctx->u_vertex_jobs[i], isLast ? ctx->tiler_jobs[0]: ctx->vertex_jobs[i + 1]);
+ panfrost_link_job_pair(ctx->u_vertex_jobs[i], isLast ? ctx->tiler_jobs[0] : ctx->vertex_jobs[i + 1]);
}
/* T -> T/null */
/* The entire frame is in memory -- send it off to the kernel! */
static void
-panfrost_submit_frame(struct panfrost_context *ctx, bool flush_immediate)
+panfrost_submit_frame(struct panfrost_context *ctx, bool flush_immediate,
+ struct pipe_fence_handle **fence)
{
struct pipe_context *gallium = (struct pipe_context *) ctx;
struct panfrost_screen *screen = pan_screen(gallium->screen);
/* If visual, we can stall a frame */
- if (panfrost_is_scanout(ctx) && !flush_immediate)
- screen->driver->force_flush_fragment(ctx);
+ if (!flush_immediate)
+ screen->driver->force_flush_fragment(ctx, fence);
screen->last_fragment_id = fragment_id;
screen->last_fragment_flushed = false;
/* If readback, flush now (hurts the pipelined performance) */
- if (panfrost_is_scanout(ctx) && flush_immediate)
- screen->driver->force_flush_fragment(ctx);
-
-#ifdef DUMP_PERFORMANCE_COUNTERS
- char filename[128];
- snprintf(filename, sizeof(filename), "/dev/shm/frame%d.mdgprf", ++performance_counter_number);
- FILE *fp = fopen(filename, "wb");
- fwrite(screen->perf_counters.cpu, 4096, sizeof(uint32_t), fp);
- fclose(fp);
-#endif
+ if (flush_immediate)
+ screen->driver->force_flush_fragment(ctx, fence);
+
+ if (screen->driver->dump_counters && pan_counters_base) {
+ screen->driver->dump_counters(screen);
+
+ char filename[128];
+ snprintf(filename, sizeof(filename), "%s/frame%d.mdgprf", pan_counters_base, ++performance_counter_number);
+ FILE *fp = fopen(filename, "wb");
+ fwrite(screen->perf_counters.cpu, 4096, sizeof(uint32_t), fp);
+ fclose(fp);
+ }
#endif
}
-bool dont_scanout = false;
-
void
panfrost_flush(
struct pipe_context *pipe,
unsigned flags)
{
struct panfrost_context *ctx = pan_context(pipe);
+ struct panfrost_job *job = panfrost_get_job_for_fbo(ctx);
- /* If there is nothing drawn, skip the frame */
- if (!ctx->draw_count && !ctx->frame_cleared) return;
-
- if (!ctx->frame_cleared) {
- /* While there are draws, there was no clear. This is a partial
- * update, which needs to be handled via the "wallpaper"
- * method. We also need to fake a clear, just to get the
- * FRAGMENT job correct. */
-
- panfrost_clear(&ctx->base, ctx->last_clear.buffers, ctx->last_clear.color, ctx->last_clear.depth, ctx->last_clear.stencil);
-
- panfrost_draw_wallpaper(pipe);
- }
-
- /* Frame clear handled, reset */
- ctx->frame_cleared = false;
+ /* Nothing to do! */
+ if (!ctx->draw_count && !job->clear) return;
/* Whether to stall the pipeline for immediately correct results */
bool flush_immediate = flags & PIPE_FLUSH_END_OF_FRAME;
/* Submit the frame itself */
- panfrost_submit_frame(ctx, flush_immediate);
+ panfrost_submit_frame(ctx, flush_immediate, fence);
/* Prepare for the next frame */
panfrost_invalidate_frame(ctx);
DEFINE_CASE(POLYGON);
default:
- printf("Illegal draw mode %d\n", mode);
+ DBG("Illegal draw mode %d\n", mode);
assert(0);
return MALI_LINE_LOOP;
}
return MALI_DRAW_INDEXED_UINT32;
default:
- printf("Unknown index size %d\n", size);
+ DBG("Unknown index size %d\n", size);
assert(0);
return 0;
}
return (const uint8_t *) info->index.user;
} else {
struct panfrost_resource *rsrc = (struct panfrost_resource *) (info->index.resource);
- return (const uint8_t *) rsrc->bo->cpu[0];
+ return (const uint8_t *) rsrc->bo->cpu;
}
}
if (!info->has_user_indices) {
/* Only resources can be directly mapped */
- return rsrc->bo->gpu[0] + offset;
+ return rsrc->bo->gpu + offset;
} else {
/* Otherwise, we need to upload to transient memory */
const uint8_t *ibuf8 = panfrost_get_index_buffer_raw(info);
}
}
-static void
-panfrost_draw_vbo(
- struct pipe_context *pipe,
- const struct pipe_draw_info *info);
-
#define CALCULATE_MIN_MAX_INDEX(T, buffer, start, count) \
for (unsigned _idx = (start); _idx < (start + count); ++_idx) { \
T idx = buffer[_idx]; \
int mode = info->mode;
-#if 0
- /* Fallback for non-ES draw modes */
- /* Primconvert not needed on Midgard anymore due to native
- * QUADS/POLYGONS. Bifrost/desktop-GL may need it though so not
- * removing */
+ /* Fallback for unsupported modes */
- if (info->mode >= PIPE_PRIM_QUADS) {
- if (info->mode == PIPE_PRIM_QUADS && info->count == 4 && ctx->rasterizer && !ctx->rasterizer->base.flatshade) {
+ if (!(ctx->draw_modes & (1 << mode))) {
+ if (mode == PIPE_PRIM_QUADS && info->count == 4 && ctx->rasterizer && !ctx->rasterizer->base.flatshade) {
mode = PIPE_PRIM_TRIANGLE_FAN;
} else {
if (info->count < 4) {
return;
}
}
-#endif
+
+ /* Now that we have a guaranteed terminating path, find the job.
+ * Assignment commented out to prevent unused warning */
+
+ /* struct panfrost_job *job = */ panfrost_get_job_for_fbo(ctx);
ctx->payload_tiler.prefix.draw_mode = g2m_draw_mode(mode);
* rendering artefacts. It's not clear what these values mean yet. */
ctx->payload_tiler.prefix.unknown_draw &= ~(0x3000 | 0x18000);
- ctx->payload_tiler.prefix.unknown_draw |= (info->mode == PIPE_PRIM_POINTS || ctx->vertex_count > 65535) ? 0x3000 : 0x18000;
+ ctx->payload_tiler.prefix.unknown_draw |= (mode == PIPE_PRIM_POINTS || ctx->vertex_count > 65535) ? 0x3000 : 0x18000;
if (info->index_size) {
/* Calculate the min/max index used so we can figure out how
free(hwcso);
}
-static void
-panfrost_set_scissor(struct panfrost_context *ctx)
-{
- const struct pipe_scissor_state *ss = &ctx->scissor;
-
- if (ss && ctx->rasterizer && ctx->rasterizer->base.scissor && 0) {
- ctx->viewport->viewport0[0] = ss->minx;
- ctx->viewport->viewport0[1] = ss->miny;
- ctx->viewport->viewport1[0] = MALI_POSITIVE(ss->maxx);
- ctx->viewport->viewport1[1] = MALI_POSITIVE(ss->maxy);
- } else {
- ctx->viewport->viewport0[0] = 0;
- ctx->viewport->viewport0[1] = 0;
- ctx->viewport->viewport1[0] = MALI_POSITIVE(ctx->pipe_framebuffer.width);
- ctx->viewport->viewport1[1] = MALI_POSITIVE(ctx->pipe_framebuffer.height);
- }
-}
-
static void *
panfrost_create_rasterizer_state(
struct pipe_context *pctx,
const struct pipe_rasterizer_state *cso)
{
+ struct panfrost_context *ctx = pan_context(pctx);
struct panfrost_rasterizer *so = CALLOC_STRUCT(panfrost_rasterizer);
so->base = *cso;
/* Bitmask, unknown meaning of the start value */
-#ifdef T8XX
- so->tiler_gl_enables = 0x7;
-#else
- so->tiler_gl_enables = 0x105;
-#endif
+ so->tiler_gl_enables = ctx->is_t6xx ? 0x105 : 0x7;
so->tiler_gl_enables |= MALI_FRONT_FACE(
cso->front_ccw ? MALI_CCW : MALI_CW);
void *hwcso)
{
struct panfrost_context *ctx = pan_context(pctx);
- struct pipe_rasterizer_state *cso = hwcso;
/* TODO: Why can't rasterizer be NULL ever? Other drivers are fine.. */
if (!hwcso)
return;
- /* If scissor test has changed, we'll need to update that now */
- bool update_scissor = !ctx->rasterizer || ctx->rasterizer->base.scissor != cso->scissor;
-
ctx->rasterizer = hwcso;
-
- /* Actualise late changes */
- if (update_scissor)
- panfrost_set_scissor(ctx);
-
ctx->dirty |= PAN_DIRTY_RASTERIZER;
}
static void
panfrost_delete_vertex_elements_state(struct pipe_context *pctx, void *hwcso)
{
- printf("Vertex elements delete leaks descriptor\n");
+ struct panfrost_vertex_state *so = (struct panfrost_vertex_state *) hwcso;
+ unsigned bytes = sizeof(struct mali_attr_meta) * so->num_elements;
+ DBG("Vertex elements delete leaks descriptor (%d bytes)\n", bytes);
free(hwcso);
}
struct pipe_context *pctx,
void *so)
{
- printf("Deleting shader state maybe leaks tokens, per-variant compiled shaders, per-variant descriptors\n");
+ struct panfrost_shader_variants *cso = (struct panfrost_shader_variants *) so;
+
+ if (cso->base.type == PIPE_SHADER_IR_TGSI) {
+ DBG("Deleting TGSI shader leaks duplicated tokens\n");
+ }
+
+ unsigned leak = cso->variant_count * sizeof(struct mali_shader_meta);
+ DBG("Deleting shader state leaks descriptors (%d bytes), and shader bytecode\n", leak);
+
free(so);
}
cso->border_color.f[2],
cso->border_color.f[3]
},
- .min_lod = FIXED_16(0.0),
- .max_lod = FIXED_16(31.0),
+ .min_lod = FIXED_16(cso->min_lod),
+ .max_lod = FIXED_16(cso->max_lod),
.unknown2 = 1,
};
struct panfrost_resource *rsrc = (struct panfrost_resource *) (buf->buffer);
if (rsrc) {
- cpu = rsrc->bo->cpu[0];
+ cpu = rsrc->bo->cpu;
} else if (buf->user_buffer) {
cpu = buf->user_buffer;
} else {
- printf("No constant buffer?\n");
+ DBG("No constant buffer?\n");
return;
}
enum mali_format format = panfrost_find_format(desc);
+ bool is_depth = desc->format == PIPE_FORMAT_Z32_UNORM;
+
+ unsigned usage2_layout = 0x10;
+
+ switch (prsrc->bo->layout) {
+ case PAN_AFBC:
+ usage2_layout |= 0x8 | 0x4;
+ break;
+ case PAN_TILED:
+ usage2_layout |= 0x1;
+ break;
+ case PAN_LINEAR:
+ usage2_layout |= is_depth ? 0x1 : 0x2;
+ break;
+ default:
+ assert(0);
+ break;
+ }
+
struct mali_texture_descriptor texture_descriptor = {
.width = MALI_POSITIVE(texture->width0),
.height = MALI_POSITIVE(texture->height0),
.usage1 = 0x0,
.is_not_cubemap = 1,
- /* 0x11 - regular texture 2d, uncompressed tiled */
- /* 0x12 - regular texture 2d, uncompressed linear */
- /* 0x1c - AFBC compressed (internally tiled, probably) texture 2D */
-
- .usage2 = prsrc->bo->has_afbc ? 0x1c : (prsrc->bo->tiled ? 0x11 : 0x12),
+ .usage2 = usage2_layout
},
.swizzle = panfrost_translate_swizzle_4(user_swizzle)
/* TODO: Other base levels require adjusting dimensions / level numbers / etc */
assert (template->u.tex.first_level == 0);
- texture_descriptor.nr_mipmap_levels = template->u.tex.last_level - template->u.tex.first_level;
+ /* Disable mipmapping for now to avoid regressions while automipmapping
+ * is being implemented. TODO: Remove me once automipmaps work */
+
+ //texture_descriptor.nr_mipmap_levels = template->u.tex.last_level - template->u.tex.first_level;
+ texture_descriptor.nr_mipmap_levels = 0;
so->hw = texture_descriptor;
if (ctx->pipe_framebuffer.cbufs[i] == cb) continue;
if (cb && (i != 0)) {
- printf("XXX: Multiple render targets not supported before t7xx!\n");
+ DBG("XXX: Multiple render targets not supported before t7xx!\n");
assert(0);
}
if (!cb)
continue;
- ctx->vt_framebuffer = panfrost_emit_fbd(ctx);
+ if (ctx->require_sfbd)
+ ctx->vt_framebuffer_sfbd = panfrost_emit_sfbd(ctx);
+ else
+ ctx->vt_framebuffer_mfbd = panfrost_emit_mfbd(ctx);
+
panfrost_attach_vt_framebuffer(ctx);
- panfrost_new_frag_framebuffer(ctx);
- panfrost_set_scissor(ctx);
struct panfrost_resource *tex = ((struct panfrost_resource *) ctx->pipe_framebuffer.cbufs[i]->texture);
bool is_scanout = panfrost_is_scanout(ctx);
- if (!is_scanout && !tex->bo->has_afbc) {
+ if (!is_scanout && tex->bo->layout != PAN_AFBC) {
/* The blob is aggressive about enabling AFBC. As such,
* it's pretty much necessary to use it here, since we
* have no traces of non-compressed FBO. */
if (zb) {
/* FBO has depth */
- ctx->vt_framebuffer = panfrost_emit_fbd(ctx);
- panfrost_attach_vt_framebuffer(ctx);
- panfrost_new_frag_framebuffer(ctx);
- panfrost_set_scissor(ctx);
+ if (ctx->require_sfbd)
+ ctx->vt_framebuffer_sfbd = panfrost_emit_sfbd(ctx);
+ else
+ ctx->vt_framebuffer_mfbd = panfrost_emit_mfbd(ctx);
- struct panfrost_resource *tex = ((struct panfrost_resource *) ctx->pipe_framebuffer.zsbuf->texture);
+ panfrost_attach_vt_framebuffer(ctx);
- if (!tex->bo->has_afbc && !panfrost_is_scanout(ctx))
- panfrost_enable_afbc(ctx, tex, true);
+ /* Keep the depth FBO linear */
}
}
}
-
- /* Force a clear XXX wrong? */
- if (ctx->last_clear.color)
- panfrost_clear(&ctx->base, ctx->last_clear.buffers, ctx->last_clear.color, ctx->last_clear.depth, ctx->last_clear.stencil);
}
static void *
panfrost_delete_blend_state(struct pipe_context *pipe,
 void *blend)
{
- printf("Deleting blend state may leak blend shader\n");
+ struct panfrost_blend_state *so = (struct panfrost_blend_state *) blend;
+
+ if (so->has_blend_shader) {
+ /* TODO: free the compiled blend shader; for now we only
+ * report the (known, bounded) leak */
+ DBG("Deleting blend state leaks blend shader bytecode\n");
+ }
+
 free(blend);
}
assert(num_scissors == 1);
ctx->scissor = *scissors;
-
- panfrost_set_scissor(ctx);
}
static void
panfrost_destroy(struct pipe_context *pipe)
{
struct panfrost_context *panfrost = pan_context(pipe);
+ struct panfrost_screen *screen = pan_screen(pipe->screen);
if (panfrost->blitter)
util_blitter_destroy(panfrost->blitter);
+
+ screen->driver->free_slab(screen, &panfrost->scratchpad);
+ screen->driver->free_slab(screen, &panfrost->varying_mem);
+ screen->driver->free_slab(screen, &panfrost->shaders);
+ screen->driver->free_slab(screen, &panfrost->tiler_heap);
+ screen->driver->free_slab(screen, &panfrost->misc_0);
}
static struct pipe_query *
struct panfrost_query *query = (struct panfrost_query *) q;
switch (query->type) {
+ case PIPE_QUERY_OCCLUSION_COUNTER:
case PIPE_QUERY_OCCLUSION_PREDICATE:
case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
{
}
default:
- fprintf(stderr, "Skipping query %d\n", query->type);
+ DBG("Skipping query %d\n", query->type);
break;
}
panfrost_flush(pipe, NULL, PIPE_FLUSH_END_OF_FRAME);
switch (query->type) {
+ case PIPE_QUERY_OCCLUSION_COUNTER:
case PIPE_QUERY_OCCLUSION_PREDICATE:
case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: {
/* Read back the query results */
unsigned *result = (unsigned *) query->transfer.cpu;
unsigned passed = *result;
- vresult->b = !!passed;
+ if (query->type == PIPE_QUERY_OCCLUSION_COUNTER) {
+ vresult->u64 = passed;
+ } else {
+ vresult->b = !!passed;
+ }
+
break;
}
default:
- fprintf(stderr, "Skipped query get %d\n", query->type);
+ DBG("Skipped query get %d\n", query->type);
break;
}
return true;
}
+/* Create a stream output (transform feedback) target wrapping the given
+ * resource. Only generic pipe-state bookkeeping happens here; the driver
+ * does not consume the targets yet (set_stream_output_targets is a stub). */
+static struct pipe_stream_output_target *
+panfrost_create_stream_output_target(struct pipe_context *pctx,
+ struct pipe_resource *prsc,
+ unsigned buffer_offset,
+ unsigned buffer_size)
+{
+ struct pipe_stream_output_target *target;
+
+ target = CALLOC_STRUCT(pipe_stream_output_target);
+
+ if (!target)
+ return NULL;
+
+ /* Take a reference on the backing buffer; dropped in destroy */
+ pipe_reference_init(&target->reference, 1);
+ pipe_resource_reference(&target->buffer, prsc);
+
+ target->context = pctx;
+ target->buffer_offset = buffer_offset;
+ target->buffer_size = buffer_size;
+
+ return target;
+}
+
+/* Destroy a stream output target: release the buffer reference taken at
+ * creation and free the wrapper struct itself. */
+static void
+panfrost_stream_output_target_destroy(struct pipe_context *pctx,
+ struct pipe_stream_output_target *target)
+{
+ pipe_resource_reference(&target->buffer, NULL);
+ free(target);
+}
+
+static void
+panfrost_set_stream_output_targets(struct pipe_context *pctx,
+ unsigned num_targets,
+ struct pipe_stream_output_target **targets,
+ const unsigned *offsets)
+{
+ /* STUB: transform feedback is not implemented yet; bound targets
+ * are accepted but silently ignored */
+}
+
static void
panfrost_setup_hardware(struct panfrost_context *ctx)
{
}
screen->driver->allocate_slab(screen, &ctx->scratchpad, 64, false, 0, 0, 0);
- screen->driver->allocate_slab(screen, &ctx->varying_mem, 16384, false, 0, 0, 0);
+ screen->driver->allocate_slab(screen, &ctx->varying_mem, 16384, false, PAN_ALLOCATE_INVISIBLE | PAN_ALLOCATE_COHERENT_LOCAL, 0, 0);
screen->driver->allocate_slab(screen, &ctx->shaders, 4096, true, PAN_ALLOCATE_EXECUTE, 0, 0);
- screen->driver->allocate_slab(screen, &ctx->tiler_heap, 32768, false, PAN_ALLOCATE_GROWABLE, 1, 128);
- screen->driver->allocate_slab(screen, &ctx->misc_0, 128, false, PAN_ALLOCATE_GROWABLE, 1, 128);
+ screen->driver->allocate_slab(screen, &ctx->tiler_heap, 32768, false, PAN_ALLOCATE_INVISIBLE | PAN_ALLOCATE_GROWABLE, 1, 128);
+ screen->driver->allocate_slab(screen, &ctx->misc_0, 128*128, false, PAN_ALLOCATE_INVISIBLE | PAN_ALLOCATE_GROWABLE, 1, 128);
}
panfrost_create_context(struct pipe_screen *screen, void *priv, unsigned flags)
{
struct panfrost_context *ctx = CALLOC_STRUCT(panfrost_context);
+ struct panfrost_screen *pscreen = pan_screen(screen);
memset(ctx, 0, sizeof(*ctx));
struct pipe_context *gallium = (struct pipe_context *) ctx;
+ unsigned gpu_id;
+
+ gpu_id = pscreen->driver->query_gpu_version(pscreen);
+
+ ctx->is_t6xx = gpu_id <= 0x0750; /* For now, this flag means T760 or less */
+ ctx->require_sfbd = gpu_id < 0x0750; /* T760 is the first to support MFBD */
gallium->screen = screen;
gallium->end_query = panfrost_end_query;
gallium->get_query_result = panfrost_get_query_result;
+ gallium->create_stream_output_target = panfrost_create_stream_output_target;
+ gallium->stream_output_target_destroy = panfrost_stream_output_target_destroy;
+ gallium->set_stream_output_targets = panfrost_set_stream_output_targets;
+
panfrost_resource_context_init(gallium);
+ pscreen->driver->init_context(ctx);
+
panfrost_setup_hardware(ctx);
/* XXX: leaks */
gallium->const_uploader = gallium->stream_uploader;
assert(gallium->stream_uploader);
- ctx->primconvert = util_primconvert_create(gallium,
- (1 << PIPE_PRIM_QUADS) - 1);
- assert(ctx->primconvert);
+ /* Midgard supports ES modes, plus QUADS/QUAD_STRIPS/POLYGON */
+ ctx->draw_modes = (1 << (PIPE_PRIM_POLYGON + 1)) - 1;
+
+ ctx->primconvert = util_primconvert_create(gallium, ctx->draw_modes);
ctx->blitter = util_blitter_create(gallium);
assert(ctx->blitter);
/* Prepare for render! */
- /* TODO: XXX */
- ctx->vt_framebuffer = panfrost_emit_fbd(ctx);
-
+ panfrost_job_init(ctx);
panfrost_emit_vertex_payload(ctx);
panfrost_emit_tiler_payload(ctx);
panfrost_invalidate_frame(ctx);
- panfrost_viewport(ctx, 0.0, 1.0, 0, 0, ctx->pipe_framebuffer.width, ctx->pipe_framebuffer.height);
panfrost_default_shader_backend(ctx);
panfrost_generate_space_filler_indices();