panfrost: nr_mipmap_levels -> levels
diff --git a/src/gallium/drivers/panfrost/pan_context.c b/src/gallium/drivers/panfrost/pan_context.c
index 014f8f6a9d07a172e4192ac9ed35de00674008d5..1642bb436facdcb15372329fbe5ec15436ed3bba 100644
@@ -1,5 +1,7 @@
 /*
  * © Copyright 2018 Alyssa Rosenzweig
+ * Copyright © 2014-2017 Broadcom
+ * Copyright (C) 2017 Intel Corporation
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -37,6 +39,7 @@
 #include "util/half_float.h"
 #include "util/u_helpers.h"
 #include "util/u_format.h"
+#include "util/u_prim.h"
 #include "util/u_prim_restart.h"
 #include "indices/u_primconvert.h"
 #include "tgsi/tgsi_parse.h"
 #include "pan_blending.h"
 #include "pan_blend_shaders.h"
 #include "pan_util.h"
-#include "pan_tiler.h"
-
-/* Do not actually send anything to the GPU; merely generate the cmdstream as fast as possible. Disables framebuffer writes */
-//#define DRY_RUN
 
 /* Framebuffer descriptor */
 
@@ -62,6 +61,7 @@ panfrost_emit_midg_tiler(
         unsigned vertex_count)
 {
         struct midgard_tiler_descriptor t = {};
+        struct panfrost_job *batch = panfrost_get_job_for_fbo(ctx);
 
         t.hierarchy_mask =
                 panfrost_choose_hierarchy_mask(width, height, vertex_count);
@@ -71,16 +71,15 @@ panfrost_emit_midg_tiler(
         unsigned header_size = panfrost_tiler_header_size(
                                        width, height, t.hierarchy_mask);
 
-        unsigned body_size = panfrost_tiler_body_size(
+        t.polygon_list_size = panfrost_tiler_full_size(
                                      width, height, t.hierarchy_mask);
 
         /* Sanity check */
 
         if (t.hierarchy_mask) {
-                assert(ctx->tiler_polygon_list.bo->size >= (header_size + body_size));
+                t.polygon_list = panfrost_job_get_polygon_list(batch,
+                                header_size + t.polygon_list_size);
 
-                /* Specify allocated tiler structures */
-                t.polygon_list = ctx->tiler_polygon_list.bo->gpu;
 
                 /* Allow the entire tiler heap */
                 t.heap_start = ctx->tiler_heap.bo->gpu;
@@ -94,16 +93,13 @@ panfrost_emit_midg_tiler(
                 /* Use a dummy polygon list */
                 t.polygon_list = ctx->tiler_dummy.bo->gpu;
 
-                /* Also, set a "tiler disabled?" flag? */
-                t.hierarchy_mask |= 0x1000;
+                /* Disable the tiler */
+                t.hierarchy_mask |= MALI_TILER_DISABLED;
         }
 
         t.polygon_list_body =
                 t.polygon_list + header_size;
 
-        t.polygon_list_size =
-                header_size + body_size;
-
         return t;
 }
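
For reference: judging from the removed "t.hierarchy_mask |= 0x1000" line
above, MALI_TILER_DISABLED presumably names that same bit. A minimal sketch,
under that assumption:

    /* Hypothetical definition; the real one lives in the Mali command-stream
     * headers alongside the other MALI_* flags. */
    #define MALI_TILER_DISABLED (1 << 12) /* 0x1000 */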
 
@@ -189,21 +185,25 @@ panfrost_clear(
 static mali_ptr
 panfrost_attach_vt_mfbd(struct panfrost_context *ctx)
 {
-        return panfrost_upload_transient(ctx, &ctx->vt_framebuffer_mfbd, sizeof(ctx->vt_framebuffer_mfbd)) | MALI_MFBD;
+        struct bifrost_framebuffer mfbd = panfrost_emit_mfbd(ctx, ~0);
+
+        return panfrost_upload_transient(ctx, &mfbd, sizeof(mfbd)) | MALI_MFBD;
 }
 
 static mali_ptr
 panfrost_attach_vt_sfbd(struct panfrost_context *ctx)
 {
-        return panfrost_upload_transient(ctx, &ctx->vt_framebuffer_sfbd, sizeof(ctx->vt_framebuffer_sfbd)) | MALI_SFBD;
+        struct mali_single_framebuffer sfbd = panfrost_emit_sfbd(ctx, ~0);
+
+        return panfrost_upload_transient(ctx, &sfbd, sizeof(sfbd)) | MALI_SFBD;
 }
 
 static void
-panfrost_attach_vt_framebuffer(struct panfrost_context *ctx, bool skippable)
+panfrost_attach_vt_framebuffer(struct panfrost_context *ctx)
 {
         /* Skip the attach if we can */
 
-        if (skippable && ctx->payloads[PIPE_SHADER_VERTEX].postfix.framebuffer) {
+        if (ctx->payloads[PIPE_SHADER_VERTEX].postfix.framebuffer) {
                 assert(ctx->payloads[PIPE_SHADER_FRAGMENT].postfix.framebuffer);
                 return;
         }
@@ -223,13 +223,6 @@ panfrost_attach_vt_framebuffer(struct panfrost_context *ctx, bool skippable)
 static void
 panfrost_invalidate_frame(struct panfrost_context *ctx)
 {
-        struct panfrost_screen *screen = pan_screen(ctx->base.screen);
-
-        if (screen->require_sfbd)
-                ctx->vt_framebuffer_sfbd = panfrost_emit_sfbd(ctx, ~0);
-        else
-                ctx->vt_framebuffer_mfbd = panfrost_emit_mfbd(ctx, ~0);
-
         for (unsigned i = 0; i < PIPE_SHADER_TYPES; ++i)
                 ctx->payloads[i].postfix.framebuffer = 0;
 
@@ -238,6 +231,9 @@ panfrost_invalidate_frame(struct panfrost_context *ctx)
 
         /* XXX */
         ctx->dirty |= PAN_DIRTY_SAMPLERS | PAN_DIRTY_TEXTURES;
+
+        /* TODO: When does this need to be handled? */
+        ctx->active_queries = true;
 }
 
 /* In practice, every field of these payloads should be configurable
@@ -278,6 +274,8 @@ translate_tex_wrap(enum pipe_tex_wrap w)
         case PIPE_TEX_WRAP_REPEAT:
                 return MALI_WRAP_REPEAT;
 
+                /* TODO: lower GL_CLAMP? */
+        case PIPE_TEX_WRAP_CLAMP:
         case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
                 return MALI_WRAP_CLAMP_TO_EDGE;
 
@@ -467,162 +465,6 @@ panfrost_vertex_tiler_job(struct panfrost_context *ctx, bool is_tiler)
         return transfer;
 }
 
-static mali_ptr
-panfrost_emit_varyings(
-        struct panfrost_context *ctx,
-        union mali_attr *slot,
-        unsigned stride,
-        unsigned count)
-{
-        /* Fill out the descriptor */
-        slot->stride = stride;
-        slot->size = stride * count;
-        slot->shift = slot->extra_flags = 0;
-
-        struct panfrost_transfer transfer =
-                panfrost_allocate_transient(ctx, slot->size);
-
-        slot->elements = transfer.gpu | MALI_ATTR_LINEAR;
-
-        return transfer.gpu;
-}
-
-static void
-panfrost_emit_point_coord(union mali_attr *slot)
-{
-        slot->elements = MALI_VARYING_POINT_COORD | MALI_ATTR_LINEAR;
-        slot->stride = slot->size = slot->shift = slot->extra_flags = 0;
-}
-
-static void
-panfrost_emit_front_face(union mali_attr *slot)
-{
-        slot->elements = MALI_VARYING_FRONT_FACING | MALI_ATTR_INTERNAL;
-}
-
-static void
-panfrost_emit_varying_descriptor(
-        struct panfrost_context *ctx,
-        unsigned vertex_count)
-{
-        /* Load the shaders */
-
-        struct panfrost_shader_state *vs = &ctx->shader[PIPE_SHADER_VERTEX]->variants[ctx->shader[PIPE_SHADER_VERTEX]->active_variant];
-        struct panfrost_shader_state *fs = &ctx->shader[PIPE_SHADER_FRAGMENT]->variants[ctx->shader[PIPE_SHADER_FRAGMENT]->active_variant];
-        unsigned int num_gen_varyings = 0;
-
-        /* Allocate the varying descriptor */
-
-        size_t vs_size = sizeof(struct mali_attr_meta) * vs->tripipe->varying_count;
-        size_t fs_size = sizeof(struct mali_attr_meta) * fs->tripipe->varying_count;
-
-        struct panfrost_transfer trans = panfrost_allocate_transient(ctx,
-                                         vs_size + fs_size);
-
-        /*
-         * Assign ->src_offset now that we know about all the general purpose
-         * varyings that will be used by the fragment and vertex shaders.
-         */
-        for (unsigned i = 0; i < vs->tripipe->varying_count; i++) {
-                /*
-                 * General purpose varyings have ->index set to 0, skip other
-                 * entries.
-                 */
-                if (vs->varyings[i].index)
-                        continue;
-
-                vs->varyings[i].src_offset = 16 * (num_gen_varyings++);
-        }
-
-        for (unsigned i = 0; i < fs->tripipe->varying_count; i++) {
-                unsigned j;
-
-                /* If we have a point sprite replacement, handle that here. We
-                 * have to translate location first.  TODO: Flip y in shader.
-                 * We're already keying ... just time crunch .. */
-
-                unsigned loc = fs->varyings_loc[i];
-                unsigned pnt_loc =
-                        (loc >= VARYING_SLOT_VAR0) ? (loc - VARYING_SLOT_VAR0) :
-                        (loc == VARYING_SLOT_PNTC) ? 8 :
-                        ~0;
-
-                if (~pnt_loc && fs->point_sprite_mask & (1 << pnt_loc)) {
-                        /* gl_PointCoord index by convention */
-                        fs->varyings[i].index = 3;
-                        fs->reads_point_coord = true;
-
-                        /* Swizzle out the z/w to 0/1 */
-                        fs->varyings[i].format = MALI_RG16F;
-                        fs->varyings[i].swizzle =
-                                panfrost_get_default_swizzle(2);
-
-                        continue;
-                }
-
-                if (fs->varyings[i].index)
-                        continue;
-
-                /*
-                 * Re-use the VS general purpose varying pos if it exists,
-                 * create a new one otherwise.
-                 */
-                for (j = 0; j < vs->tripipe->varying_count; j++) {
-                        if (fs->varyings_loc[i] == vs->varyings_loc[j])
-                                break;
-                }
-
-                if (j < vs->tripipe->varying_count)
-                        fs->varyings[i].src_offset = vs->varyings[j].src_offset;
-                else
-                        fs->varyings[i].src_offset = 16 * (num_gen_varyings++);
-        }
-
-        memcpy(trans.cpu, vs->varyings, vs_size);
-        memcpy(trans.cpu + vs_size, fs->varyings, fs_size);
-
-        ctx->payloads[PIPE_SHADER_VERTEX].postfix.varying_meta = trans.gpu;
-        ctx->payloads[PIPE_SHADER_FRAGMENT].postfix.varying_meta = trans.gpu + vs_size;
-
-        /* Buffer indices must be in this order per our convention */
-        union mali_attr varyings[PIPE_MAX_ATTRIBS];
-        unsigned idx = 0;
-
-        panfrost_emit_varyings(ctx, &varyings[idx++], num_gen_varyings * 16,
-                               vertex_count);
-
-        /* fp32 vec4 gl_Position */
-        ctx->payloads[PIPE_SHADER_FRAGMENT].postfix.position_varying =
-                panfrost_emit_varyings(ctx, &varyings[idx++],
-                                       sizeof(float) * 4, vertex_count);
-
-
-        if (vs->writes_point_size || fs->reads_point_coord) {
-                /* fp16 vec1 gl_PointSize */
-                ctx->payloads[PIPE_SHADER_FRAGMENT].primitive_size.pointer =
-                        panfrost_emit_varyings(ctx, &varyings[idx++],
-                                               2, vertex_count);
-        } else if (fs->reads_face) {
-                /* Dummy to advance index */
-                ++idx;
-        }
-
-        if (fs->reads_point_coord) {
-                /* Special descriptor */
-                panfrost_emit_point_coord(&varyings[idx++]);
-        } else if (fs->reads_face) {
-                ++idx;
-        }
-
-        if (fs->reads_face) {
-                panfrost_emit_front_face(&varyings[idx++]);
-        }
-
-        mali_ptr varyings_p = panfrost_upload_transient(ctx, &varyings, idx * sizeof(union mali_attr));
-        ctx->payloads[PIPE_SHADER_VERTEX].postfix.varyings = varyings_p;
-        ctx->payloads[PIPE_SHADER_FRAGMENT].postfix.varyings = varyings_p;
-}
-
 mali_ptr
 panfrost_vertex_buffer_address(struct panfrost_context *ctx, unsigned i)
 {
@@ -677,7 +519,7 @@ panfrost_stage_attributes(struct panfrost_context *ctx)
          * QED.
          */
 
-        unsigned start = ctx->payloads[PIPE_SHADER_VERTEX].draw_start;
+        unsigned start = ctx->payloads[PIPE_SHADER_VERTEX].offset_start;
 
         for (unsigned i = 0; i < so->num_elements; ++i) {
                 unsigned vbi = so->pipe[i].vertex_buffer_index;
@@ -840,6 +682,7 @@ struct sysval_uniform {
                 float f[4];
                 int32_t i[4];
                 uint32_t u[4];
+                uint64_t du[2];
         };
 };
 
@@ -888,6 +731,34 @@ static void panfrost_upload_txs_sysval(struct panfrost_context *ctx,
                 uniform->i[dim] = tex->texture->array_size;
 }
 
+static void panfrost_upload_ssbo_sysval(
+        struct panfrost_context *ctx,
+        enum pipe_shader_type st,
+        unsigned ssbo_id,
+        struct sysval_uniform *uniform)
+{
+        assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
+        struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
+
+        /* Compute address */
+        struct panfrost_job *batch = panfrost_get_job_for_fbo(ctx);
+        struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
+
+        panfrost_job_add_bo(batch, bo);
+
+        /* Upload address and size as sysval */
+        uniform->du[0] = bo->gpu + sb.buffer_offset;
+        uniform->u[2] = sb.buffer_size;
+}
+
+static void panfrost_upload_num_work_groups_sysval(struct panfrost_context *ctx,
+                struct sysval_uniform *uniform)
+{
+        uniform->u[0] = ctx->compute_grid->grid[0];
+        uniform->u[1] = ctx->compute_grid->grid[1];
+        uniform->u[2] = ctx->compute_grid->grid[2];
+}
+
 static void panfrost_upload_sysvals(struct panfrost_context *ctx, void *buf,
                                     struct panfrost_shader_state *ss,
                                     enum pipe_shader_type st)
@@ -908,6 +779,14 @@ static void panfrost_upload_sysvals(struct panfrost_context *ctx, void *buf,
                         panfrost_upload_txs_sysval(ctx, st, PAN_SYSVAL_ID(sysval),
                                                    &uniforms[i]);
                         break;
+                case PAN_SYSVAL_SSBO:
+                        panfrost_upload_ssbo_sysval(ctx, st, PAN_SYSVAL_ID(sysval),
+                                                    &uniforms[i]);
+                        break;
+                case PAN_SYSVAL_NUM_WORK_GROUPS:
+                        panfrost_upload_num_work_groups_sysval(ctx, &uniforms[i]);
+                        break;
+
                 default:
                         assert(0);
                 }
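
To illustrate the union layout the SSBO sysval relies on: du[0] aliases u[0]
and u[1] (on the little-endian CPUs these GPUs are paired with), so the 64-bit
address fills the first two words of the vec4 and the size goes in the third.
A sketch with made-up values:

    struct sysval_uniform u = { 0 };
    u.du[0] = 0x1000000ULL + 256; /* bo->gpu + sb.buffer_offset -> u[0], u[1] */
    u.u[2]  = 4096;               /* sb.buffer_size -> third word; u[3] unused */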
@@ -1016,7 +895,7 @@ panfrost_emit_for_draw(struct panfrost_context *ctx, bool with_vertex_data)
         struct panfrost_job *job = panfrost_get_job_for_fbo(ctx);
         struct panfrost_screen *screen = pan_screen(ctx->base.screen);
 
-        panfrost_attach_vt_framebuffer(ctx, true);
+        panfrost_attach_vt_framebuffer(ctx);
 
         if (with_vertex_data) {
                 panfrost_emit_vertex_data(job);
@@ -1067,6 +946,8 @@ panfrost_emit_for_draw(struct panfrost_context *ctx, bool with_vertex_data)
 
                 panfrost_patch_shader_state(ctx, variant, PIPE_SHADER_FRAGMENT, false);
 
+                panfrost_job_add_bo(job, variant->bo);
+
 #define COPY(name) ctx->fragment_shader_core.name = variant->tripipe->name
 
                 COPY(shader);
@@ -1084,13 +965,19 @@ panfrost_emit_for_draw(struct panfrost_context *ctx, bool with_vertex_data)
 #undef COPY
 
                 /* Get blending setup */
-                struct panfrost_blend_final blend =
-                        panfrost_get_blend_for_context(ctx, 0);
+                unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
+
+                struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
 
-                /* If there is a blend shader, work registers are shared */
+                for (unsigned c = 0; c < rt_count; ++c)
+                        blend[c] = panfrost_get_blend_for_context(ctx, c);
 
-                if (blend.is_shader)
-                        ctx->fragment_shader_core.midgard1.work_count = /*MAX2(ctx->fragment_shader_core.midgard1.work_count, ctx->blend->blend_work_count)*/16;
+                /* If there is a blend shader, work registers are shared. XXX: opt */
+
+                for (unsigned c = 0; c < rt_count; ++c) {
+                        if (blend[c].is_shader)
+                                ctx->fragment_shader_core.midgard1.work_count = 16;
+                }
 
                 /* Set late due to depending on render state */
                 unsigned flags = ctx->fragment_shader_core.midgard1.flags;
@@ -1130,22 +1017,14 @@ panfrost_emit_for_draw(struct panfrost_context *ctx, bool with_vertex_data)
                         ctx->fragment_shader_core.midgard1.flags |= 0x400;
                 }
 
-                /* Check if we're using the default blend descriptor (fast path) */
-
-                bool no_blending =
-                        !blend.is_shader &&
-                        (blend.equation.equation->rgb_mode == 0x122) &&
-                        (blend.equation.equation->alpha_mode == 0x122) &&
-                        (blend.equation.equation->color_mask == 0xf);
-
                 /* Even on MFBD, the shader descriptor gets blend shaders. It's
                  * *also* copied to the blend_meta appended (by convention),
                  * but this is the field actually read by the hardware. (Or
                  * maybe both are read...?) */
 
-                if (blend.is_shader) {
+                if (blend[0].is_shader) {
                         ctx->fragment_shader_core.blend.shader =
-                                blend.shader.gpu;
+                                blend[0].shader.bo->gpu | blend[0].shader.first_tag;
                 } else {
                         ctx->fragment_shader_core.blend.shader = 0;
                 }
@@ -1156,19 +1035,19 @@ panfrost_emit_for_draw(struct panfrost_context *ctx, bool with_vertex_data)
                          * additionally need to signal CAN_DISCARD for nontrivial blend
                          * modes (so we're able to read back the destination buffer) */
 
-                        if (!blend.is_shader) {
+                        if (!blend[0].is_shader) {
                                 ctx->fragment_shader_core.blend.equation =
-                                        *blend.equation.equation;
+                                        *blend[0].equation.equation;
                                 ctx->fragment_shader_core.blend.constant =
-                                        blend.equation.constant;
+                                        blend[0].equation.constant;
                         }
 
-                        if (!no_blending) {
+                        if (!blend[0].no_blending) {
                                 ctx->fragment_shader_core.unknown2_3 |= MALI_CAN_DISCARD;
                         }
                 }
 
-                size_t size = sizeof(struct mali_shader_meta) + sizeof(struct midgard_blend_rt);
+                size_t size = sizeof(struct mali_shader_meta) + (sizeof(struct midgard_blend_rt) * rt_count);
                 struct panfrost_transfer transfer = panfrost_allocate_transient(ctx, size);
                 memcpy(transfer.cpu, &ctx->fragment_shader_core, sizeof(struct mali_shader_meta));
 
@@ -1177,27 +1056,27 @@ panfrost_emit_for_draw(struct panfrost_context *ctx, bool with_vertex_data)
                 if (!screen->require_sfbd) {
                         /* Additional blend descriptor tacked on for jobs using MFBD */
 
-                        unsigned blend_count = 0x200;
+                        struct midgard_blend_rt rts[4];
 
-                        if (blend.is_shader) {
-                                /* For a blend shader, the bottom nibble corresponds to
-                                 * the number of work registers used, which signals the
-                                 * -existence- of a blend shader */
+                        for (unsigned i = 0; i < rt_count; ++i) {
+                                unsigned blend_count = 0x200;
 
-                                assert(blend.shader.work_count >= 2);
-                                blend_count |= MIN2(blend.shader.work_count, 3);
-                        } else {
-                                /* Otherwise, the bottom bit simply specifies if
-                                 * blending (anything other than REPLACE) is enabled */
+                                if (blend[i].is_shader) {
+                                        /* For a blend shader, the bottom nibble corresponds to
+                                         * the number of work registers used, which signals the
+                                         * -existence- of a blend shader */
 
+                                        assert(blend[i].shader.work_count >= 2);
+                                        blend_count |= MIN2(blend[i].shader.work_count, 3);
+                                } else {
+                                        /* Otherwise, the bottom bit simply specifies if
+                                         * blending (anything other than REPLACE) is enabled */
 
-                                if (!no_blending)
-                                        blend_count |= 0x1;
-                        }
+                                        if (!blend[i].no_blending)
+                                                blend_count |= 0x1;
+                                }
 
-                        struct midgard_blend_rt rts[4];
 
-                        for (unsigned i = 0; i < ctx->pipe_framebuffer.nr_cbufs; ++i) {
                                 bool is_srgb =
                                         (ctx->pipe_framebuffer.nr_cbufs > i) &&
                                         (ctx->pipe_framebuffer.cbufs[i]) &&
@@ -1217,17 +1096,17 @@ panfrost_emit_for_draw(struct panfrost_context *ctx, bool with_vertex_data)
                                  * native Midgard ops for helping here, but
                                  * they're not well-understood yet. */
 
-                                assert(!(is_srgb && blend.is_shader));
+                                assert(!(is_srgb && blend[i].is_shader));
 
-                                if (blend.is_shader) {
-                                        rts[i].blend.shader = blend.shader.gpu;
+                                if (blend[i].is_shader) {
+                                        rts[i].blend.shader = blend[i].shader.bo->gpu | blend[i].shader.first_tag;
                                 } else {
-                                        rts[i].blend.equation = *blend.equation.equation;
-                                        rts[i].blend.constant = blend.equation.constant;
+                                        rts[i].blend.equation = *blend[i].equation.equation;
+                                        rts[i].blend.constant = blend[i].equation.constant;
                                 }
                         }
 
-                        memcpy(transfer.cpu + sizeof(struct mali_shader_meta), rts, sizeof(rts[0]) * 1);
+                        memcpy(transfer.cpu + sizeof(struct mali_shader_meta), rts, sizeof(rts[0]) * rt_count);
                 }
         }
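
Worked out, the blend_count encoding built above (0x200 base, bottom nibble
from the blend shader's work-register count, bottom bit for fixed-function
blending) gives:

    /* Fixed-function, pure REPLACE (no_blending):    0x200       */
    /* Fixed-function with blending enabled:          0x200 | 0x1 */
    /* Blend shader using 2 work registers:           0x200 | 0x2 */
    /* Blend shader using 3+ work registers (capped): 0x200 | 0x3 */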
 
@@ -1460,8 +1339,6 @@ panfrost_submit_frame(struct panfrost_context *ctx, bool flush_immediate,
         struct pipe_context *gallium = (struct pipe_context *) ctx;
         struct panfrost_screen *screen = pan_screen(gallium->screen);
 
-#ifndef DRY_RUN
-
         panfrost_job_submit(ctx, job);
 
         /* If visual, we can stall a frame */
@@ -1475,7 +1352,6 @@ panfrost_submit_frame(struct panfrost_context *ctx, bool flush_immediate,
         /* If readback, flush now (hurts the pipelined performance) */
         if (flush_immediate)
                 panfrost_drm_force_flush_fragment(ctx, fence);
-#endif
 }
 
 static void
@@ -1500,7 +1376,85 @@ panfrost_draw_wallpaper(struct pipe_context *pipe)
         struct panfrost_job *batch = panfrost_get_job_for_fbo(ctx);
 
         ctx->wallpaper_batch = batch;
-        panfrost_blit_wallpaper(ctx);
+
+        /* Clamp the rendering area to the damage extent. The
+         * EGL_KHR_partial_update spec states that trying to render outside
+         * of the damage region is "undefined behavior", so we should be
+         * safe.
+         */
+        unsigned damage_width = (rsrc->damage.extent.maxx - rsrc->damage.extent.minx);
+        unsigned damage_height = (rsrc->damage.extent.maxy - rsrc->damage.extent.miny);
+
+        if (damage_width && damage_height) {
+                panfrost_job_intersection_scissor(batch, rsrc->damage.extent.minx,
+                                                  rsrc->damage.extent.miny,
+                                                  rsrc->damage.extent.maxx,
+                                                  rsrc->damage.extent.maxy);
+        }
+
+        /* FIXME: Looks like aligning on a tile is not enough, but
+         * aligning on twice the tile size seems to work. We don't
+         * know exactly what happens here, but this deserves extra
+         * investigation to figure it out.
+         */
+        batch->minx = batch->minx & ~((MALI_TILE_LENGTH * 2) - 1);
+        batch->miny = batch->miny & ~((MALI_TILE_LENGTH * 2) - 1);
+        batch->maxx = MIN2(ALIGN_POT(batch->maxx, MALI_TILE_LENGTH * 2),
+                           rsrc->base.width0);
+        batch->maxy = MIN2(ALIGN_POT(batch->maxy, MALI_TILE_LENGTH * 2),
+                           rsrc->base.height0);
+
+        struct pipe_scissor_state damage;
+        struct pipe_box rects[4];
+
+        /* Clamp the damage box to the rendering area. */
+        damage.minx = MAX2(batch->minx, rsrc->damage.biggest_rect.x);
+        damage.miny = MAX2(batch->miny, rsrc->damage.biggest_rect.y);
+        damage.maxx = MIN2(batch->maxx,
+                           rsrc->damage.biggest_rect.x +
+                           rsrc->damage.biggest_rect.width);
+        damage.maxy = MIN2(batch->maxy,
+                           rsrc->damage.biggest_rect.y +
+                           rsrc->damage.biggest_rect.height);
+
+        /* One damage rectangle means we can end up with at most 4 reload
+         * regions:
+         * 1: left region, only exists if damage.x > 0
+         * 2: right region, only exists if damage.x + damage.width < fb->width
+         * 3: top region, only exists if damage.y > 0. Its intersections with
+         *    the left and right regions are dropped
+         * 4: bottom region, only exists if damage.y + damage.height < fb->height.
+         *    Its intersections with the left and right regions are dropped
+         *
+         *                    ____________________________
+         *                    |       |     3     |      |
+         *                    |       |___________|      |
+         *                    |       |   damage  |      |
+         *                    |   1   |    rect   |   2  |
+         *                    |       |___________|      |
+         *                    |       |     4     |      |
+         *                    |_______|___________|______|
+         */
+        u_box_2d(batch->minx, batch->miny, damage.minx - batch->minx,
+                 batch->maxy - batch->miny, &rects[0]);
+        u_box_2d(damage.maxx, batch->miny, batch->maxx - damage.maxx,
+                 batch->maxy - batch->miny, &rects[1]);
+        u_box_2d(damage.minx, batch->miny, damage.maxx - damage.minx,
+                 damage.miny - batch->miny, &rects[2]);
+        u_box_2d(damage.minx, damage.maxy, damage.maxx - damage.minx,
+                 batch->maxy - damage.maxy, &rects[3]);
+
+        for (unsigned i = 0; i < 4; i++) {
+                /* Width and height are always >= 0 even though width is
+                 * declared as a signed integer: the u_box_2d() helper takes
+                 * unsigned args and panfrost_set_damage_region() takes care
+                 * of clamping negative values.
+                 */
+                if (!rects[i].width || !rects[i].height)
+                        continue;
+
+                /* Blit the wallpaper in */
+                panfrost_blit_wallpaper(ctx, &rects[i]);
+        }
         ctx->wallpaper_batch = NULL;
 }
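
As a concrete instance of the decomposition above (numbers hypothetical): a
256x256 batch with a 128x128 damage rect at (64, 64) produces

    /* batch = (0, 0)..(256, 256), damage = (64, 64)..(192, 192) */
    u_box_2d(  0,   0,  64, 256, &rects[0]); /* 1: left   */
    u_box_2d(192,   0,  64, 256, &rects[1]); /* 2: right  */
    u_box_2d( 64,   0, 128,  64, &rects[2]); /* 3: top    */
    u_box_2d( 64, 192, 128,  64, &rects[3]); /* 4: bottom */

Note how the top and bottom boxes reuse damage.minx/damage.maxx, which is what
drops their overlap with the side regions.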
 
@@ -1608,6 +1562,26 @@ panfrost_scissor_culls_everything(struct panfrost_context *ctx)
         return (ss->minx == ss->maxx) || (ss->miny == ss->maxy);
 }
 
+/* Count generated primitives (when there are no geom/tess shaders) for
+ * transform feedback */
+
+static void
+panfrost_statistics_record(
+                struct panfrost_context *ctx,
+                const struct pipe_draw_info *info)
+{
+        if (!ctx->active_queries)
+                return;
+
+        uint32_t prims = u_prims_for_vertices(info->mode, info->count);
+        ctx->prims_generated += prims;
+
+        if (ctx->streamout.num_targets <= 0)
+                return;
+
+        ctx->tf_prims_generated += prims;
+}
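
For reference, u_prims_for_vertices() (from util/u_prim.h, pulled in by the
new include at the top of this patch) divides out the topology's per-primitive
vertex requirements:

    u_prims_for_vertices(PIPE_PRIM_TRIANGLES,      9); /* -> 3                 */
    u_prims_for_vertices(PIPE_PRIM_TRIANGLE_STRIP, 9); /* -> 7 (count - 2)     */
    u_prims_for_vertices(PIPE_PRIM_LINES,          9); /* -> 4 (odd vertex dropped) */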
+
 static void
 panfrost_draw_vbo(
         struct pipe_context *pipe,
@@ -1622,8 +1596,8 @@ panfrost_draw_vbo(
         if (panfrost_scissor_culls_everything(ctx))
                 return;
 
-        ctx->payloads[PIPE_SHADER_VERTEX].draw_start = info->start;
-        ctx->payloads[PIPE_SHADER_FRAGMENT].draw_start = info->start;
+        ctx->payloads[PIPE_SHADER_VERTEX].offset_start = info->start;
+        ctx->payloads[PIPE_SHADER_FRAGMENT].offset_start = info->start;
 
         int mode = info->mode;
 
@@ -1660,8 +1634,10 @@ panfrost_draw_vbo(
 
         ctx->payloads[PIPE_SHADER_FRAGMENT].prefix.draw_mode = g2m_draw_mode(mode);
 
-        ctx->vertex_count = info->count;
+        /* Take into account a negative bias */
+        ctx->vertex_count = info->count + abs(info->index_bias);
         ctx->instance_count = info->instance_count;
+        ctx->active_prim = info->mode;
 
         /* For non-indexed draws, they're the same */
         unsigned vertex_count = ctx->vertex_count;
@@ -1693,6 +1669,8 @@ panfrost_draw_vbo(
                 draw_flags |= 0x800;
         }
 
+        panfrost_statistics_record(ctx, info);
+
         if (info->index_size) {
                 /* Calculate the min/max index used so we can figure out how
                  * many times to invoke the vertex shader */
@@ -1709,14 +1687,13 @@ panfrost_draw_vbo(
 
                 /* Use the corresponding values */
                 vertex_count = max_index - min_index + 1;
-                ctx->payloads[PIPE_SHADER_VERTEX].draw_start = min_index;
-                ctx->payloads[PIPE_SHADER_FRAGMENT].draw_start = min_index;
+                ctx->payloads[PIPE_SHADER_VERTEX].offset_start = min_index + info->index_bias;
+                ctx->payloads[PIPE_SHADER_FRAGMENT].offset_start = min_index + info->index_bias;
 
-                ctx->payloads[PIPE_SHADER_FRAGMENT].prefix.negative_start = -min_index;
+                ctx->payloads[PIPE_SHADER_FRAGMENT].prefix.offset_bias_correction = -min_index;
                 ctx->payloads[PIPE_SHADER_FRAGMENT].prefix.index_count = MALI_POSITIVE(info->count);
 
                 //assert(!info->restart_index); /* TODO: Research */
-                assert(!info->index_bias);
 
                 draw_flags |= panfrost_translate_index_size(info->index_size);
                 ctx->payloads[PIPE_SHADER_FRAGMENT].prefix.indices = panfrost_get_index_buffer_mapped(ctx, info);
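
A worked example of the indexed path, assuming MALI_POSITIVE() encodes
count - 1 as elsewhere in the driver: drawing 3 indices {5, 6, 7} with
index_bias == 2 yields

    vertex_count           = 7 - 5 + 1;        /* 3 vertices referenced      */
    offset_start           = 5 + 2;            /* min_index + index_bias     */
    offset_bias_correction = -5;               /* -min_index, rebase to zero */
    index_count            = MALI_POSITIVE(3); /* count - 1 == 2             */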
@@ -1724,7 +1701,7 @@ panfrost_draw_vbo(
                 /* Index count == vertex count, if no indexing is applied, as
                  * if it is internally indexed in the expected order */
 
-                ctx->payloads[PIPE_SHADER_FRAGMENT].prefix.negative_start = 0;
+                ctx->payloads[PIPE_SHADER_FRAGMENT].prefix.offset_bias_correction = 0;
                 ctx->payloads[PIPE_SHADER_FRAGMENT].prefix.index_count = MALI_POSITIVE(ctx->vertex_count);
 
                 /* Reverse index state */
@@ -1775,6 +1752,15 @@ panfrost_draw_vbo(
 
         /* Fire off the draw itself */
         panfrost_queue_draw(ctx);
+
+        /* Increment transform feedback offsets */
+
+        for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
+                unsigned output_count = u_stream_outputs_for_vertices(
+                                ctx->active_prim, ctx->vertex_count);
+
+                ctx->streamout.offsets[i] += output_count;
+        }
 }
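
Here u_stream_outputs_for_vertices() counts vertices after primitive assembly
(decomposed primitive count times vertices per primitive), which is what
transform feedback actually writes out:

    u_stream_outputs_for_vertices(PIPE_PRIM_TRIANGLES,      6); /* 6 = 2 tris * 3 */
    u_stream_outputs_for_vertices(PIPE_PRIM_TRIANGLE_STRIP, 5); /* 9 = 3 tris * 3 */
    u_stream_outputs_for_vertices(PIPE_PRIM_POINTS,         4); /* 4              */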
 
 /* CSO state */
@@ -1909,6 +1895,12 @@ panfrost_delete_shader_state(
                 DBG("Deleting TGSI shader leaks duplicated tokens\n");
         }
 
+        for (unsigned i = 0; i < cso->variant_count; ++i) {
+                struct panfrost_shader_state *shader_state = &cso->variants[i];
+                panfrost_bo_unreference(pctx->screen, shader_state->bo);
+                shader_state->bo = NULL;
+        }
+
         free(so);
 }
 
@@ -2033,6 +2025,45 @@ panfrost_variant_matches(
         return true;
 }
 
+/**
+ * Fix an uncompiled shader's stream output info, and produce a bitmask
+ * of which VARYING_SLOT_* are captured for stream output.
+ *
+ * Core Gallium stores output->register_index as a "slot" number, where
+ * slots are assigned consecutively to all outputs in info->outputs_written.
+ * This naive packing of outputs doesn't work for us: we too have slots, but
+ * the layout is defined by the compiled variant's varying map, which we
+ * won't have until compile time.  So, we remap these and simply store
+ * VARYING_SLOT_* in our copy's output->register_index fields.
+ *
+ * We then produce a bitmask of outputs which are used for SO.
+ *
+ * Implementation from iris.
+ */
+
+static uint64_t
+update_so_info(struct pipe_stream_output_info *so_info,
+               uint64_t outputs_written)
+{
+        uint64_t so_outputs = 0;
+        uint8_t reverse_map[64] = {};
+        unsigned slot = 0;
+
+        while (outputs_written)
+                reverse_map[slot++] = u_bit_scan64(&outputs_written);
+
+        for (unsigned i = 0; i < so_info->num_outputs; i++) {
+                struct pipe_stream_output *output = &so_info->output[i];
+
+                /* Map Gallium's condensed "slots" back to real VARYING_SLOT_* enums */
+                output->register_index = reverse_map[output->register_index];
+
+                so_outputs |= 1ull << output->register_index;
+        }
+
+        return so_outputs;
+}
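
A small worked example of the remap: if the shader writes only
VARYING_SLOT_POS and VARYING_SLOT_VAR0, then

    /* outputs_written = (1ull << VARYING_SLOT_POS) |
     *                   (1ull << VARYING_SLOT_VAR0)
     * reverse_map[0] = VARYING_SLOT_POS, reverse_map[1] = VARYING_SLOT_VAR0 */

so a stream output Gallium recorded with register_index == 1 is rewritten to
VARYING_SLOT_VAR0, and so_outputs ends up with exactly the POS and VAR0 bits
set.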
+
 static void
 panfrost_bind_shader_state(
         struct pipe_context *pctx,
@@ -2081,7 +2112,7 @@ panfrost_bind_shader_state(
                         }
                 }
 
-                variants->variants[variant].tripipe = malloc(sizeof(struct mali_shader_meta));
+                variants->variants[variant].tripipe = calloc(1, sizeof(struct mali_shader_meta));
 
         }
 
@@ -2094,14 +2125,24 @@ panfrost_bind_shader_state(
         /* We finally have a variant, so compile it */
 
         if (!shader_state->compiled) {
+                uint64_t outputs_written = 0;
+
                 panfrost_shader_compile(ctx, shader_state->tripipe,
                               variants->base.type,
                               variants->base.type == PIPE_SHADER_IR_NIR ?
                                       variants->base.ir.nir :
                                       variants->base.tokens,
-                                        tgsi_processor_to_shader_stage(type), shader_state);
+                                        tgsi_processor_to_shader_stage(type), shader_state,
+                                        &outputs_written);
 
                 shader_state->compiled = true;
+
+                /* Fixup the stream out information, since what Gallium returns
+                 * normally is mildly insane */
+
+                shader_state->stream_output = variants->base.stream_output;
+                shader_state->so_mask =
+                        update_so_info(&shader_state->stream_output, outputs_written);
         }
 }
 
@@ -2275,7 +2316,7 @@ panfrost_create_sampler_view(
                 .swizzle = panfrost_translate_swizzle_4(user_swizzle)
         };
 
-        texture_descriptor.nr_mipmap_levels = last_level - first_level;
+        texture_descriptor.levels = last_level - first_level;
 
         so->hw = texture_descriptor;
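
Note the renamed field still counts levels beyond the first, matching the
hardware's count-minus-one convention: a view with first_level == 2 and
last_level == 6 gets

    texture_descriptor.levels = 6 - 2; /* 4 extra levels, 5 addressable in total */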
 
@@ -2368,23 +2409,42 @@ panfrost_set_framebuffer_state(struct pipe_context *pctx,
         bool is_scanout = panfrost_is_scanout(ctx);
         bool has_draws = job->last_job.gpu;
 
-        if (!ctx->wallpaper_batch && (!is_scanout || has_draws)) {
-                panfrost_flush(pctx, NULL, PIPE_FLUSH_END_OF_FRAME);
+        /* Bail out early when the current and new states are the same. */
+        if (util_framebuffer_state_equal(&ctx->pipe_framebuffer, fb))
+                return;
+
+        /* The wallpaper logic sets a new FB state before doing the blit and
+         * restores the old one when it's done. Those FB states are reported
+         * as different because the surfaces they point to differ, but those
+         * surfaces actually reference the same cbufs/zbufs. In that case we
+         * definitely don't want new FB descs to be emitted/attached since
+         * the job is expected to be flushed just after the blit is done, so
+         * let's just copy the new state and return here.
+         */
+        if (ctx->wallpaper_batch) {
+                util_copy_framebuffer_state(&ctx->pipe_framebuffer, fb);
+                return;
         }
 
+        if (!is_scanout || has_draws)
+                panfrost_flush(pctx, NULL, PIPE_FLUSH_END_OF_FRAME);
+        else
+                assert(!ctx->payloads[PIPE_SHADER_VERTEX].postfix.framebuffer &&
+                       !ctx->payloads[PIPE_SHADER_FRAGMENT].postfix.framebuffer);
+
+        /* Invalidate the FBO job cache since we've just been assigned a new
+         * FB state.
+         */
+        ctx->job = NULL;
+
         util_copy_framebuffer_state(&ctx->pipe_framebuffer, fb);
 
         /* Given that we're rendering, we'd love to have compression */
         struct panfrost_screen *screen = pan_screen(ctx->base.screen);
 
         panfrost_hint_afbc(screen, &ctx->pipe_framebuffer);
-
-        if (screen->require_sfbd)
-                ctx->vt_framebuffer_sfbd = panfrost_emit_sfbd(ctx, ~0);
-        else
-                ctx->vt_framebuffer_mfbd = panfrost_emit_mfbd(ctx, ~0);
-
-        panfrost_attach_vt_framebuffer(ctx, false);
+        for (unsigned i = 0; i < PIPE_SHADER_TYPES; ++i)
+                ctx->payloads[i].postfix.framebuffer = 0;
 }
 
 static void *
@@ -2498,7 +2558,8 @@ static void
 panfrost_set_active_query_state(struct pipe_context *pipe,
                                 bool enable)
 {
-        //struct panfrost_context *panfrost = pan_context(pipe);
+        struct panfrost_context *ctx = pan_context(pipe);
+        ctx->active_queries = enable;
 }
 
 static void
@@ -2514,9 +2575,7 @@ panfrost_destroy(struct pipe_context *pipe)
                 util_blitter_destroy(panfrost->blitter_wallpaper);
 
         panfrost_drm_free_slab(screen, &panfrost->scratchpad);
-        panfrost_drm_free_slab(screen, &panfrost->shaders);
         panfrost_drm_free_slab(screen, &panfrost->tiler_heap);
-        panfrost_drm_free_slab(screen, &panfrost->tiler_polygon_list);
         panfrost_drm_free_slab(screen, &panfrost->tiler_dummy);
 
         ralloc_free(pipe);
@@ -2550,17 +2609,24 @@ panfrost_begin_query(struct pipe_context *pipe, struct pipe_query *q)
         switch (query->type) {
         case PIPE_QUERY_OCCLUSION_COUNTER:
         case PIPE_QUERY_OCCLUSION_PREDICATE:
-        case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: {
+        case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
                 /* Allocate a word for the query results to be stored */
                 query->transfer = panfrost_allocate_transient(ctx, sizeof(unsigned));
-
                 ctx->occlusion_query = query;
+                break;
 
+        /* Geometry statistics are computed in the driver. XXX: geom/tess
+         * shaders.. */
+
+        case PIPE_QUERY_PRIMITIVES_GENERATED:
+                query->start = ctx->prims_generated;
+                break;
+        case PIPE_QUERY_PRIMITIVES_EMITTED:
+                query->start = ctx->tf_prims_generated;
                 break;
-        }
 
         default:
-                DBG("Skipping query %d\n", query->type);
+                fprintf(stderr, "Skipping query %d\n", query->type);
                 break;
         }
 
@@ -2571,7 +2637,22 @@ static bool
 panfrost_end_query(struct pipe_context *pipe, struct pipe_query *q)
 {
         struct panfrost_context *ctx = pan_context(pipe);
-        ctx->occlusion_query = NULL;
+        struct panfrost_query *query = (struct panfrost_query *) q;
+
+        switch (query->type) {
+        case PIPE_QUERY_OCCLUSION_COUNTER:
+        case PIPE_QUERY_OCCLUSION_PREDICATE:
+        case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
+                ctx->occlusion_query = NULL;
+                break;
+        case PIPE_QUERY_PRIMITIVES_GENERATED:
+                query->end = ctx->prims_generated;
+                break;
+        case PIPE_QUERY_PRIMITIVES_EMITTED:
+                query->end = ctx->tf_prims_generated;
+                break;
+        }
+
         return true;
 }
 
@@ -2581,18 +2662,16 @@ panfrost_get_query_result(struct pipe_context *pipe,
                           bool wait,
                           union pipe_query_result *vresult)
 {
-        /* STUB */
         struct panfrost_query *query = (struct panfrost_query *) q;
 
-        /* We need to flush out the jobs to actually run the counter, TODO
-         * check wait, TODO wallpaper after if needed */
-
-        panfrost_flush(pipe, NULL, PIPE_FLUSH_END_OF_FRAME);
 
         switch (query->type) {
         case PIPE_QUERY_OCCLUSION_COUNTER:
         case PIPE_QUERY_OCCLUSION_PREDICATE:
-        case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: {
+        case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
+                /* Flush first */
+                panfrost_flush(pipe, NULL, PIPE_FLUSH_END_OF_FRAME);
+
                 /* Read back the query results */
                 unsigned *result = (unsigned *) query->transfer.cpu;
                 unsigned passed = *result;
@@ -2604,7 +2683,13 @@ panfrost_get_query_result(struct pipe_context *pipe,
                 }
 
                 break;
-        }
+
+        case PIPE_QUERY_PRIMITIVES_GENERATED:
+        case PIPE_QUERY_PRIMITIVES_EMITTED:
+                panfrost_flush(pipe, NULL, PIPE_FLUSH_END_OF_FRAME);
+                vresult->u64 = query->end - query->start;
+                break;
+
         default:
                 DBG("Skipped query get %d\n", query->type);
                 break;
@@ -2650,7 +2735,22 @@ panfrost_set_stream_output_targets(struct pipe_context *pctx,
                                    struct pipe_stream_output_target **targets,
                                    const unsigned *offsets)
 {
-        /* STUB */
+        struct panfrost_context *ctx = pan_context(pctx);
+        struct panfrost_streamout *so = &ctx->streamout;
+
+        assert(num_targets <= ARRAY_SIZE(so->targets));
+
+        for (unsigned i = 0; i < num_targets; i++) {
+                if (offsets[i] != -1)
+                        so->offsets[i] = offsets[i];
+
+                pipe_so_target_reference(&so->targets[i], targets[i]);
+        }
+
+        /* Drop references only to leftover targets beyond the new count;
+         * starting at num_targets keeps the references taken just above. */
+        for (unsigned i = num_targets; i < so->num_targets; i++)
+                pipe_so_target_reference(&so->targets[i], NULL);
+
+        so->num_targets = num_targets;
 }
 
 static void
@@ -2660,9 +2760,7 @@ panfrost_setup_hardware(struct panfrost_context *ctx)
         struct panfrost_screen *screen = pan_screen(gallium->screen);
 
         panfrost_drm_allocate_slab(screen, &ctx->scratchpad, 64*4, false, 0, 0, 0);
-        panfrost_drm_allocate_slab(screen, &ctx->shaders, 4096, true, PAN_ALLOCATE_EXECUTE, 0, 0);
         panfrost_drm_allocate_slab(screen, &ctx->tiler_heap, 4096, false, PAN_ALLOCATE_INVISIBLE | PAN_ALLOCATE_GROWABLE, 1, 128);
-        panfrost_drm_allocate_slab(screen, &ctx->tiler_polygon_list, 128*128, false, PAN_ALLOCATE_INVISIBLE | PAN_ALLOCATE_GROWABLE, 1, 128);
         panfrost_drm_allocate_slab(screen, &ctx->tiler_dummy, 1, false, PAN_ALLOCATE_INVISIBLE, 0, 0);
 }