panfrost/midgard: Add ult/ule ops
[mesa.git] / src / gallium / drivers / panfrost / pan_context.c
index 4987d92d01ccd21fdd468ef24052949419acb97c..fd1fa7f328bd5ad52666708d26cd5503b5e20e3e 100644 (file)
@@ -50,32 +50,6 @@ extern const char *pan_counters_base;
 /* Do not actually send anything to the GPU; merely generate the cmdstream as fast as possible. Disables framebuffer writes */
 //#define DRY_RUN
 
-#define SET_BIT(lval, bit, cond) \
-       if (cond) \
-               lval |= (bit); \
-       else \
-               lval &= ~(bit);
-
-/* TODO: Sample size, etc */
-
-static void
-panfrost_set_framebuffer_msaa(struct panfrost_context *ctx, bool enabled)
-{
-        SET_BIT(ctx->fragment_shader_core.unknown2_3, MALI_HAS_MSAA, enabled);
-        SET_BIT(ctx->fragment_shader_core.unknown2_4, MALI_NO_MSAA, !enabled);
-
-        if (ctx->require_sfbd) {
-                SET_BIT(ctx->fragment_sfbd.format, MALI_FRAMEBUFFER_MSAA_A | MALI_FRAMEBUFFER_MSAA_B, enabled);
-        } else {
-                SET_BIT(ctx->fragment_rts[0].format.flags, MALI_MFBD_FORMAT_MSAA, enabled);
-
-                SET_BIT(ctx->fragment_mfbd.unk1, (1 << 4) | (1 << 1), enabled);
-
-                /* XXX */
-                ctx->fragment_mfbd.rt_count_2 = enabled ? 4 : 1;
-        }
-}
-
 /* AFBC is enabled on a per-resource basis (AFBC enabling is theoretically
  * indepdent between color buffers and depth/stencil). To enable, we allocate
  * the AFBC metadata buffer and mark that it is enabled. We do -not- actually
@@ -96,7 +70,7 @@ panfrost_enable_afbc(struct panfrost_context *ctx, struct panfrost_resource *rsr
         int tile_w = (rsrc->base.width0 + (MALI_TILE_LENGTH - 1)) >> MALI_TILE_SHIFT;
         int tile_h = (rsrc->base.height0 + (MALI_TILE_LENGTH - 1)) >> MALI_TILE_SHIFT;
         int bytes_per_pixel = util_format_get_blocksize(rsrc->base.format);
-        int stride = bytes_per_pixel * rsrc->base.width0; /* TODO: Alignment? */
+        int stride = bytes_per_pixel * ALIGN(rsrc->base.width0, 16);
 
         stride *= 2;  /* TODO: Should this be carried over? */
         int main_size = stride * rsrc->base.height0;
@@ -131,83 +105,6 @@ panfrost_enable_checksum(struct panfrost_context *ctx, struct panfrost_resource
         rsrc->bo->has_checksum = true;
 }
 
-/* ..by contrast, this routine runs for every FRAGMENT job, but does no
- * allocation. AFBC is enabled on a per-surface basis */
-
-static void
-panfrost_set_fragment_afbc(struct panfrost_context *ctx)
-{
-        for (int cb = 0; cb < ctx->pipe_framebuffer.nr_cbufs; ++cb) {
-                struct panfrost_resource *rsrc = (struct panfrost_resource *) ctx->pipe_framebuffer.cbufs[cb]->texture;
-
-                /* Non-AFBC is the default */
-                if (rsrc->bo->layout != PAN_AFBC)
-                        continue;
-
-                if (ctx->require_sfbd) {
-                        DBG("Color AFBC not supported on SFBD\n");
-                        assert(0);
-                }
-
-                /* Enable AFBC for the render target */
-                ctx->fragment_rts[0].afbc.metadata = rsrc->bo->afbc_slab.gpu;
-                ctx->fragment_rts[0].afbc.stride = 0;
-                ctx->fragment_rts[0].afbc.unk = 0x30009;
-
-                ctx->fragment_rts[0].format.flags |= MALI_MFBD_FORMAT_AFBC;
-
-                /* Point rendering to our special framebuffer */
-                ctx->fragment_rts[0].framebuffer = rsrc->bo->afbc_slab.gpu + rsrc->bo->afbc_metadata_size;
-
-                /* WAT? Stride is diff from the scanout case */
-                ctx->fragment_rts[0].framebuffer_stride = ctx->pipe_framebuffer.width * 2 * 4;
-        }
-
-        /* Enable depth/stencil AFBC for the framebuffer (not the render target) */
-        if (ctx->pipe_framebuffer.zsbuf) {
-                struct panfrost_resource *rsrc = (struct panfrost_resource *) ctx->pipe_framebuffer.zsbuf->texture;
-
-                if (rsrc->bo->layout == PAN_AFBC) {
-                        if (ctx->require_sfbd) {
-                                DBG("Depth AFBC not supported on SFBD\n");
-                                assert(0);
-                        }
-
-                        ctx->fragment_mfbd.unk3 |= MALI_MFBD_EXTRA;
-
-                        ctx->fragment_extra.ds_afbc.depth_stencil_afbc_metadata = rsrc->bo->afbc_slab.gpu;
-                        ctx->fragment_extra.ds_afbc.depth_stencil_afbc_stride = 0;
-
-                        ctx->fragment_extra.ds_afbc.depth_stencil = rsrc->bo->afbc_slab.gpu + rsrc->bo->afbc_metadata_size;
-
-                        ctx->fragment_extra.ds_afbc.zero1 = 0x10009;
-                        ctx->fragment_extra.ds_afbc.padding = 0x1000;
-
-                        ctx->fragment_extra.unk = 0x435; /* General 0x400 in all unks. 0x5 for depth/stencil. 0x10 for AFBC encoded depth stencil. Unclear where the 0x20 is from */
-
-                        ctx->fragment_mfbd.unk3 |= 0x400;
-                }
-        }
-
-        /* For the special case of a depth-only FBO, we need to attach a dummy render target */
-
-        if (ctx->pipe_framebuffer.nr_cbufs == 0) {
-                if (ctx->require_sfbd) {
-                        DBG("Depth-only FBO not supported on SFBD\n");
-                        assert(0);
-                }
-
-                struct mali_rt_format null_rt = {
-                        .unk1 = 0x4000000,
-                        .unk4 = 0x8
-                };
-
-                ctx->fragment_rts[0].format = null_rt;
-                ctx->fragment_rts[0].framebuffer = 0;
-                ctx->fragment_rts[0].framebuffer_stride = 0;
-        }
-}
-
 /* Framebuffer descriptor */
 
 static void
@@ -224,7 +121,7 @@ panfrost_set_framebuffer_resolution(struct mali_single_framebuffer *fb, int w, i
         fb->resolution_check = ((w + h) / 3) << 4;
 }
 
-static struct mali_single_framebuffer
+struct mali_single_framebuffer
 panfrost_emit_sfbd(struct panfrost_context *ctx)
 {
         struct mali_single_framebuffer framebuffer = {
@@ -244,7 +141,7 @@ panfrost_emit_sfbd(struct panfrost_context *ctx)
         return framebuffer;
 }
 
-static struct bifrost_framebuffer
+struct bifrost_framebuffer
 panfrost_emit_mfbd(struct panfrost_context *ctx)
 {
         struct bifrost_framebuffer framebuffer = {
@@ -311,7 +208,7 @@ panfrost_emit_mfbd(struct panfrost_context *ctx)
 
 /* Are we currently rendering to the screen (rather than an FBO)? */
 
-static bool
+bool
 panfrost_is_scanout(struct panfrost_context *ctx)
 {
         /* If there is no color buffer, it's an FBO */
@@ -327,71 +224,6 @@ panfrost_is_scanout(struct panfrost_context *ctx)
                ctx->pipe_framebuffer.cbufs[0]->texture->bind & PIPE_BIND_SHARED;
 }
 
-/* The above function is for generalised fbd emission, used in both fragment as
- * well as vertex/tiler payloads. This payload is specific to fragment
- * payloads. */
-
-static void
-panfrost_new_frag_framebuffer(struct panfrost_context *ctx)
-{
-        mali_ptr framebuffer;
-        int stride;
-
-        if (ctx->pipe_framebuffer.nr_cbufs > 0) {
-               framebuffer = ((struct panfrost_resource *) ctx->pipe_framebuffer.cbufs[0]->texture)->bo->gpu[0];
-                stride = util_format_get_stride(ctx->pipe_framebuffer.cbufs[0]->format, ctx->pipe_framebuffer.width);
-        } else {
-                /* Depth-only framebuffer -> dummy RT */
-                framebuffer = 0;
-                stride = 0;
-        }
-
-        /* The default is upside down from OpenGL's perspective. */
-        if (panfrost_is_scanout(ctx)) {
-                framebuffer += stride * (ctx->pipe_framebuffer.height - 1);
-                stride = -stride;
-        }
-
-        if (ctx->require_sfbd) {
-                struct mali_single_framebuffer fb = panfrost_emit_sfbd(ctx);
-
-                fb.framebuffer = framebuffer;
-                fb.stride = stride;
-
-                fb.format = 0xb84e0281; /* RGB32, no MSAA */
-                memcpy(&ctx->fragment_sfbd, &fb, sizeof(fb));
-        } else {
-                struct bifrost_framebuffer fb = panfrost_emit_mfbd(ctx);
-
-                /* XXX: MRT case */
-                fb.rt_count_2 = 1;
-                fb.unk3 = 0x100;
-
-                /* By default, Gallium seems to need a BGR framebuffer */
-                unsigned char bgra[4] = {
-                        PIPE_SWIZZLE_Z, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_X, PIPE_SWIZZLE_W
-                };
-
-                struct bifrost_render_target rt = {
-                        .format = {
-                                .unk1 = 0x4000000,
-                                .unk2 = 0x1,
-                                .nr_channels = MALI_POSITIVE(4),
-                                .flags = 0x444,
-                                .swizzle = panfrost_translate_swizzle_4(bgra),
-                                .unk4 = 0x8
-                        },
-                        .framebuffer = framebuffer,
-                        .framebuffer_stride = (stride / 16) & 0xfffffff,
-                };
-
-                memcpy(&ctx->fragment_rts[0], &rt, sizeof(rt));
-
-                memset(&ctx->fragment_extra, 0, sizeof(ctx->fragment_extra));
-                memcpy(&ctx->fragment_mfbd, &fb, sizeof(fb));
-        }
-}
-
 /* Maps float 0.0-1.0 to int 0x00-0xFF */
 static uint8_t
 normalised_float_to_u8(float f)
@@ -399,85 +231,6 @@ normalised_float_to_u8(float f)
         return (uint8_t) (int) (f * 255.0f);
 }
 
-static void
-panfrost_clear_sfbd(struct panfrost_job *job)
-{
-        struct panfrost_context *ctx = job->ctx;
-        struct mali_single_framebuffer *sfbd = &ctx->fragment_sfbd;
-
-        if (job->clear & PIPE_CLEAR_COLOR) {
-                sfbd->clear_color_1 = job->clear_color;
-                sfbd->clear_color_2 = job->clear_color;
-                sfbd->clear_color_3 = job->clear_color;
-                sfbd->clear_color_4 = job->clear_color;
-        }
-
-        if (job->clear & PIPE_CLEAR_DEPTH) {
-                sfbd->clear_depth_1 = job->clear_depth;
-                sfbd->clear_depth_2 = job->clear_depth;
-                sfbd->clear_depth_3 = job->clear_depth;
-                sfbd->clear_depth_4 = job->clear_depth;
-
-                sfbd->depth_buffer = ctx->depth_stencil_buffer.gpu;
-                sfbd->depth_buffer_enable = MALI_DEPTH_STENCIL_ENABLE;
-        }
-
-        if (job->clear & PIPE_CLEAR_STENCIL) {
-                sfbd->clear_stencil = job->clear_stencil;
-
-                sfbd->stencil_buffer = ctx->depth_stencil_buffer.gpu;
-                sfbd->stencil_buffer_enable = MALI_DEPTH_STENCIL_ENABLE;
-        }
-
-        /* Set flags based on what has been cleared, for the SFBD case */
-        /* XXX: What do these flags mean? */
-        int clear_flags = 0x101100;
-
-        if (!(job->clear & ~(PIPE_CLEAR_COLOR | PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL))) {
-                /* On a tiler like this, it's fastest to clear all three buffers at once */
-
-                clear_flags |= MALI_CLEAR_FAST;
-        } else {
-                clear_flags |= MALI_CLEAR_SLOW;
-
-                if (job->clear & PIPE_CLEAR_STENCIL)
-                        clear_flags |= MALI_CLEAR_SLOW_STENCIL;
-        }
-
-        sfbd->clear_flags = clear_flags;
-}
-
-static void
-panfrost_clear_mfbd(struct panfrost_job *job)
-{
-        struct panfrost_context *ctx = job->ctx;
-        struct bifrost_render_target *buffer_color = &ctx->fragment_rts[0];
-        struct bifrost_framebuffer *buffer_ds = &ctx->fragment_mfbd;
-
-        if (job->clear & PIPE_CLEAR_COLOR) {
-                buffer_color->clear_color_1 = job->clear_color;
-                buffer_color->clear_color_2 = job->clear_color;
-                buffer_color->clear_color_3 = job->clear_color;
-                buffer_color->clear_color_4 = job->clear_color;
-        }
-
-        if (job->clear & PIPE_CLEAR_DEPTH) {
-                buffer_ds->clear_depth = job->clear_depth;
-        }
-
-        if (job->clear & PIPE_CLEAR_STENCIL) {
-                buffer_ds->clear_stencil = job->clear_stencil;
-        }
-
-        if (job->clear & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) {
-                /* Setup combined 24/8 depth/stencil */
-                ctx->fragment_mfbd.unk3 |= MALI_MFBD_EXTRA;
-                ctx->fragment_extra.unk = 0x405;
-                ctx->fragment_extra.ds_linear.depth = ctx->depth_stencil_buffer.gpu;
-                ctx->fragment_extra.ds_linear.depth_stride = ctx->pipe_framebuffer.width * 4;
-        }
-}
-
 static void
 panfrost_clear(
         struct pipe_context *pipe,
@@ -558,39 +311,6 @@ panfrost_attach_vt_framebuffer(struct panfrost_context *ctx)
         ctx->payload_tiler.postfix.framebuffer = framebuffer;
 }
 
-static void
-panfrost_viewport(struct panfrost_context *ctx,
-                  float depth_clip_near,
-                  float depth_clip_far,
-                  int viewport_x0, int viewport_y0,
-                  int viewport_x1, int viewport_y1)
-{
-        /* Clip bounds are encoded as floats. The viewport itself is encoded as
-         * (somewhat) asymmetric ints. */
-
-        struct mali_viewport ret = {
-                /* By default, do no viewport clipping, i.e. clip to (-inf,
-                 * inf) in each direction. Clipping to the viewport in theory
-                 * should work, but in practice causes issues when we're not
-                 * explicitly trying to scissor */
-
-                .clip_minx = -inff,
-                .clip_miny = -inff,
-                .clip_maxx = inff,
-                .clip_maxy = inff,
-
-                /* We always perform depth clipping (TODO: Can this be disabled?) */
-
-                .clip_minz = depth_clip_near,
-                .clip_maxz = depth_clip_far,
-
-                .viewport0 = { viewport_x0, viewport_y0 },
-                .viewport1 = { MALI_POSITIVE(viewport_x1), MALI_POSITIVE(viewport_y1) },
-        };
-
-        memcpy(ctx->viewport, &ret, sizeof(ret));
-}
-
 /* Reset per-frame context, called on context initialisation as well as after
  * flushing a frame */
 
@@ -609,8 +329,6 @@ panfrost_invalidate_frame(struct panfrost_context *ctx)
         else
                 ctx->vt_framebuffer_mfbd = panfrost_emit_mfbd(ctx);
 
-        panfrost_new_frag_framebuffer(ctx);
-
         /* Reset varyings allocated */
         ctx->varying_height = 0;
 
@@ -662,11 +380,6 @@ panfrost_emit_tiler_payload(struct panfrost_context *ctx)
                 },
         };
 
-        /* Reserve the viewport */
-        struct panfrost_transfer t = panfrost_allocate_chunk(ctx, sizeof(struct mali_viewport), HEAP_DESCRIPTOR);
-        ctx->viewport = (struct mali_viewport *) t.cpu;
-        payload.postfix.viewport = t.gpu;
-
         memcpy(&ctx->payload_tiler, &payload, sizeof(payload));
 }
 
@@ -934,100 +647,88 @@ panfrost_set_value_job(struct panfrost_context *ctx)
         ctx->set_value_job = transfer.gpu;
 }
 
-/* Generate a fragment job. This should be called once per frame. (According to
- * presentations, this is supposed to correspond to eglSwapBuffers) */
-
-mali_ptr
-panfrost_fragment_job(struct panfrost_context *ctx)
+static mali_ptr
+panfrost_emit_varyings(
+                struct panfrost_context *ctx,
+                union mali_attr *slot,
+                unsigned stride,
+                unsigned count)
 {
-        struct panfrost_job *job = panfrost_get_job_for_fbo(ctx);
+        mali_ptr varying_address = ctx->varying_mem.gpu + ctx->varying_height;
 
-        /* Actualize the clear late; TODO: Fix order dependency between clear
-         * and afbc */
+        /* Fill out the descriptor */
+        slot->elements = varying_address | MALI_ATTR_LINEAR;
+        slot->stride = stride;
+        slot->size = stride * count;
 
-        if (ctx->require_sfbd) {
-                panfrost_clear_sfbd(job);
-        } else {
-                panfrost_clear_mfbd(job);
-        }
+        ctx->varying_height += ALIGN(slot->size, 64);
+        assert(ctx->varying_height < ctx->varying_mem.size);
 
-        panfrost_set_fragment_afbc(ctx);
+        return varying_address;
+}
 
-        if (ctx->pipe_framebuffer.nr_cbufs == 1) {
-                struct panfrost_resource *rsrc = (struct panfrost_resource *) ctx->pipe_framebuffer.cbufs[0]->texture;
+static void
+panfrost_emit_point_coord(union mali_attr *slot)
+{
+        slot->elements = MALI_VARYING_POINT_COORD | MALI_ATTR_LINEAR;
+        slot->stride = slot->size = 0;
+}
 
-                if (rsrc->bo->has_checksum) {
-                        if (ctx->require_sfbd) {
-                                DBG("Checksumming not supported on SFBD\n");
-                                assert(0);
-                        }
+static void
+panfrost_emit_varying_descriptor(
+                struct panfrost_context *ctx,
+                unsigned invocation_count)
+{
+        /* Load the shaders */
 
-                        int stride = util_format_get_stride(rsrc->base.format, rsrc->base.width0);
+        struct panfrost_shader_state *vs = &ctx->vs->variants[ctx->vs->active_variant];
+        struct panfrost_shader_state *fs = &ctx->fs->variants[ctx->fs->active_variant];
 
-                        ctx->fragment_mfbd.unk3 |= MALI_MFBD_EXTRA;
-                        ctx->fragment_extra.unk |= 0x420;
-                        ctx->fragment_extra.checksum_stride = rsrc->bo->checksum_stride;
-                        ctx->fragment_extra.checksum = rsrc->bo->gpu[0] + stride * rsrc->base.height0;
-                }
-        }
+        /* Allocate the varying descriptor */
 
-        /* The frame is complete and therefore the framebuffer descriptor is
-         * ready for linkage and upload */
+        size_t vs_size = sizeof(struct mali_attr_meta) * vs->tripipe->varying_count;
+        size_t fs_size = sizeof(struct mali_attr_meta) * fs->tripipe->varying_count;
 
-        size_t sz = ctx->require_sfbd ? sizeof(struct mali_single_framebuffer) : (sizeof(struct bifrost_framebuffer) + sizeof(struct bifrost_fb_extra) + sizeof(struct bifrost_render_target) * 1);
-        struct panfrost_transfer fbd_t = panfrost_allocate_transient(ctx, sz);
-        off_t offset = 0;
+        struct panfrost_transfer trans = panfrost_allocate_transient(ctx,
+                        vs_size + fs_size);
 
-        if (ctx->require_sfbd) {
-                /* Upload just the SFBD all at once */
-                memcpy(fbd_t.cpu, &ctx->fragment_sfbd, sizeof(ctx->fragment_sfbd));
-                offset += sizeof(ctx->fragment_sfbd);
-        } else {
-                /* Upload the MFBD header */
-                memcpy(fbd_t.cpu, &ctx->fragment_mfbd, sizeof(ctx->fragment_mfbd));
-                offset += sizeof(ctx->fragment_mfbd);
-
-                /* Upload extra framebuffer info if necessary */
-                if (ctx->fragment_mfbd.unk3 & MALI_MFBD_EXTRA) {
-                        memcpy(fbd_t.cpu + offset, &ctx->fragment_extra, sizeof(struct bifrost_fb_extra));
-                        offset += sizeof(struct bifrost_fb_extra);
-                }
+        memcpy(trans.cpu, vs->varyings, vs_size);
+        memcpy(trans.cpu + vs_size, fs->varyings, fs_size);
 
-                /* Upload (single) render target */
-                memcpy(fbd_t.cpu + offset, &ctx->fragment_rts[0], sizeof(struct bifrost_render_target) * 1);
-        }
+        ctx->payload_vertex.postfix.varying_meta = trans.gpu;
+        ctx->payload_tiler.postfix.varying_meta = trans.gpu + vs_size;
 
-        /* Generate the fragment (frame) job */
+        /* Buffer indices must be in this order per our convention */
+        union mali_attr varyings[PIPE_MAX_ATTRIBS];
+        unsigned idx = 0;
 
-        struct mali_job_descriptor_header header = {
-                .job_type = JOB_TYPE_FRAGMENT,
-                .job_index = 1,
-#ifdef __LP64__
-                .job_descriptor_size = 1
-#endif
-        };
+        /* General varyings -- use the VS's, since those are more likely to be
+         * accurate on desktop */
 
-        struct mali_payload_fragment payload = {
-                .min_tile_coord = MALI_COORDINATE_TO_TILE_MIN(0, 0),
-                .max_tile_coord = MALI_COORDINATE_TO_TILE_MAX(ctx->pipe_framebuffer.width, ctx->pipe_framebuffer.height),
-                .framebuffer = fbd_t.gpu | (ctx->require_sfbd ? MALI_SFBD : MALI_MFBD),
-        };
+        panfrost_emit_varyings(ctx, &varyings[idx++],
+                        vs->general_varying_stride, invocation_count);
 
-       if (!ctx->require_sfbd && ctx->fragment_mfbd.unk3 & MALI_MFBD_EXTRA) {
-               /* Signal that there is an extra portion of the framebuffer
-                * descriptor */
+        /* fp32 vec4 gl_Position */
+        ctx->payload_tiler.postfix.position_varying =
+                panfrost_emit_varyings(ctx, &varyings[idx++],
+                                sizeof(float) * 4, invocation_count);
 
-               payload.framebuffer |= 2;
-       }
 
-        /* Normally, there should be no padding. However, fragment jobs are
-         * shared with 64-bit Bifrost systems, and accordingly there is 4-bytes
-         * of zero padding in between. */
+        if (vs->writes_point_size || fs->reads_point_coord) {
+                /* fp16 vec1 gl_PointSize */
+                ctx->payload_tiler.primitive_size.pointer =
+                        panfrost_emit_varyings(ctx, &varyings[idx++],
+                                        2, invocation_count);
+        }
+
+        if (fs->reads_point_coord) {
+                /* Special descriptor */
+                panfrost_emit_point_coord(&varyings[idx++]);
+        }
 
-        struct panfrost_transfer transfer = panfrost_allocate_transient(ctx, sizeof(header) + sizeof(payload));
-        memcpy(transfer.cpu, &header, sizeof(header));
-        memcpy(transfer.cpu + sizeof(header), &payload, sizeof(payload));
-        return transfer.gpu;
+        mali_ptr varyings_p = panfrost_upload_transient(ctx, &varyings, idx * sizeof(union mali_attr));
+        ctx->payload_vertex.postfix.varyings = varyings_p;
+        ctx->payload_tiler.postfix.varyings = varyings_p;
 }
 
 /* Emits attributes and varying descriptors, which should be called every draw,
@@ -1038,7 +739,6 @@ panfrost_emit_vertex_data(struct panfrost_context *ctx)
 {
         /* TODO: Only update the dirtied buffers */
         union mali_attr attrs[PIPE_MAX_ATTRIBS];
-        union mali_attr varyings[PIPE_MAX_ATTRIBS];
 
         unsigned invocation_count = MALI_NEGATIVE(ctx->payload_tiler.prefix.invocation_count);
 
@@ -1081,39 +781,9 @@ panfrost_emit_vertex_data(struct panfrost_context *ctx)
                 }
         }
 
-        struct panfrost_varyings *vars = &ctx->vs->variants[ctx->vs->active_variant].varyings;
-
-        for (int i = 0; i < vars->varying_buffer_count; ++i) {
-                mali_ptr varying_address = ctx->varying_mem.gpu + ctx->varying_height;
-
-                varyings[i].elements = varying_address | 1;
-                varyings[i].stride = vars->varyings_stride[i];
-                varyings[i].size = vars->varyings_stride[i] * invocation_count;
-
-                /* If this varying has to be linked somewhere, do it now. See
-                 * pan_assemble.c for the indices. TODO: Use a more generic
-                 * linking interface */
-
-                if (i == 1) {
-                        /* gl_Position */
-                        ctx->payload_tiler.postfix.position_varying = varying_address;
-                } else if (i == 2) {
-                        /* gl_PointSize */
-                        ctx->payload_tiler.primitive_size.pointer = varying_address;
-                }
-
-                /* Varyings appear to need 64-byte alignment */
-                ctx->varying_height += ALIGN(varyings[i].size, 64);
-
-                /* Ensure that we fit */
-                assert(ctx->varying_height < ctx->varying_mem.size);
-        }
-
         ctx->payload_vertex.postfix.attributes = panfrost_upload_transient(ctx, attrs, ctx->vertex_buffer_count * sizeof(union mali_attr));
 
-        mali_ptr varyings_p = panfrost_upload_transient(ctx, &varyings, vars->varying_buffer_count * sizeof(union mali_attr));
-        ctx->payload_vertex.postfix.varyings = varyings_p;
-        ctx->payload_tiler.postfix.varyings = varyings_p;
+        panfrost_emit_varying_descriptor(ctx, invocation_count);
 }
 
 /* Go through dirty flags and actualise them in the cmdstream. */
@@ -1121,15 +791,30 @@ panfrost_emit_vertex_data(struct panfrost_context *ctx)
 void
 panfrost_emit_for_draw(struct panfrost_context *ctx, bool with_vertex_data)
 {
+        struct panfrost_job *job = panfrost_get_job_for_fbo(ctx);
+
         if (with_vertex_data) {
                 panfrost_emit_vertex_data(ctx);
         }
 
+        bool msaa = ctx->rasterizer->base.multisample;
+
         if (ctx->dirty & PAN_DIRTY_RASTERIZER) {
                 ctx->payload_tiler.gl_enables = ctx->rasterizer->tiler_gl_enables;
-                panfrost_set_framebuffer_msaa(ctx, ctx->rasterizer->base.multisample);
+
+                /* TODO: Sample size */
+                SET_BIT(ctx->fragment_shader_core.unknown2_3, MALI_HAS_MSAA, msaa);
+                SET_BIT(ctx->fragment_shader_core.unknown2_4, MALI_NO_MSAA, !msaa);
         }
 
+        /* Enable job requirements at draw-time */
+
+        if (msaa)
+                job->requirements |= PAN_REQ_MSAA;
+
+        if (ctx->depth_stencil->depth.writemask)
+                job->requirements |= PAN_REQ_DEPTH_WRITE;
+
         if (ctx->occlusion_query) {
                 ctx->payload_tiler.gl_enables |= MALI_OCCLUSION_QUERY | MALI_OCCLUSION_PRECISE;
                 ctx->payload_tiler.postfix.occlusion_counter = ctx->occlusion_query->transfer.gpu;
@@ -1141,6 +826,7 @@ panfrost_emit_for_draw(struct panfrost_context *ctx, bool with_vertex_data)
                 struct panfrost_shader_state *vs = &ctx->vs->variants[ctx->vs->active_variant];
 
                 /* Late shader descriptor assignments */
+
                 vs->tripipe->texture_count = ctx->sampler_view_count[PIPE_SHADER_VERTEX];
                 vs->tripipe->sampler_count = ctx->sampler_count[PIPE_SHADER_VERTEX];
 
@@ -1148,15 +834,6 @@ panfrost_emit_for_draw(struct panfrost_context *ctx, bool with_vertex_data)
                 vs->tripipe->midgard1.unknown1 = 0x2201;
 
                 ctx->payload_vertex.postfix._shader_upper = vs->tripipe_gpu >> 4;
-
-                /* Varying descriptor is tied to the vertex shader. Also the
-                 * fragment shader, I suppose, but it's generated with the
-                 * vertex shader so */
-
-                struct panfrost_varyings *varyings = &ctx->vs->variants[ctx->vs->active_variant].varyings;
-
-                ctx->payload_vertex.postfix.varying_meta = varyings->varyings_descriptor;
-                ctx->payload_tiler.postfix.varying_meta = varyings->varyings_descriptor_fragment;
         }
 
         if (ctx->dirty & (PAN_DIRTY_RASTERIZER | PAN_DIRTY_VS)) {
@@ -1382,12 +1059,21 @@ panfrost_emit_for_draw(struct panfrost_context *ctx, bool with_vertex_data)
         /* Generate the viewport vector of the form: <width/2, height/2, centerx, centery> */
         const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
 
+        /* For flipped-Y buffers (signaled by negative scale), the translate is
+         * flipped as well */
+
+        bool invert_y = vp->scale[1] < 0.0;
+        float translate_y = vp->translate[1];
+
+        if (invert_y)
+                translate_y = ctx->pipe_framebuffer.height - translate_y;
+
         float viewport_vec4[] = {
                 vp->scale[0],
                 fabsf(vp->scale[1]),
 
                 vp->translate[0],
-                /* -1.0 * vp->translate[1] */ fabs(1.0 * vp->scale[1]) /* XXX */
+                translate_y
         };
 
         for (int i = 0; i < PIPE_SHADER_TYPES; ++i) {
@@ -1449,6 +1135,54 @@ panfrost_emit_for_draw(struct panfrost_context *ctx, bool with_vertex_data)
                 }
         }
 
+        /* TODO: Upload the viewport somewhere more appropriate */
+
+        /* Clip bounds are encoded as floats. The viewport itself is encoded as
+         * (somewhat) asymmetric ints. */
+        const struct pipe_scissor_state *ss = &ctx->scissor;
+
+        struct mali_viewport view = {
+                /* By default, do no viewport clipping, i.e. clip to (-inf,
+                 * inf) in each direction. Clipping to the viewport in theory
+                 * should work, but in practice causes issues when we're not
+                 * explicitly trying to scissor */
+
+                .clip_minx = -inff,
+                .clip_miny = -inff,
+                .clip_maxx = inff,
+                .clip_maxy = inff,
+
+                .clip_minz = 0.0,
+                .clip_maxz = 1.0,
+        };
+
+        /* Always scissor to the viewport by default. */
+        view.viewport0[0] = (int) (vp->translate[0] - vp->scale[0]);
+        view.viewport1[0] = MALI_POSITIVE((int) (vp->translate[0] + vp->scale[0]));
+
+        view.viewport0[1] = (int) (translate_y - fabs(vp->scale[1]));
+        view.viewport1[1] = MALI_POSITIVE((int) (translate_y + fabs(vp->scale[1])));
+
+        if (ss && ctx->rasterizer && ctx->rasterizer->base.scissor) {
+                /* Invert scissor if needed */
+                unsigned miny = invert_y ?
+                        ctx->pipe_framebuffer.height - ss->maxy : ss->miny;
+
+                unsigned maxy = invert_y ?
+                        ctx->pipe_framebuffer.height - ss->miny : ss->maxy;
+
+                /* Set the actual scissor */
+                view.viewport0[0] = ss->minx;
+                view.viewport0[1] = miny;
+                view.viewport1[0] = MALI_POSITIVE(ss->maxx);
+                view.viewport1[1] = MALI_POSITIVE(maxy);
+        } 
+
+        ctx->payload_tiler.postfix.viewport =
+                panfrost_upload_transient(ctx,
+                                &view,
+                                sizeof(struct mali_viewport));
+
         ctx->dirty = 0;
 }
 
@@ -1692,7 +1426,7 @@ panfrost_draw_vbo(
 
         /* Fallback for unsupported modes */
 
-        if (!(ctx->draw_modes & mode)) {
+        if (!(ctx->draw_modes & (1 << mode))) {
                 if (mode == PIPE_PRIM_QUADS && info->count == 4 && ctx->rasterizer && !ctx->rasterizer->base.flatshade) {
                         mode = PIPE_PRIM_TRIANGLE_FAN;
                 } else {
@@ -1793,24 +1527,6 @@ panfrost_generic_cso_delete(struct pipe_context *pctx, void *hwcso)
         free(hwcso);
 }
 
-static void
-panfrost_set_scissor(struct panfrost_context *ctx)
-{
-        const struct pipe_scissor_state *ss = &ctx->scissor;
-
-        if (ss && ctx->rasterizer && ctx->rasterizer->base.scissor && 0) {
-                ctx->viewport->viewport0[0] = ss->minx;
-                ctx->viewport->viewport0[1] = ss->miny;
-                ctx->viewport->viewport1[0] = MALI_POSITIVE(ss->maxx);
-                ctx->viewport->viewport1[1] = MALI_POSITIVE(ss->maxy);
-        } else {
-                ctx->viewport->viewport0[0] = 0;
-                ctx->viewport->viewport0[1] = 0;
-                ctx->viewport->viewport1[0] = MALI_POSITIVE(ctx->pipe_framebuffer.width);
-                ctx->viewport->viewport1[1] = MALI_POSITIVE(ctx->pipe_framebuffer.height);
-        }
-}
-
 static void *
 panfrost_create_rasterizer_state(
         struct pipe_context *pctx,
@@ -1842,21 +1558,12 @@ panfrost_bind_rasterizer_state(
         void *hwcso)
 {
         struct panfrost_context *ctx = pan_context(pctx);
-        struct pipe_rasterizer_state *cso = hwcso;
 
         /* TODO: Why can't rasterizer be NULL ever? Other drivers are fine.. */
         if (!hwcso)
                 return;
 
-        /* If scissor test has changed, we'll need to update that now */
-        bool update_scissor = !ctx->rasterizer || ctx->rasterizer->base.scissor != cso->scissor;
-
         ctx->rasterizer = hwcso;
-
-        /* Actualise late changes */
-        if (update_scissor)
-                panfrost_set_scissor(ctx);
-
         ctx->dirty |= PAN_DIRTY_RASTERIZER;
 }
 
@@ -2234,17 +1941,19 @@ panfrost_create_sampler_view(
 
         enum mali_format format = panfrost_find_format(desc);
 
+        bool is_depth = desc->format == PIPE_FORMAT_Z32_UNORM;
+
         unsigned usage2_layout = 0x10;
 
         switch (prsrc->bo->layout) {
                 case PAN_AFBC:
-                        usage2_layout |= 0xc;
+                        usage2_layout |= 0x8 | 0x4;
                         break;
                 case PAN_TILED:
                         usage2_layout |= 0x1;
                         break;
                 case PAN_LINEAR:
-                        usage2_layout |= 0x2;
+                        usage2_layout |= is_depth ? 0x1 : 0x2;
                         break;
                 default:
                         assert(0);
@@ -2350,8 +2059,6 @@ panfrost_set_framebuffer_state(struct pipe_context *pctx,
                         ctx->vt_framebuffer_mfbd = panfrost_emit_mfbd(ctx);
 
                 panfrost_attach_vt_framebuffer(ctx);
-                panfrost_new_frag_framebuffer(ctx);
-                panfrost_set_scissor(ctx);
 
                 struct panfrost_resource *tex = ((struct panfrost_resource *) ctx->pipe_framebuffer.cbufs[i]->texture);
                 bool is_scanout = panfrost_is_scanout(ctx);
@@ -2385,13 +2092,8 @@ panfrost_set_framebuffer_state(struct pipe_context *pctx,
                                         ctx->vt_framebuffer_mfbd = panfrost_emit_mfbd(ctx);
 
                                 panfrost_attach_vt_framebuffer(ctx);
-                                panfrost_new_frag_framebuffer(ctx);
-                                panfrost_set_scissor(ctx);
 
-                                struct panfrost_resource *tex = ((struct panfrost_resource *) ctx->pipe_framebuffer.zsbuf->texture);
-
-                                if (tex->bo->layout != PAN_AFBC && !panfrost_is_scanout(ctx))
-                                        panfrost_enable_afbc(ctx, tex, true);
+                                /* Keep the depth FBO linear */
                         }
                 }
         }
@@ -2577,8 +2279,6 @@ panfrost_set_scissor_states(struct pipe_context *pipe,
         assert(num_scissors == 1);
 
         ctx->scissor = *scissors;
-
-        panfrost_set_scissor(ctx);
 }
 
 static void
@@ -2703,6 +2403,46 @@ panfrost_get_query_result(struct pipe_context *pipe,
         return true;
 }
 
+static struct pipe_stream_output_target *
+panfrost_create_stream_output_target(struct pipe_context *pctx,
+                                struct pipe_resource *prsc,
+                                unsigned buffer_offset,
+                                unsigned buffer_size)
+{
+        struct pipe_stream_output_target *target;
+
+        target = CALLOC_STRUCT(pipe_stream_output_target);
+
+        if (!target)
+                return NULL;
+
+        pipe_reference_init(&target->reference, 1);
+        pipe_resource_reference(&target->buffer, prsc);
+
+        target->context = pctx;
+        target->buffer_offset = buffer_offset;
+        target->buffer_size = buffer_size;
+
+        return target;
+}
+
+static void
+panfrost_stream_output_target_destroy(struct pipe_context *pctx,
+                                 struct pipe_stream_output_target *target)
+{
+        pipe_resource_reference(&target->buffer, NULL);
+        free(target);
+}
+
+static void
+panfrost_set_stream_output_targets(struct pipe_context *pctx,
+                              unsigned num_targets,
+                              struct pipe_stream_output_target **targets,
+                              const unsigned *offsets)
+{
+        /* STUB */
+}
+
 static void
 panfrost_setup_hardware(struct panfrost_context *ctx)
 {
@@ -2740,8 +2480,9 @@ panfrost_create_context(struct pipe_screen *screen, void *priv, unsigned flags)
         unsigned gpu_id;
 
         gpu_id = pscreen->driver->query_gpu_version(pscreen);
-        ctx->is_t6xx = gpu_id <= 0x0750; /* For now, this flag means t76x or less */
-        ctx->require_sfbd = gpu_id < 0x0750; /* t76x is the first to support MFD */
+
+        ctx->is_t6xx = gpu_id <= 0x0750; /* For now, this flag means T760 or less */
+        ctx->require_sfbd = gpu_id < 0x0750; /* T760 is the first to support MFBD */
 
         gallium->screen = screen;
 
@@ -2806,6 +2547,10 @@ panfrost_create_context(struct pipe_screen *screen, void *priv, unsigned flags)
         gallium->end_query = panfrost_end_query;
         gallium->get_query_result = panfrost_get_query_result;
 
+        gallium->create_stream_output_target = panfrost_create_stream_output_target;
+        gallium->stream_output_target_destroy = panfrost_stream_output_target_destroy;
+        gallium->set_stream_output_targets = panfrost_set_stream_output_targets;
+
         panfrost_resource_context_init(gallium);
 
         pscreen->driver->init_context(ctx);
@@ -2831,7 +2576,6 @@ panfrost_create_context(struct pipe_screen *screen, void *priv, unsigned flags)
         panfrost_emit_vertex_payload(ctx);
         panfrost_emit_tiler_payload(ctx);
         panfrost_invalidate_frame(ctx);
-        panfrost_viewport(ctx, 0.0, 1.0, 0, 0, ctx->pipe_framebuffer.width, ctx->pipe_framebuffer.height);
         panfrost_default_shader_backend(ctx);
         panfrost_generate_space_filler_indices();