src/gallium/drivers/panfrost/pan_cmdstream.c

   1 /*
   2  * Copyright (C) 2018 Alyssa Rosenzweig
   3  * Copyright (C) 2020 Collabora Ltd.
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   9  * and/or sell copies of the Software, and to permit persons to whom the
  10  * Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice (including the next
  13  * paragraph) shall be included in all copies or substantial portions of the
  14  * Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22  * SOFTWARE.
  23  */
  24
  25 #include "util/macros.h"
  26 #include "util/u_prim.h"
  27 #include "util/u_vbuf.h"
  28
  29 #include "panfrost-quirks.h"
  30
  31 #include "pan_allocate.h"
  32 #include "pan_bo.h"
  33 #include "pan_cmdstream.h"
  34 #include "pan_context.h"
  35 #include "pan_job.h"
  36
  37 /* If a BO is accessed for a particular shader stage, will it be in the primary
  38  * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
  39  * fragment will be primary, e.g. compute jobs will be considered
  40  * "vertex/tiler" by analogy */
  41
  42 static inline uint32_t
  43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
  44 {
  45         assert(stage == PIPE_SHADER_FRAGMENT ||
  46                stage == PIPE_SHADER_VERTEX ||
  47                stage == PIPE_SHADER_COMPUTE);
  48
  49         return stage == PIPE_SHADER_FRAGMENT ?
  50                PAN_BO_ACCESS_FRAGMENT :
  51                PAN_BO_ACCESS_VERTEX_TILER;
  52 }
  53
  54 static void
  55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
  56                                struct mali_vertex_tiler_postfix *postfix)
  57 {
  58         struct panfrost_device *dev = pan_device(ctx->base.screen);
  59         struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
  60
  61         unsigned shift = panfrost_get_stack_shift(batch->stack_size);
  62         struct mali_shared_memory shared = {
  63                 .stack_shift = shift,
  64                 .scratchpad = panfrost_batch_get_scratchpad(batch, shift, dev->thread_tls_alloc, dev->core_count)->gpu,
  65                 .shared_workgroup_count = ~0,
  66         };
  67         postfix->shared_memory = panfrost_upload_transient(batch, &shared, sizeof(shared));
  68 }
  69
  70 static void
  71 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
  72                                struct mali_vertex_tiler_postfix *postfix)
  73 {
  74         struct panfrost_device *dev = pan_device(ctx->base.screen);
  75         struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
  76
  77         /* If we haven't, reserve space for the framebuffer */
  78
  79         if (!batch->framebuffer.gpu) {
  80                 unsigned size = (dev->quirks & MIDGARD_SFBD) ?
  81                         sizeof(struct mali_single_framebuffer) :
  82                         sizeof(struct mali_framebuffer);
  83
  84                 batch->framebuffer = panfrost_allocate_transient(batch, size);
  85
  86                 /* Tag the pointer */
  87                 if (!(dev->quirks & MIDGARD_SFBD))
  88                         batch->framebuffer.gpu |= MALI_MFBD;
  89         }
  90
  91         postfix->shared_memory = batch->framebuffer.gpu;
  92 }
  93
  94 static void
  95 panfrost_vt_update_rasterizer(struct panfrost_context *ctx,
  96                               struct mali_vertex_tiler_prefix *prefix,
  97                               struct mali_vertex_tiler_postfix *postfix)
  98 {
  99         struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
 100
 101         postfix->gl_enables |= 0x7;
 102         SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
 103                 rasterizer && rasterizer->base.front_ccw);
 104         SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
 105                 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_FRONT));
 106         SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
 107                 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_BACK));
 108         SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
 109                 rasterizer && rasterizer->base.flatshade_first);
 110 }
 111
 112 void
 113 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
 114                                   struct mali_vertex_tiler_prefix *prefix,
 115                                   union midgard_primitive_size *primitive_size)
 116 {
 117         struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
 118
 119         if (!panfrost_writes_point_size(ctx)) {
 120                 bool points = prefix->draw_mode == MALI_POINTS;
 121                 float val = 0.0f;
 122
 123                 if (rasterizer)
 124                         val = points ?
 125                               rasterizer->base.point_size :
 126                               rasterizer->base.line_width;
 127
 128                 primitive_size->constant = val;
 129         }
 130 }
 131
 132 static void
 133 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
 134                                    struct mali_vertex_tiler_postfix *postfix)
 135 {
 136         SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
 137         if (ctx->occlusion_query)
 138                 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
 139         else
 140                 postfix->occlusion_counter = 0;
 141 }
 142
 143 void
 144 panfrost_vt_init(struct panfrost_context *ctx,
 145                  enum pipe_shader_type stage,
 146                  struct mali_vertex_tiler_prefix *prefix,
 147                  struct mali_vertex_tiler_postfix *postfix)
 148 {
 149         struct panfrost_device *device = pan_device(ctx->base.screen);
 150
 151         if (!ctx->shader[stage])
 152                 return;
 153
 154         memset(prefix, 0, sizeof(*prefix));
 155         memset(postfix, 0, sizeof(*postfix));
 156
 157         if (device->quirks & IS_BIFROST) {
 158                 postfix->gl_enables = 0x2;
 159                 panfrost_vt_emit_shared_memory(ctx, postfix);
 160         } else {
 161                 postfix->gl_enables = 0x6;
 162                 panfrost_vt_attach_framebuffer(ctx, postfix);
 163         }
 164
 165         if (stage == PIPE_SHADER_FRAGMENT) {
 166                 panfrost_vt_update_occlusion_query(ctx, postfix);
 167                 panfrost_vt_update_rasterizer(ctx, prefix, postfix);
 168         }
 169 }
 170
 171 static unsigned
 172 panfrost_translate_index_size(unsigned size)
 173 {
 174         switch (size) {
 175         case 1:
 176                 return MALI_DRAW_INDEXED_UINT8;
 177
 178         case 2:
 179                 return MALI_DRAW_INDEXED_UINT16;
 180
 181         case 4:
 182                 return MALI_DRAW_INDEXED_UINT32;
 183
 184         default:
 185                 unreachable("Invalid index size");
 186         }
 187 }
 188
 189 /* Gets a GPU address for the associated index buffer. Only gauranteed to be
 190  * good for the duration of the draw (transient), could last longer. Also get
 191  * the bounds on the index buffer for the range accessed by the draw. We do
 192  * these operations together because there are natural optimizations which
 193  * require them to be together. */
 194
 195 static mali_ptr
 196 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
 197                                   const struct pipe_draw_info *info,
 198                                   unsigned *min_index, unsigned *max_index)
 199 {
 200         struct panfrost_resource *rsrc = pan_resource(info->index.resource);
 201         struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
 202         off_t offset = info->start * info->index_size;
 203         bool needs_indices = true;
 204         mali_ptr out = 0;
 205
 206         if (info->max_index != ~0u) {
 207                 *min_index = info->min_index;
 208                 *max_index = info->max_index;
 209                 needs_indices = false;
 210         }
 211
 212         if (!info->has_user_indices) {
 213                 /* Only resources can be directly mapped */
 214                 panfrost_batch_add_bo(batch, rsrc->bo,
 215                                       PAN_BO_ACCESS_SHARED |
 216                                       PAN_BO_ACCESS_READ |
 217                                       PAN_BO_ACCESS_VERTEX_TILER);
 218                 out = rsrc->bo->gpu + offset;
 219
 220                 /* Check the cache */
 221                 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
 222                                                            info->start,
 223                                                            info->count,
 224                                                            min_index,
 225                                                            max_index);
 226         } else {
 227                 /* Otherwise, we need to upload to transient memory */
 228                 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
 229                 out = panfrost_upload_transient(batch, ibuf8 + offset,
 230                                                 info->count *
 231                                                 info->index_size);
 232         }
 233
 234         if (needs_indices) {
 235                 /* Fallback */
 236                 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
 237
 238                 if (!info->has_user_indices)
 239                         panfrost_minmax_cache_add(rsrc->index_cache,
 240                                                   info->start, info->count,
 241                                                   *min_index, *max_index);
 242         }
 243
 244         return out;
 245 }
 246
 247 void
 248 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
 249                           const struct pipe_draw_info *info,
 250                           enum mali_draw_mode draw_mode,
 251                           struct mali_vertex_tiler_postfix *vertex_postfix,
 252                           struct mali_vertex_tiler_prefix *tiler_prefix,
 253                           struct mali_vertex_tiler_postfix *tiler_postfix,
 254                           unsigned *vertex_count,
 255                           unsigned *padded_count)
 256 {
 257         tiler_prefix->draw_mode = draw_mode;
 258
 259         unsigned draw_flags = 0;
 260
 261         if (panfrost_writes_point_size(ctx))
 262                 draw_flags |= MALI_DRAW_VARYING_SIZE;
 263
 264         if (info->primitive_restart)
 265                 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
 266
 267         /* These doesn't make much sense */
 268
 269         draw_flags |= 0x3000;
 270
 271         if (info->index_size) {
 272                 unsigned min_index = 0, max_index = 0;
 273
 274                 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
 275                                                                        info,
 276                                                                        &min_index,
 277                                                                        &max_index);
 278
 279                 /* Use the corresponding values */
 280                 *vertex_count = max_index - min_index + 1;
 281                 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
 282                 tiler_prefix->offset_bias_correction = -min_index;
 283                 tiler_prefix->index_count = MALI_POSITIVE(info->count);
 284                 draw_flags |= panfrost_translate_index_size(info->index_size);
 285         } else {
 286                 tiler_prefix->indices = 0;
 287                 *vertex_count = ctx->vertex_count;
 288                 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
 289                 tiler_prefix->offset_bias_correction = 0;
 290                 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
 291         }
 292
 293         tiler_prefix->unknown_draw = draw_flags;
 294
 295         /* Encode the padded vertex count */
 296
 297         if (info->instance_count > 1) {
 298                 *padded_count = panfrost_padded_vertex_count(*vertex_count);
 299
 300                 unsigned shift = __builtin_ctz(ctx->padded_count);
 301                 unsigned k = ctx->padded_count >> (shift + 1);
 302
 303                 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
 304                 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
 305         } else {
 306                 *padded_count = *vertex_count;
 307
 308                 /* Reset instancing state */
 309                 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
 310                 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
 311         }
 312 }
 313
 314 static void
 315 panfrost_shader_meta_init(struct panfrost_context *ctx,
 316                           enum pipe_shader_type st,
 317                           struct mali_shader_meta *meta)
 318 {
 319         const struct panfrost_device *dev = pan_device(ctx->base.screen);
 320         struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
 321
 322         memset(meta, 0, sizeof(*meta));
 323         meta->shader = (ss->bo ? ss->bo->gpu : 0) | ss->first_tag;
 324         meta->attribute_count = ss->attribute_count;
 325         meta->varying_count = ss->varying_count;
 326         meta->texture_count = ctx->sampler_view_count[st];
 327         meta->sampler_count = ctx->sampler_count[st];
 328
 329         if (dev->quirks & IS_BIFROST) {
 330                 meta->bifrost1.unk1 = 0x800200;
 331                 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
 332                 meta->bifrost2.preload_regs = 0xC0;
 333                 meta->bifrost2.uniform_count = MIN2(ss->uniform_count,
 334                                                     ss->uniform_cutoff);
 335         } else {
 336                 meta->midgard1.uniform_count = MIN2(ss->uniform_count,
 337                                                     ss->uniform_cutoff);
 338                 meta->midgard1.work_count = ss->work_reg_count;
 339                 meta->midgard1.flags_hi = 0x8; /* XXX */
 340                 meta->midgard1.flags_lo = 0x220;
 341                 meta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
 342         }
 343
 344 }
 345
 346 static unsigned
 347 panfrost_translate_compare_func(enum pipe_compare_func in)
 348 {
 349         switch (in) {
 350         case PIPE_FUNC_NEVER:
 351                 return MALI_FUNC_NEVER;
 352
 353         case PIPE_FUNC_LESS:
 354                 return MALI_FUNC_LESS;
 355
 356         case PIPE_FUNC_EQUAL:
 357                 return MALI_FUNC_EQUAL;
 358
 359         case PIPE_FUNC_LEQUAL:
 360                 return MALI_FUNC_LEQUAL;
 361
 362         case PIPE_FUNC_GREATER:
 363                 return MALI_FUNC_GREATER;
 364
 365         case PIPE_FUNC_NOTEQUAL:
 366                 return MALI_FUNC_NOTEQUAL;
 367
 368         case PIPE_FUNC_GEQUAL:
 369                 return MALI_FUNC_GEQUAL;
 370
 371         case PIPE_FUNC_ALWAYS:
 372                 return MALI_FUNC_ALWAYS;
 373
 374         default:
 375                 unreachable("Invalid func");
 376         }
 377 }
 378
 379 static unsigned
 380 panfrost_translate_stencil_op(enum pipe_stencil_op in)
 381 {
 382         switch (in) {
 383         case PIPE_STENCIL_OP_KEEP:
 384                 return MALI_STENCIL_KEEP;
 385
 386         case PIPE_STENCIL_OP_ZERO:
 387                 return MALI_STENCIL_ZERO;
 388
 389         case PIPE_STENCIL_OP_REPLACE:
 390                return MALI_STENCIL_REPLACE;
 391
 392         case PIPE_STENCIL_OP_INCR:
 393                 return MALI_STENCIL_INCR;
 394
 395         case PIPE_STENCIL_OP_DECR:
 396                 return MALI_STENCIL_DECR;
 397
 398         case PIPE_STENCIL_OP_INCR_WRAP:
 399                 return MALI_STENCIL_INCR_WRAP;
 400
 401         case PIPE_STENCIL_OP_DECR_WRAP:
 402                 return MALI_STENCIL_DECR_WRAP;
 403
 404         case PIPE_STENCIL_OP_INVERT:
 405                 return MALI_STENCIL_INVERT;
 406
 407         default:
 408                 unreachable("Invalid stencil op");
 409         }
 410 }
 411
 412 static unsigned
 413 translate_tex_wrap(enum pipe_tex_wrap w)
 414 {
 415         switch (w) {
 416         case PIPE_TEX_WRAP_REPEAT:
 417                 return MALI_WRAP_REPEAT;
 418
 419         case PIPE_TEX_WRAP_CLAMP:
 420                 return MALI_WRAP_CLAMP;
 421
 422         case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
 423                 return MALI_WRAP_CLAMP_TO_EDGE;
 424
 425         case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
 426                 return MALI_WRAP_CLAMP_TO_BORDER;
 427
 428         case PIPE_TEX_WRAP_MIRROR_REPEAT:
 429                 return MALI_WRAP_MIRRORED_REPEAT;
 430
 431         case PIPE_TEX_WRAP_MIRROR_CLAMP:
 432                 return MALI_WRAP_MIRRORED_CLAMP;
 433
 434         case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
 435                 return MALI_WRAP_MIRRORED_CLAMP_TO_EDGE;
 436
 437         case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
 438                 return MALI_WRAP_MIRRORED_CLAMP_TO_BORDER;
 439
 440         default:
 441                 unreachable("Invalid wrap");
 442         }
 443 }
 444
 445 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
 446                                 struct mali_sampler_descriptor *hw)
 447 {
 448         unsigned func = panfrost_translate_compare_func(cso->compare_func);
 449         bool min_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
 450         bool mag_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
 451         bool mip_linear  = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR;
 452         unsigned min_filter = min_nearest ? MALI_SAMP_MIN_NEAREST : 0;
 453         unsigned mag_filter = mag_nearest ? MALI_SAMP_MAG_NEAREST : 0;
 454         unsigned mip_filter = mip_linear  ?
 455                               (MALI_SAMP_MIP_LINEAR_1 | MALI_SAMP_MIP_LINEAR_2) : 0;
 456         unsigned normalized = cso->normalized_coords ? MALI_SAMP_NORM_COORDS : 0;
 457
 458         *hw = (struct mali_sampler_descriptor) {
 459                 .filter_mode = min_filter | mag_filter | mip_filter |
 460                                normalized,
 461                 .wrap_s = translate_tex_wrap(cso->wrap_s),
 462                 .wrap_t = translate_tex_wrap(cso->wrap_t),
 463                 .wrap_r = translate_tex_wrap(cso->wrap_r),
 464                 .compare_func = panfrost_flip_compare_func(func),
 465                 .border_color = {
 466                         cso->border_color.f[0],
 467                         cso->border_color.f[1],
 468                         cso->border_color.f[2],
 469                         cso->border_color.f[3]
 470                 },
 471                 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
 472                 .max_lod = FIXED_16(cso->max_lod, false),
 473                 .lod_bias = FIXED_16(cso->lod_bias, true), /* can be negative */
 474                 .seamless_cube_map = cso->seamless_cube_map,
 475         };
 476
 477         /* If necessary, we disable mipmapping in the sampler descriptor by
 478          * clamping the LOD as tight as possible (from 0 to epsilon,
 479          * essentially -- remember these are fixed point numbers, so
 480          * epsilon=1/256) */
 481
 482         if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
 483                 hw->max_lod = hw->min_lod + 1;
 484 }
 485
 486 static void
 487 panfrost_make_stencil_state(const struct pipe_stencil_state *in,
 488                             struct mali_stencil_test *out)
 489 {
 490         out->ref = 0; /* Gallium gets it from elsewhere */
 491
 492         out->mask = in->valuemask;
 493         out->func = panfrost_translate_compare_func(in->func);
 494         out->sfail = panfrost_translate_stencil_op(in->fail_op);
 495         out->dpfail = panfrost_translate_stencil_op(in->zfail_op);
 496         out->dppass = panfrost_translate_stencil_op(in->zpass_op);
 497 }
 498
 499 static void
 500 panfrost_frag_meta_rasterizer_update(struct panfrost_context *ctx,
 501                                      struct mali_shader_meta *fragmeta)
 502 {
 503         if (!ctx->rasterizer) {
 504                 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, true);
 505                 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, false);
 506                 fragmeta->depth_units = 0.0f;
 507                 fragmeta->depth_factor = 0.0f;
 508                 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, false);
 509                 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, false);
 510                 return;
 511         }
 512
 513         bool msaa = ctx->rasterizer->base.multisample;
 514
 515         /* TODO: Sample size */
 516         SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
 517         SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
 518         fragmeta->depth_units = ctx->rasterizer->base.offset_units * 2.0f;
 519         fragmeta->depth_factor = ctx->rasterizer->base.offset_scale;
 520
 521         /* XXX: Which bit is which? Does this maybe allow offseting not-tri? */
 522
 523         SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A,
 524                 ctx->rasterizer->base.offset_tri);
 525         SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B,
 526                 ctx->rasterizer->base.offset_tri);
 527 }
 528
 529 static void
 530 panfrost_frag_meta_zsa_update(struct panfrost_context *ctx,
 531                               struct mali_shader_meta *fragmeta)
 532 {
 533         const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
 534         int zfunc = PIPE_FUNC_ALWAYS;
 535
 536         if (!zsa) {
 537                 struct pipe_stencil_state default_stencil = {
 538                         .enabled = 0,
 539                         .func = PIPE_FUNC_ALWAYS,
 540                         .fail_op = MALI_STENCIL_KEEP,
 541                         .zfail_op = MALI_STENCIL_KEEP,
 542                         .zpass_op = MALI_STENCIL_KEEP,
 543                         .writemask = 0xFF,
 544                         .valuemask = 0xFF
 545                 };
 546
 547                 panfrost_make_stencil_state(&default_stencil,
 548                                             &fragmeta->stencil_front);
 549                 fragmeta->stencil_mask_front = default_stencil.writemask;
 550                 fragmeta->stencil_back = fragmeta->stencil_front;
 551                 fragmeta->stencil_mask_back = default_stencil.writemask;
 552                 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST, false);
 553                 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK, false);
 554         } else {
 555                 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
 556                         zsa->stencil[0].enabled);
 557                 panfrost_make_stencil_state(&zsa->stencil[0],
 558                                             &fragmeta->stencil_front);
 559                 fragmeta->stencil_mask_front = zsa->stencil[0].writemask;
 560                 fragmeta->stencil_front.ref = ctx->stencil_ref.ref_value[0];
 561
 562                 /* If back-stencil is not enabled, use the front values */
 563
 564                 if (zsa->stencil[1].enabled) {
 565                         panfrost_make_stencil_state(&zsa->stencil[1],
 566                                                     &fragmeta->stencil_back);
 567                         fragmeta->stencil_mask_back = zsa->stencil[1].writemask;
 568                         fragmeta->stencil_back.ref = ctx->stencil_ref.ref_value[1];
 569                 } else {
 570                         fragmeta->stencil_back = fragmeta->stencil_front;
 571                         fragmeta->stencil_mask_back = fragmeta->stencil_mask_front;
 572                         fragmeta->stencil_back.ref = fragmeta->stencil_front.ref;
 573                 }
 574
 575                 if (zsa->depth.enabled)
 576                         zfunc = zsa->depth.func;
 577
 578                 /* Depth state (TODO: Refactor) */
 579
 580                 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
 581                         zsa->depth.writemask);
 582         }
 583
 584         fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
 585         fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(zfunc));
 586 }
 587
 588 static void
 589 panfrost_frag_meta_blend_update(struct panfrost_context *ctx,
 590                                 struct mali_shader_meta *fragmeta,
 591                                 struct midgard_blend_rt *rts)
 592 {
 593         const struct panfrost_device *dev = pan_device(ctx->base.screen);
 594
 595         SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
 596                 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
 597                 !ctx->blend->base.dither);
 598
 599         /* Get blending setup */
 600         unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
 601
 602         struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
 603         unsigned shader_offset = 0;
 604         struct panfrost_bo *shader_bo = NULL;
 605
 606         for (unsigned c = 0; c < rt_count; ++c)
 607                 blend[c] = panfrost_get_blend_for_context(ctx, c, &shader_bo,
 608                                                           &shader_offset);
 609
 610          /* If there is a blend shader, work registers are shared. XXX: opt */
 611
 612         for (unsigned c = 0; c < rt_count; ++c) {
 613                 if (blend[c].is_shader)
 614                         fragmeta->midgard1.work_count = 16;
 615         }
 616
 617         /* Even on MFBD, the shader descriptor gets blend shaders. It's *also*
 618          * copied to the blend_meta appended (by convention), but this is the
 619          * field actually read by the hardware. (Or maybe both are read...?).
 620          * Specify the last RTi with a blend shader. */
 621
 622         fragmeta->blend.shader = 0;
 623
 624         for (signed rt = (rt_count - 1); rt >= 0; --rt) {
 625                 if (!blend[rt].is_shader)
 626                         continue;
 627
 628                 fragmeta->blend.shader = blend[rt].shader.gpu |
 629                                          blend[rt].shader.first_tag;
 630                 break;
 631         }
 632
 633         if (dev->quirks & MIDGARD_SFBD) {
 634                 /* When only a single render target platform is used, the blend
 635                  * information is inside the shader meta itself. We additionally
 636                  * need to signal CAN_DISCARD for nontrivial blend modes (so
 637                  * we're able to read back the destination buffer) */
 638
 639                 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
 640                         blend[0].is_shader);
 641
 642                 if (!blend[0].is_shader) {
 643                         fragmeta->blend.equation = *blend[0].equation.equation;
 644                         fragmeta->blend.constant = blend[0].equation.constant;
 645                 }
 646
 647                 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
 648                         !blend[0].no_blending);
 649                 return;
 650         }
 651
 652         /* Additional blend descriptor tacked on for jobs using MFBD */
 653
 654         for (unsigned i = 0; i < rt_count; ++i) {
 655                 rts[i].flags = 0x200;
 656
 657                 bool is_srgb = (ctx->pipe_framebuffer.nr_cbufs > i) &&
 658                                (ctx->pipe_framebuffer.cbufs[i]) &&
 659                                util_format_is_srgb(ctx->pipe_framebuffer.cbufs[i]->format);
 660
 661                 SET_BIT(rts[i].flags, MALI_BLEND_MRT_SHADER, blend[i].is_shader);
 662                 SET_BIT(rts[i].flags, MALI_BLEND_LOAD_TIB, !blend[i].no_blending);
 663                 SET_BIT(rts[i].flags, MALI_BLEND_SRGB, is_srgb);
 664                 SET_BIT(rts[i].flags, MALI_BLEND_NO_DITHER, !ctx->blend->base.dither);
 665
 666                 if (blend[i].is_shader) {
 667                         rts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
 668                 } else {
 669                         rts[i].blend.equation = *blend[i].equation.equation;
 670                         rts[i].blend.constant = blend[i].equation.constant;
 671                 }
 672         }
 673 }
 674
 675 static void
 676 panfrost_frag_shader_meta_init(struct panfrost_context *ctx,
 677                                struct mali_shader_meta *fragmeta,
 678                                struct midgard_blend_rt *rts)
 679 {
 680         const struct panfrost_device *dev = pan_device(ctx->base.screen);
 681         struct panfrost_shader_state *fs;
 682
 683         fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
 684
 685         fragmeta->alpha_coverage = ~MALI_ALPHA_COVERAGE(0.000000);
 686         fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x3010;
 687         fragmeta->unknown2_4 = 0x4e0;
 688
 689         /* unknown2_4 has 0x10 bit set on T6XX and T720. We don't know why this
 690          * is required (independent of 32-bit/64-bit descriptors), or why it's
 691          * not used on later GPU revisions. Otherwise, all shader jobs fault on
 692          * these earlier chips (perhaps this is a chicken bit of some kind).
 693          * More investigation is needed. */
 694
 695         SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
 696
 697         /* Depending on whether it's legal to in the given shader, we try to
 698          * enable early-z testing (or forward-pixel kill?) */
 699
 700         SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
 701                 !fs->can_discard && !fs->writes_depth);
 702
 703         /* Add the writes Z/S flags if needed. */
 704         SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
 705         SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
 706
 707         /* Any time texturing is used, derivatives are implicitly calculated,
 708          * so we need to enable helper invocations */
 709
 710         SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
 711                 fs->helper_invocations);
 712
 713         /* CAN_DISCARD should be set if the fragment shader possibly contains a
 714          * 'discard' instruction. It is likely this is related to optimizations
 715          * related to forward-pixel kill, as per "Mali Performance 3: Is
 716          * EGL_BUFFER_PRESERVED a good thing?" by Peter Harris */
 717
 718         SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD, fs->can_discard);
 719         SET_BIT(fragmeta->midgard1.flags_lo, 0x400, fs->can_discard);
 720
 721         panfrost_frag_meta_rasterizer_update(ctx, fragmeta);
 722         panfrost_frag_meta_zsa_update(ctx, fragmeta);
 723         panfrost_frag_meta_blend_update(ctx, fragmeta, rts);
 724 }
 725
 726 void
 727 panfrost_emit_shader_meta(struct panfrost_batch *batch,
 728                           enum pipe_shader_type st,
 729                           struct mali_vertex_tiler_postfix *postfix)
 730 {
 731         struct panfrost_context *ctx = batch->ctx;
 732         struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
 733
 734         if (!ss) {
 735                 postfix->shader = 0;
 736                 return;
 737         }
 738
 739         struct mali_shader_meta meta;
 740
 741         panfrost_shader_meta_init(ctx, st, &meta);
 742
 743         /* Add the shader BO to the batch. */
 744         panfrost_batch_add_bo(batch, ss->bo,
 745                               PAN_BO_ACCESS_PRIVATE |
 746                               PAN_BO_ACCESS_READ |
 747                               panfrost_bo_access_for_stage(st));
 748
 749         mali_ptr shader_ptr;
 750
 751         if (st == PIPE_SHADER_FRAGMENT) {
 752                 struct panfrost_device *dev = pan_device(ctx->base.screen);
 753                 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
 754                 size_t desc_size = sizeof(meta);
 755                 struct midgard_blend_rt rts[4];
 756                 struct panfrost_transfer xfer;
 757
 758                 assert(rt_count <= ARRAY_SIZE(rts));
 759
 760                 panfrost_frag_shader_meta_init(ctx, &meta, rts);
 761
 762                 if (!(dev->quirks & MIDGARD_SFBD))
 763                         desc_size += sizeof(*rts) * rt_count;
 764
 765                 xfer = panfrost_allocate_transient(batch, desc_size);
 766
 767                 memcpy(xfer.cpu, &meta, sizeof(meta));
 768                 memcpy(xfer.cpu + sizeof(meta), rts, sizeof(*rts) * rt_count);
 769
 770                 shader_ptr = xfer.gpu;
 771         } else {
 772                 shader_ptr = panfrost_upload_transient(batch, &meta,
 773                                                        sizeof(meta));
 774         }
 775
 776         postfix->shader = shader_ptr;
 777 }
 778
 779 static void
 780 panfrost_mali_viewport_init(struct panfrost_context *ctx,
 781                             struct mali_viewport *mvp)
 782 {
 783         const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
 784
 785         /* Clip bounds are encoded as floats. The viewport itself is encoded as
 786          * (somewhat) asymmetric ints. */
 787
 788         const struct pipe_scissor_state *ss = &ctx->scissor;
 789
 790         memset(mvp, 0, sizeof(*mvp));
 791
 792         /* By default, do no viewport clipping, i.e. clip to (-inf, inf) in
 793          * each direction. Clipping to the viewport in theory should work, but
 794          * in practice causes issues when we're not explicitly trying to
 795          * scissor */
 796
 797         *mvp = (struct mali_viewport) {
 798                 .clip_minx = -INFINITY,
 799                 .clip_miny = -INFINITY,
 800                 .clip_maxx = INFINITY,
 801                 .clip_maxy = INFINITY,
 802         };
 803
 804         /* Always scissor to the viewport by default. */
 805         float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
 806         float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
 807
 808         float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
 809         float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
 810
 811         float minz = (vp->translate[2] - fabsf(vp->scale[2]));
 812         float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
 813
 814         /* Apply the scissor test */
 815
 816         unsigned minx, miny, maxx, maxy;
 817
 818         if (ss && ctx->rasterizer && ctx->rasterizer->base.scissor) {
 819                 minx = MAX2(ss->minx, vp_minx);
 820                 miny = MAX2(ss->miny, vp_miny);
 821                 maxx = MIN2(ss->maxx, vp_maxx);
 822                 maxy = MIN2(ss->maxy, vp_maxy);
 823         } else {
 824                 minx = vp_minx;
 825                 miny = vp_miny;
 826                 maxx = vp_maxx;
 827                 maxy = vp_maxy;
 828         }
 829
 830         /* Hardware needs the min/max to be strictly ordered, so flip if we
 831          * need to. The viewport transformation in the vertex shader will
 832          * handle the negatives if we don't */
 833
 834         if (miny > maxy) {
 835                 unsigned temp = miny;
 836                 miny = maxy;
 837                 maxy = temp;
 838         }
 839
 840         if (minx > maxx) {
 841                 unsigned temp = minx;
 842                 minx = maxx;
 843                 maxx = temp;
 844         }
 845
 846         if (minz > maxz) {
 847                 float temp = minz;
 848                 minz = maxz;
 849                 maxz = temp;
 850         }
 851
 852         /* Clamp to the framebuffer size as a last check */
 853
 854         minx = MIN2(ctx->pipe_framebuffer.width, minx);
 855         maxx = MIN2(ctx->pipe_framebuffer.width, maxx);
 856
 857         miny = MIN2(ctx->pipe_framebuffer.height, miny);
 858         maxy = MIN2(ctx->pipe_framebuffer.height, maxy);
 859
 860         /* Upload */
 861
 862         mvp->viewport0[0] = minx;
 863         mvp->viewport1[0] = MALI_POSITIVE(maxx);
 864
 865         mvp->viewport0[1] = miny;
 866         mvp->viewport1[1] = MALI_POSITIVE(maxy);
 867
 868         mvp->clip_minz = minz;
 869         mvp->clip_maxz = maxz;
 870 }
 871
 872 void
 873 panfrost_emit_viewport(struct panfrost_batch *batch,
 874                        struct mali_vertex_tiler_postfix *tiler_postfix)
 875 {
 876         struct panfrost_context *ctx = batch->ctx;
 877         struct mali_viewport mvp;
 878
 879         panfrost_mali_viewport_init(batch->ctx,  &mvp);
 880
 881         /* Update the job, unless we're doing wallpapering (whose lack of
 882          * scissor we can ignore, since if we "miss" a tile of wallpaper, it'll
 883          * just... be faster :) */
 884
 885         if (!ctx->wallpaper_batch)
 886                 panfrost_batch_union_scissor(batch, mvp.viewport0[0],
 887                                              mvp.viewport0[1],
 888                                              mvp.viewport1[0] + 1,
 889                                              mvp.viewport1[1] + 1);
 890
 891         tiler_postfix->viewport = panfrost_upload_transient(batch, &mvp,
 892                                                             sizeof(mvp));
 893 }
 894
 895 static mali_ptr
 896 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
 897                                  enum pipe_shader_type st,
 898                                  struct panfrost_constant_buffer *buf,
 899                                  unsigned index)
 900 {
 901         struct pipe_constant_buffer *cb = &buf->cb[index];
 902         struct panfrost_resource *rsrc = pan_resource(cb->buffer);
 903
 904         if (rsrc) {
 905                 panfrost_batch_add_bo(batch, rsrc->bo,
 906                                       PAN_BO_ACCESS_SHARED |
 907                                       PAN_BO_ACCESS_READ |
 908                                       panfrost_bo_access_for_stage(st));
 909
 910                 /* Alignment gauranteed by
 911                  * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
 912                 return rsrc->bo->gpu + cb->buffer_offset;
 913         } else if (cb->user_buffer) {
 914                 return panfrost_upload_transient(batch,
 915                                                  cb->user_buffer +
 916                                                  cb->buffer_offset,
 917                                                  cb->buffer_size);
 918         } else {
 919                 unreachable("No constant buffer");
 920         }
 921 }
 922
/* Storage for a single system value as written by the sysval upload helpers
 * in this file. All members of the union alias the same 16 bytes; each
 * uploader picks whichever view matches the data it writes. */
struct sysval_uniform {
        union {
                /* Four 32-bit float lanes */
                float f[4];
                /* Four signed 32-bit lanes */
                int32_t i[4];
                /* Four unsigned 32-bit lanes */
                uint32_t u[4];
                /* Two 64-bit lanes (the SSBO uploader stores a GPU address
                 * in du[0]) */
                uint64_t du[2];
        };
};
 931
 932 static void
 933 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
 934                                       struct sysval_uniform *uniform)
 935 {
 936         struct panfrost_context *ctx = batch->ctx;
 937         const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
 938
 939         uniform->f[0] = vp->scale[0];
 940         uniform->f[1] = vp->scale[1];
 941         uniform->f[2] = vp->scale[2];
 942 }
 943
 944 static void
 945 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
 946                                        struct sysval_uniform *uniform)
 947 {
 948         struct panfrost_context *ctx = batch->ctx;
 949         const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
 950
 951         uniform->f[0] = vp->translate[0];
 952         uniform->f[1] = vp->translate[1];
 953         uniform->f[2] = vp->translate[2];
 954 }
 955
 956 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
 957                                        enum pipe_shader_type st,
 958                                        unsigned int sysvalid,
 959                                        struct sysval_uniform *uniform)
 960 {
 961         struct panfrost_context *ctx = batch->ctx;
 962         unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
 963         unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
 964         bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
 965         struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
 966
 967         assert(dim);
 968         uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
 969
 970         if (dim > 1)
 971                 uniform->i[1] = u_minify(tex->texture->height0,
 972                                          tex->u.tex.first_level);
 973
 974         if (dim > 2)
 975                 uniform->i[2] = u_minify(tex->texture->depth0,
 976                                          tex->u.tex.first_level);
 977
 978         if (is_array)
 979                 uniform->i[dim] = tex->texture->array_size;
 980 }
 981
 982 static void
 983 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
 984                             enum pipe_shader_type st,
 985                             unsigned ssbo_id,
 986                             struct sysval_uniform *uniform)
 987 {
 988         struct panfrost_context *ctx = batch->ctx;
 989
 990         assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
 991         struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
 992
 993         /* Compute address */
 994         struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
 995
 996         panfrost_batch_add_bo(batch, bo,
 997                               PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
 998                               panfrost_bo_access_for_stage(st));
 999
1000         /* Upload address and size as sysval */
1001         uniform->du[0] = bo->gpu + sb.buffer_offset;
1002         uniform->u[2] = sb.buffer_size;
1003 }
1004
1005 static void
1006 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1007                                enum pipe_shader_type st,
1008                                unsigned samp_idx,
1009                                struct sysval_uniform *uniform)
1010 {
1011         struct panfrost_context *ctx = batch->ctx;
1012         struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1013
1014         uniform->f[0] = sampl->min_lod;
1015         uniform->f[1] = sampl->max_lod;
1016         uniform->f[2] = sampl->lod_bias;
1017
1018         /* Even without any errata, Midgard represents "no mipmapping" as
1019          * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1020          * panfrost_create_sampler_state which also explains our choice of
1021          * epsilon value (again to keep behaviour consistent) */
1022
1023         if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1024                 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1025 }
1026
1027 static void
1028 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1029                                        struct sysval_uniform *uniform)
1030 {
1031         struct panfrost_context *ctx = batch->ctx;
1032
1033         uniform->u[0] = ctx->compute_grid->grid[0];
1034         uniform->u[1] = ctx->compute_grid->grid[1];
1035         uniform->u[2] = ctx->compute_grid->grid[2];
1036 }
1037
1038 static void
1039 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1040                         struct panfrost_shader_state *ss,
1041                         enum pipe_shader_type st)
1042 {
1043         struct sysval_uniform *uniforms = (void *)buf;
1044
1045         for (unsigned i = 0; i < ss->sysval_count; ++i) {
1046                 int sysval = ss->sysval[i];
1047
1048                 switch (PAN_SYSVAL_TYPE(sysval)) {
1049                 case PAN_SYSVAL_VIEWPORT_SCALE:
1050                         panfrost_upload_viewport_scale_sysval(batch,
1051                                                               &uniforms[i]);
1052                         break;
1053                 case PAN_SYSVAL_VIEWPORT_OFFSET:
1054                         panfrost_upload_viewport_offset_sysval(batch,
1055                                                                &uniforms[i]);
1056                         break;
1057                 case PAN_SYSVAL_TEXTURE_SIZE:
1058                         panfrost_upload_txs_sysval(batch, st,
1059                                                    PAN_SYSVAL_ID(sysval),
1060                                                    &uniforms[i]);
1061                         break;
1062                 case PAN_SYSVAL_SSBO:
1063                         panfrost_upload_ssbo_sysval(batch, st,
1064                                                     PAN_SYSVAL_ID(sysval),
1065                                                     &uniforms[i]);
1066                         break;
1067                 case PAN_SYSVAL_NUM_WORK_GROUPS:
1068                         panfrost_upload_num_work_groups_sysval(batch,
1069                                                                &uniforms[i]);
1070                         break;
1071                 case PAN_SYSVAL_SAMPLER:
1072                         panfrost_upload_sampler_sysval(batch, st,
1073                                                        PAN_SYSVAL_ID(sysval),
1074                                                        &uniforms[i]);
1075                         break;
1076                 default:
1077                         assert(0);
1078                 }
1079         }
1080 }
1081
1082 static const void *
1083 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1084                                  unsigned index)
1085 {
1086         struct pipe_constant_buffer *cb = &buf->cb[index];
1087         struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1088
1089         if (rsrc)
1090                 return rsrc->bo->cpu;
1091         else if (cb->user_buffer)
1092                 return cb->user_buffer;
1093         else
1094                 unreachable("No constant buffer");
1095 }
1096
1097 void
1098 panfrost_emit_const_buf(struct panfrost_batch *batch,
1099                         enum pipe_shader_type stage,
1100                         struct mali_vertex_tiler_postfix *postfix)
1101 {
1102         struct panfrost_context *ctx = batch->ctx;
1103         struct panfrost_shader_variants *all = ctx->shader[stage];
1104
1105         if (!all)
1106                 return;
1107
1108         struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1109
1110         struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1111
1112         /* Uniforms are implicitly UBO #0 */
1113         bool has_uniforms = buf->enabled_mask & (1 << 0);
1114
1115         /* Allocate room for the sysval and the uniforms */
1116         size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1117         size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1118         size_t size = sys_size + uniform_size;
1119         struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1120                                                                         size);
1121
1122         /* Upload sysvals requested by the shader */
1123         panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1124
1125         /* Upload uniforms */
1126         if (has_uniforms && uniform_size) {
1127                 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1128                 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1129         }
1130
1131         /* Next up, attach UBOs. UBO #0 is the uniforms we just
1132          * uploaded */
1133
1134         unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1135         assert(ubo_count >= 1);
1136
1137         size_t sz = sizeof(uint64_t) * ubo_count;
1138         uint64_t ubos[PAN_MAX_CONST_BUFFERS];
1139         int uniform_count = ss->uniform_count;
1140
1141         /* Upload uniforms as a UBO */
1142         ubos[0] = MALI_MAKE_UBO(2 + uniform_count, transfer.gpu);
1143
1144         /* The rest are honest-to-goodness UBOs */
1145
1146         for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1147                 size_t usz = buf->cb[ubo].buffer_size;
1148                 bool enabled = buf->enabled_mask & (1 << ubo);
1149                 bool empty = usz == 0;
1150
1151                 if (!enabled || empty) {
1152                         /* Stub out disabled UBOs to catch accesses */
1153                         ubos[ubo] = MALI_MAKE_UBO(0, 0xDEAD0000);
1154                         continue;
1155                 }
1156
1157                 mali_ptr gpu = panfrost_map_constant_buffer_gpu(batch, stage,
1158                                                                 buf, ubo);
1159
1160                 unsigned bytes_per_field = 16;
1161                 unsigned aligned = ALIGN_POT(usz, bytes_per_field);
1162                 ubos[ubo] = MALI_MAKE_UBO(aligned / bytes_per_field, gpu);
1163         }
1164
1165         mali_ptr ubufs = panfrost_upload_transient(batch, ubos, sz);
1166         postfix->uniforms = transfer.gpu;
1167         postfix->uniform_buffers = ubufs;
1168
1169         buf->dirty_mask = 0;
1170 }
1171
1172 void
1173 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1174                             const struct pipe_grid_info *info,
1175                             struct midgard_payload_vertex_tiler *vtp)
1176 {
1177         struct panfrost_context *ctx = batch->ctx;
1178         struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1179         struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1180         unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1181                                                            128));
1182         unsigned shared_size = single_size * info->grid[0] * info->grid[1] *
1183                                info->grid[2] * 4;
1184         struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1185                                                                   shared_size,
1186                                                                   1);
1187
1188         struct mali_shared_memory shared = {
1189                 .shared_memory = bo->gpu,
1190                 .shared_workgroup_count =
1191                         util_logbase2_ceil(info->grid[0]) +
1192                         util_logbase2_ceil(info->grid[1]) +
1193                         util_logbase2_ceil(info->grid[2]),
1194                 .shared_unk1 = 0x2,
1195                 .shared_shift = util_logbase2(single_size) - 1
1196         };
1197
1198         vtp->postfix.shared_memory = panfrost_upload_transient(batch, &shared,
1199                                                                sizeof(shared));
1200 }
1201
1202 static mali_ptr
1203 panfrost_get_tex_desc(struct panfrost_batch *batch,
1204                       enum pipe_shader_type st,
1205                       struct panfrost_sampler_view *view)
1206 {
1207         if (!view)
1208                 return (mali_ptr) 0;
1209
1210         struct pipe_sampler_view *pview = &view->base;
1211         struct panfrost_resource *rsrc = pan_resource(pview->texture);
1212
1213         /* Add the BO to the job so it's retained until the job is done. */
1214
1215         panfrost_batch_add_bo(batch, rsrc->bo,
1216                               PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1217                               panfrost_bo_access_for_stage(st));
1218
1219         panfrost_batch_add_bo(batch, view->bo,
1220                               PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1221                               panfrost_bo_access_for_stage(st));
1222
1223         return view->bo->gpu;
1224 }
1225
1226 void
1227 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1228                                   enum pipe_shader_type stage,
1229                                   struct mali_vertex_tiler_postfix *postfix)
1230 {
1231         struct panfrost_context *ctx = batch->ctx;
1232
1233         if (!ctx->sampler_view_count[stage])
1234                 return;
1235
1236         uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1237
1238          for (int i = 0; i < ctx->sampler_view_count[stage]; ++i)
1239                 trampolines[i] = panfrost_get_tex_desc(batch, stage,
1240                                                        ctx->sampler_views[stage][i]);
1241
1242          postfix->texture_trampoline = panfrost_upload_transient(batch,
1243                                                                  trampolines,
1244                                                                  sizeof(uint64_t) *
1245                                                                  ctx->sampler_view_count[stage]);
1246 }
1247
1248 void
1249 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1250                                   enum pipe_shader_type stage,
1251                                   struct mali_vertex_tiler_postfix *postfix)
1252 {
1253         struct panfrost_context *ctx = batch->ctx;
1254
1255         if (!ctx->sampler_count[stage])
1256                 return;
1257
1258         size_t desc_size = sizeof(struct mali_sampler_descriptor);
1259         size_t transfer_size = desc_size * ctx->sampler_count[stage];
1260         struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1261                                                                         transfer_size);
1262         struct mali_sampler_descriptor *desc = (struct mali_sampler_descriptor *)transfer.cpu;
1263
1264         for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1265                 desc[i] = ctx->samplers[stage][i]->hw;
1266
1267         postfix->sampler_descriptor = transfer.gpu;
1268 }
1269
1270 void
1271 panfrost_emit_vertex_attr_meta(struct panfrost_batch *batch,
1272                                struct mali_vertex_tiler_postfix *vertex_postfix)
1273 {
1274         struct panfrost_context *ctx = batch->ctx;
1275
1276         if (!ctx->vertex)
1277                 return;
1278
1279         struct panfrost_vertex_state *so = ctx->vertex;
1280
1281         panfrost_vertex_state_upd_attr_offs(ctx, vertex_postfix);
1282         vertex_postfix->attribute_meta = panfrost_upload_transient(batch, so->hw,
1283                                                                sizeof(*so->hw) *
1284                                                                PAN_MAX_ATTRIBUTE);
1285 }
1286
1287 void
1288 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1289                           struct mali_vertex_tiler_postfix *vertex_postfix)
1290 {
1291         struct panfrost_context *ctx = batch->ctx;
1292         struct panfrost_vertex_state *so = ctx->vertex;
1293
1294         /* Staged mali_attr, and index into them. i =/= k, depending on the
1295          * vertex buffer mask and instancing. Twice as much room is allocated,
1296          * for a worst case of NPOT_DIVIDEs which take up extra slot */
1297         union mali_attr attrs[PIPE_MAX_ATTRIBS * 2];
1298         unsigned k = 0;
1299
1300         for (unsigned i = 0; i < so->num_elements; ++i) {
1301                 /* We map a mali_attr to be 1:1 with the mali_attr_meta, which
1302                  * means duplicating some vertex buffers (who cares? aside from
1303                  * maybe some caching implications but I somehow doubt that
1304                  * matters) */
1305
1306                 struct pipe_vertex_element *elem = &so->pipe[i];
1307                 unsigned vbi = elem->vertex_buffer_index;
1308
1309                 /* The exception to 1:1 mapping is that we can have multiple
1310                  * entries (NPOT divisors), so we fixup anyways */
1311
1312                 so->hw[i].index = k;
1313
1314                 if (!(ctx->vb_mask & (1 << vbi)))
1315                         continue;
1316
1317                 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1318                 struct panfrost_resource *rsrc;
1319
1320                 rsrc = pan_resource(buf->buffer.resource);
1321                 if (!rsrc)
1322                         continue;
1323
1324                 /* Align to 64 bytes by masking off the lower bits. This
1325                  * will be adjusted back when we fixup the src_offset in
1326                  * mali_attr_meta */
1327
1328                 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1329                 mali_ptr addr = raw_addr & ~63;
1330                 unsigned chopped_addr = raw_addr - addr;
1331
1332                 /* Add a dependency of the batch on the vertex buffer */
1333                 panfrost_batch_add_bo(batch, rsrc->bo,
1334                                       PAN_BO_ACCESS_SHARED |
1335                                       PAN_BO_ACCESS_READ |
1336                                       PAN_BO_ACCESS_VERTEX_TILER);
1337
1338                 /* Set common fields */
1339                 attrs[k].elements = addr;
1340                 attrs[k].stride = buf->stride;
1341
1342                 /* Since we advanced the base pointer, we shrink the buffer
1343                  * size */
1344                 attrs[k].size = rsrc->base.width0 - buf->buffer_offset;
1345
1346                 /* We need to add the extra size we masked off (for
1347                  * correctness) so the data doesn't get clamped away */
1348                 attrs[k].size += chopped_addr;
1349
1350                 /* For non-instancing make sure we initialize */
1351                 attrs[k].shift = attrs[k].extra_flags = 0;
1352
1353                 /* Instancing uses a dramatically different code path than
1354                  * linear, so dispatch for the actual emission now that the
1355                  * common code is finished */
1356
1357                 unsigned divisor = elem->instance_divisor;
1358
1359                 if (divisor && ctx->instance_count == 1) {
1360                         /* Silly corner case where there's a divisor(=1) but
1361                          * there's no legitimate instancing. So we want *every*
1362                          * attribute to be the same. So set stride to zero so
1363                          * we don't go anywhere. */
1364
1365                         attrs[k].size = attrs[k].stride + chopped_addr;
1366                         attrs[k].stride = 0;
1367                         attrs[k++].elements |= MALI_ATTR_LINEAR;
1368                 } else if (ctx->instance_count <= 1) {
1369                         /* Normal, non-instanced attributes */
1370                         attrs[k++].elements |= MALI_ATTR_LINEAR;
1371                 } else {
1372                         unsigned instance_shift = vertex_postfix->instance_shift;
1373                         unsigned instance_odd = vertex_postfix->instance_odd;
1374
1375                         k += panfrost_vertex_instanced(ctx->padded_count,
1376                                                        instance_shift,
1377                                                        instance_odd,
1378                                                        divisor, &attrs[k]);
1379                 }
1380         }
1381
1382         /* Add special gl_VertexID/gl_InstanceID buffers */
1383
1384         panfrost_vertex_id(ctx->padded_count, &attrs[k]);
1385         so->hw[PAN_VERTEX_ID].index = k++;
1386         panfrost_instance_id(ctx->padded_count, &attrs[k]);
1387         so->hw[PAN_INSTANCE_ID].index = k++;
1388
1389         /* Upload whatever we emitted and go */
1390
1391         vertex_postfix->attributes = panfrost_upload_transient(batch, attrs,
1392                                                            k * sizeof(*attrs));
1393 }
1394
1395 static mali_ptr
1396 panfrost_emit_varyings(struct panfrost_batch *batch, union mali_attr *slot,
1397                        unsigned stride, unsigned count)
1398 {
1399         /* Fill out the descriptor */
1400         slot->stride = stride;
1401         slot->size = stride * count;
1402         slot->shift = slot->extra_flags = 0;
1403
1404         struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1405                                                                         slot->size);
1406
1407         slot->elements = transfer.gpu | MALI_ATTR_LINEAR;
1408
1409         return transfer.gpu;
1410 }
1411
1412 static void
1413 panfrost_emit_streamout(struct panfrost_batch *batch, union mali_attr *slot,
1414                         unsigned stride, unsigned offset, unsigned count,
1415                         struct pipe_stream_output_target *target)
1416 {
1417         /* Fill out the descriptor */
1418         slot->stride = stride * 4;
1419         slot->shift = slot->extra_flags = 0;
1420
1421         unsigned max_size = target->buffer_size;
1422         unsigned expected_size = slot->stride * count;
1423
1424         slot->size = MIN2(max_size, expected_size);
1425
1426         /* Grab the BO and bind it to the batch */
1427         struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1428
1429         /* Varyings are WRITE from the perspective of the VERTEX but READ from
1430          * the perspective of the TILER and FRAGMENT.
1431          */
1432         panfrost_batch_add_bo(batch, bo,
1433                               PAN_BO_ACCESS_SHARED |
1434                               PAN_BO_ACCESS_RW |
1435                               PAN_BO_ACCESS_VERTEX_TILER |
1436                               PAN_BO_ACCESS_FRAGMENT);
1437
1438         mali_ptr addr = bo->gpu + target->buffer_offset + (offset * slot->stride);
1439         slot->elements = addr;
1440 }
1441
1442 /* Given a shader and buffer indices, link varying metadata together */
1443
1444 static bool
1445 is_special_varying(gl_varying_slot loc)
1446 {
1447         switch (loc) {
1448         case VARYING_SLOT_POS:
1449         case VARYING_SLOT_PSIZ:
1450         case VARYING_SLOT_PNTC:
1451         case VARYING_SLOT_FACE:
1452                 return true;
1453         default:
1454                 return false;
1455         }
1456 }
1457
1458 static void
1459 panfrost_emit_varying_meta(void *outptr, struct panfrost_shader_state *ss,
1460                            signed general, signed gl_Position,
1461                            signed gl_PointSize, signed gl_PointCoord,
1462                            signed gl_FrontFacing)
1463 {
1464         struct mali_attr_meta *out = (struct mali_attr_meta *) outptr;
1465
1466         for (unsigned i = 0; i < ss->varying_count; ++i) {
1467                 gl_varying_slot location = ss->varyings_loc[i];
1468                 int index = -1;
1469
1470                 switch (location) {
1471                 case VARYING_SLOT_POS:
1472                         index = gl_Position;
1473                         break;
1474                 case VARYING_SLOT_PSIZ:
1475                         index = gl_PointSize;
1476                         break;
1477                 case VARYING_SLOT_PNTC:
1478                         index = gl_PointCoord;
1479                         break;
1480                 case VARYING_SLOT_FACE:
1481                         index = gl_FrontFacing;
1482                         break;
1483                 default:
1484                         index = general;
1485                         break;
1486                 }
1487
1488                 assert(index >= 0);
1489                 out[i].index = index;
1490         }
1491 }
1492
1493 static bool
1494 has_point_coord(unsigned mask, gl_varying_slot loc)
1495 {
1496         if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1497                 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1498         else if (loc == VARYING_SLOT_PNTC)
1499                 return (mask & (1 << 8));
1500         else
1501                 return false;
1502 }
1503
1504 /* Helpers for manipulating stream out information so we can pack varyings
1505  * accordingly. Compute the src_offset for a given captured varying */
1506
1507 static struct pipe_stream_output *
1508 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1509 {
1510         for (unsigned i = 0; i < info->num_outputs; ++i) {
1511                 if (info->output[i].register_index == loc)
1512                         return &info->output[i];
1513         }
1514
1515         unreachable("Varying not captured");
1516 }
1517
1518 /* TODO: Integers */
1519 static enum mali_format
1520 pan_xfb_format(unsigned nr_components)
1521 {
1522         switch (nr_components) {
1523                 case 1: return MALI_R32F;
1524                 case 2: return MALI_RG32F;
1525                 case 3: return MALI_RGB32F;
1526                 case 4: return MALI_RGBA32F;
1527                 default: unreachable("Invalid format");
1528         }
1529 }
1530
1531 void
1532 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1533                                  unsigned vertex_count,
1534                                  struct mali_vertex_tiler_postfix *vertex_postfix,
1535                                  struct mali_vertex_tiler_postfix *tiler_postfix,
1536                                  union midgard_primitive_size *primitive_size)
1537 {
1538         /* Load the shaders */
1539         struct panfrost_context *ctx = batch->ctx;
1540         struct panfrost_shader_state *vs, *fs;
1541         unsigned int num_gen_varyings = 0;
1542         size_t vs_size, fs_size;
1543
1544         /* Allocate the varying descriptor */
1545
1546         vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1547         fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1548         vs_size = sizeof(struct mali_attr_meta) * vs->varying_count;
1549         fs_size = sizeof(struct mali_attr_meta) * fs->varying_count;
1550
1551         struct panfrost_transfer trans = panfrost_allocate_transient(batch,
1552                                                                      vs_size +
1553                                                                      fs_size);
1554
1555         struct pipe_stream_output_info *so = &vs->stream_output;
1556
1557         /* Check if this varying is linked by us. This is the case for
1558          * general-purpose, non-captured varyings. If it is, link it. If it's
1559          * not, use the provided stream out information to determine the
1560          * offset, since it was already linked for us. */
1561
1562         for (unsigned i = 0; i < vs->varying_count; i++) {
1563                 gl_varying_slot loc = vs->varyings_loc[i];
1564
1565                 bool special = is_special_varying(loc);
1566                 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1567
1568                 if (captured) {
1569                         struct pipe_stream_output *o = pan_get_so(so, loc);
1570
1571                         unsigned dst_offset = o->dst_offset * 4; /* dwords */
1572                         vs->varyings[i].src_offset = dst_offset;
1573                 } else if (!special) {
1574                         vs->varyings[i].src_offset = 16 * (num_gen_varyings++);
1575                 }
1576         }
1577
1578         /* Conversely, we need to set src_offset for the captured varyings.
1579          * Here, the layout is defined by the stream out info, not us */
1580
1581         /* Link up with fragment varyings */
1582         bool reads_point_coord = fs->reads_point_coord;
1583
1584         for (unsigned i = 0; i < fs->varying_count; i++) {
1585                 gl_varying_slot loc = fs->varyings_loc[i];
1586                 unsigned src_offset;
1587                 signed vs_idx = -1;
1588
1589                 /* Link up */
1590                 for (unsigned j = 0; j < vs->varying_count; ++j) {
1591                         if (vs->varyings_loc[j] == loc) {
1592                                 vs_idx = j;
1593                                 break;
1594                         }
1595                 }
1596
1597                 /* Either assign or reuse */
1598                 if (vs_idx >= 0)
1599                         src_offset = vs->varyings[vs_idx].src_offset;
1600                 else
1601                         src_offset = 16 * (num_gen_varyings++);
1602
1603                 fs->varyings[i].src_offset = src_offset;
1604
1605                 if (has_point_coord(fs->point_sprite_mask, loc))
1606                         reads_point_coord = true;
1607         }
1608
1609         memcpy(trans.cpu, vs->varyings, vs_size);
1610         memcpy(trans.cpu + vs_size, fs->varyings, fs_size);
1611
1612         union mali_attr varyings[PIPE_MAX_ATTRIBS] = {0};
1613
1614         /* Figure out how many streamout buffers could be bound */
1615         unsigned so_count = ctx->streamout.num_targets;
1616         for (unsigned i = 0; i < vs->varying_count; i++) {
1617                 gl_varying_slot loc = vs->varyings_loc[i];
1618
1619                 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1620                 if (!captured) continue;
1621
1622                 struct pipe_stream_output *o = pan_get_so(so, loc);
1623                 so_count = MAX2(so_count, o->output_buffer + 1);
1624         }
1625
1626         signed idx = so_count;
1627         signed general = idx++;
1628         signed gl_Position = idx++;
1629         signed gl_PointSize = vs->writes_point_size ? (idx++) : -1;
1630         signed gl_PointCoord = reads_point_coord ? (idx++) : -1;
1631         signed gl_FrontFacing = fs->reads_face ? (idx++) : -1;
1632         signed gl_FragCoord = fs->reads_frag_coord ? (idx++) : -1;
1633
1634         /* Emit the stream out buffers */
1635
1636         unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1637                                                            ctx->vertex_count);
1638
1639         for (unsigned i = 0; i < so_count; ++i) {
1640                 if (i < ctx->streamout.num_targets) {
1641                         panfrost_emit_streamout(batch, &varyings[i],
1642                                                 so->stride[i],
1643                                                 ctx->streamout.offsets[i],
1644                                                 out_count,
1645                                                 ctx->streamout.targets[i]);
1646                 } else {
1647                         /* Emit a dummy buffer */
1648                         panfrost_emit_varyings(batch, &varyings[i],
1649                                                so->stride[i] * 4,
1650                                                out_count);
1651
1652                         /* Clear the attribute type */
1653                         varyings[i].elements &= ~0xF;
1654                 }
1655         }
1656
1657         panfrost_emit_varyings(batch, &varyings[general],
1658                                num_gen_varyings * 16,
1659                                vertex_count);
1660
1661         mali_ptr varyings_p;
1662
1663         /* fp32 vec4 gl_Position */
1664         varyings_p = panfrost_emit_varyings(batch, &varyings[gl_Position],
1665                                             sizeof(float) * 4, vertex_count);
1666         tiler_postfix->position_varying = varyings_p;
1667
1668
1669         if (panfrost_writes_point_size(ctx)) {
1670                 varyings_p = panfrost_emit_varyings(batch,
1671                                                     &varyings[gl_PointSize],
1672                                                     2, vertex_count);
1673                 primitive_size->pointer = varyings_p;
1674         }
1675
1676         if (reads_point_coord)
1677                 varyings[gl_PointCoord].elements = MALI_VARYING_POINT_COORD;
1678
1679         if (fs->reads_face)
1680                 varyings[gl_FrontFacing].elements = MALI_VARYING_FRONT_FACING;
1681
1682         if (fs->reads_frag_coord)
1683                 varyings[gl_FragCoord].elements = MALI_VARYING_FRAG_COORD;
1684
1685         struct panfrost_device *device = pan_device(ctx->base.screen);
1686         assert(!(device->quirks & IS_BIFROST) || !(reads_point_coord || fs->reads_face || fs->reads_frag_coord));
1687
1688         /* Let's go ahead and link varying meta to the buffer in question, now
1689          * that that information is available. VARYING_SLOT_POS is mapped to
1690          * gl_FragCoord for fragment shaders but gl_Positionf or vertex shaders
1691          * */
1692
1693         panfrost_emit_varying_meta(trans.cpu, vs, general, gl_Position,
1694                                    gl_PointSize, gl_PointCoord,
1695                                    gl_FrontFacing);
1696
1697         panfrost_emit_varying_meta(trans.cpu + vs_size, fs, general,
1698                                    gl_FragCoord, gl_PointSize,
1699                                    gl_PointCoord, gl_FrontFacing);
1700
1701         /* Replace streamout */
1702
1703         struct mali_attr_meta *ovs = (struct mali_attr_meta *)trans.cpu;
1704         struct mali_attr_meta *ofs = ovs + vs->varying_count;
1705
1706         for (unsigned i = 0; i < vs->varying_count; i++) {
1707                 gl_varying_slot loc = vs->varyings_loc[i];
1708
1709                 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1710                 if (!captured)
1711                         continue;
1712
1713                 struct pipe_stream_output *o = pan_get_so(so, loc);
1714                 ovs[i].index = o->output_buffer;
1715
1716                 /* Set the type appropriately. TODO: Integer varyings XXX */
1717                 assert(o->stream == 0);
1718                 ovs[i].format = pan_xfb_format(o->num_components);
1719                 ovs[i].swizzle = panfrost_get_default_swizzle(o->num_components);
1720
1721                 /* Link to the fragment */
1722                 signed fs_idx = -1;
1723
1724                 /* Link up */
1725                 for (unsigned j = 0; j < fs->varying_count; ++j) {
1726                         if (fs->varyings_loc[j] == loc) {
1727                                 fs_idx = j;
1728                                 break;
1729                         }
1730                 }
1731
1732                 if (fs_idx >= 0) {
1733                         ofs[fs_idx].index = ovs[i].index;
1734                         ofs[fs_idx].format = ovs[i].format;
1735                         ofs[fs_idx].swizzle = ovs[i].swizzle;
1736                 }
1737         }
1738
1739         /* Replace point sprite */
1740         for (unsigned i = 0; i < fs->varying_count; i++) {
1741                 /* If we have a point sprite replacement, handle that here. We
1742                  * have to translate location first.  TODO: Flip y in shader.
1743                  * We're already keying ... just time crunch .. */
1744
1745                 if (has_point_coord(fs->point_sprite_mask,
1746                                     fs->varyings_loc[i])) {
1747                         ofs[i].index = gl_PointCoord;
1748
1749                         /* Swizzle out the z/w to 0/1 */
1750                         ofs[i].format = MALI_RG16F;
1751                         ofs[i].swizzle = panfrost_get_default_swizzle(2);
1752                 }
1753         }
1754
1755         /* Fix up unaligned addresses */
1756         for (unsigned i = 0; i < so_count; ++i) {
1757                 if (varyings[i].elements < MALI_RECORD_SPECIAL)
1758                         continue;
1759
1760                 unsigned align = (varyings[i].elements & 63);
1761
1762                 /* While we're at it, the SO buffers are linear */
1763
1764                 if (!align) {
1765                         varyings[i].elements |= MALI_ATTR_LINEAR;
1766                         continue;
1767                 }
1768
1769                 /* We need to adjust alignment */
1770                 varyings[i].elements &= ~63;
1771                 varyings[i].elements |= MALI_ATTR_LINEAR;
1772                 varyings[i].size += align;
1773
1774                 for (unsigned v = 0; v < vs->varying_count; ++v) {
1775                         if (ovs[v].index != i)
1776                                 continue;
1777
1778                         ovs[v].src_offset = vs->varyings[v].src_offset + align;
1779                 }
1780
1781                 for (unsigned f = 0; f < fs->varying_count; ++f) {
1782                         if (ofs[f].index != i)
1783                                 continue;
1784
1785                         ofs[f].src_offset = fs->varyings[f].src_offset + align;
1786                 }
1787         }
1788
1789         varyings_p = panfrost_upload_transient(batch, varyings,
1790                                                idx * sizeof(*varyings));
1791         vertex_postfix->varyings = varyings_p;
1792         tiler_postfix->varyings = varyings_p;
1793
1794         vertex_postfix->varying_meta = trans.gpu;
1795         tiler_postfix->varying_meta = trans.gpu + vs_size;
1796 }
1797
1798 void
1799 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
1800                                 struct mali_vertex_tiler_prefix *vertex_prefix,
1801                                 struct mali_vertex_tiler_postfix *vertex_postfix,
1802                                 struct mali_vertex_tiler_prefix *tiler_prefix,
1803                                 struct mali_vertex_tiler_postfix *tiler_postfix,
1804                                 union midgard_primitive_size *primitive_size)
1805 {
1806         struct panfrost_context *ctx = batch->ctx;
1807         struct panfrost_device *device = pan_device(ctx->base.screen);
1808         bool wallpapering = ctx->wallpaper_batch && batch->tiler_dep;
1809         struct bifrost_payload_vertex bifrost_vertex = {0,};
1810         struct bifrost_payload_tiler bifrost_tiler = {0,};
1811         struct midgard_payload_vertex_tiler midgard_vertex = {0,};
1812         struct midgard_payload_vertex_tiler midgard_tiler = {0,};
1813         void *vp, *tp;
1814         size_t vp_size, tp_size;
1815
1816         if (device->quirks & IS_BIFROST) {
1817                 bifrost_vertex.prefix = *vertex_prefix;
1818                 bifrost_vertex.postfix = *vertex_postfix;
1819                 vp = &bifrost_vertex;
1820                 vp_size = sizeof(bifrost_vertex);
1821
1822                 bifrost_tiler.prefix = *tiler_prefix;
1823                 bifrost_tiler.tiler.primitive_size = *primitive_size;
1824                 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
1825                 bifrost_tiler.postfix = *tiler_postfix;
1826                 tp = &bifrost_tiler;
1827                 tp_size = sizeof(bifrost_tiler);
1828         } else {
1829                 midgard_vertex.prefix = *vertex_prefix;
1830                 midgard_vertex.postfix = *vertex_postfix;
1831                 vp = &midgard_vertex;
1832                 vp_size = sizeof(midgard_vertex);
1833
1834                 midgard_tiler.prefix = *tiler_prefix;
1835                 midgard_tiler.postfix = *tiler_postfix;
1836                 midgard_tiler.primitive_size = *primitive_size;
1837                 tp = &midgard_tiler;
1838                 tp_size = sizeof(midgard_tiler);
1839         }
1840
1841         if (wallpapering) {
1842                 /* Inject in reverse order, with "predicted" job indices.
1843                  * THIS IS A HACK XXX */
1844                 panfrost_new_job(batch, JOB_TYPE_TILER, false,
1845                                  batch->job_index + 2, tp, tp_size, true);
1846                 panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
1847                                  vp, vp_size, true);
1848                 return;
1849         }
1850
1851         /* If rasterizer discard is enable, only submit the vertex */
1852
1853         bool rasterizer_discard = ctx->rasterizer &&
1854                                   ctx->rasterizer->base.rasterizer_discard;
1855
1856         unsigned vertex = panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
1857                                            vp, vp_size, false);
1858
1859         if (rasterizer_discard)
1860                 return;
1861
1862         panfrost_new_job(batch, JOB_TYPE_TILER, false, vertex, tp, tp_size,
1863                          false);
1864 }
1865
1866 /* TODO: stop hardcoding this */
1867 mali_ptr
1868 panfrost_emit_sample_locations(struct panfrost_batch *batch)
1869 {
1870         uint16_t locations[] = {
1871             128, 128,
1872             0, 256,
1873             0, 256,
1874             0, 256,
1875             0, 256,
1876             0, 256,
1877             0, 256,
1878             0, 256,
1879             0, 256,
1880             0, 256,
1881             0, 256,
1882             0, 256,
1883             0, 256,
1884             0, 256,
1885             0, 256,
1886             0, 256,
1887             0, 256,
1888             0, 256,
1889             0, 256,
1890             0, 256,
1891             0, 256,
1892             0, 256,
1893             0, 256,
1894             0, 256,
1895             0, 256,
1896             0, 256,
1897             0, 256,
1898             0, 256,
1899             0, 256,
1900             0, 256,
1901             0, 256,
1902             0, 256,
1903             128, 128,
1904             0, 0,
1905             0, 0,
1906             0, 0,
1907             0, 0,
1908             0, 0,
1909             0, 0,
1910             0, 0,
1911             0, 0,
1912             0, 0,
1913             0, 0,
1914             0, 0,
1915             0, 0,
1916             0, 0,
1917             0, 0,
1918             0, 0,
1919         };
1920
1921         return panfrost_upload_transient(batch, locations, 96 * sizeof(uint16_t));
1922 }