src/gallium/drivers/panfrost/pan_cmdstream.c

   1 /*
   2  * Copyright (C) 2018 Alyssa Rosenzweig
   3  * Copyright (C) 2020 Collabora Ltd.
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   9  * and/or sell copies of the Software, and to permit persons to whom the
  10  * Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice (including the next
  13  * paragraph) shall be included in all copies or substantial portions of the
  14  * Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22  * SOFTWARE.
  23  */
  24
  25 #include "util/macros.h"
  26 #include "util/u_prim.h"
  27 #include "util/u_vbuf.h"
  28
  29 #include "panfrost-quirks.h"
  30
  31 #include "pan_allocate.h"
  32 #include "pan_bo.h"
  33 #include "pan_cmdstream.h"
  34 #include "pan_context.h"
  35 #include "pan_job.h"
  36
  37 /* If a BO is accessed for a particular shader stage, will it be in the primary
  38  * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
  39  * fragment will be primary, e.g. compute jobs will be considered
  40  * "vertex/tiler" by analogy */
  41
  42 static inline uint32_t
  43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
  44 {
  45         assert(stage == PIPE_SHADER_FRAGMENT ||
  46                stage == PIPE_SHADER_VERTEX ||
  47                stage == PIPE_SHADER_COMPUTE);
  48
  49         return stage == PIPE_SHADER_FRAGMENT ?
  50                PAN_BO_ACCESS_FRAGMENT :
  51                PAN_BO_ACCESS_VERTEX_TILER;
  52 }
  53
  54 static void
  55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
  56                                struct mali_vertex_tiler_postfix *postfix)
  57 {
  58         struct panfrost_device *dev = pan_device(ctx->base.screen);
  59         struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
  60
  61         unsigned shift = panfrost_get_stack_shift(batch->stack_size);
  62         struct mali_shared_memory shared = {
  63                 .stack_shift = shift,
  64                 .scratchpad = panfrost_batch_get_scratchpad(batch, shift, dev->thread_tls_alloc, dev->core_count)->gpu,
  65                 .shared_workgroup_count = ~0,
  66         };
  67         postfix->shared_memory = panfrost_upload_transient(batch, &shared, sizeof(shared));
  68 }
  69
  70 static void
  71 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
  72                                struct mali_vertex_tiler_postfix *postfix)
  73 {
  74         struct panfrost_device *dev = pan_device(ctx->base.screen);
  75         struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
  76
  77         /* If we haven't, reserve space for the framebuffer */
  78
  79         if (!batch->framebuffer.gpu) {
  80                 unsigned size = (dev->quirks & MIDGARD_SFBD) ?
  81                         sizeof(struct mali_single_framebuffer) :
  82                         sizeof(struct mali_framebuffer);
  83
  84                 batch->framebuffer = panfrost_allocate_transient(batch, size);
  85
  86                 /* Tag the pointer */
  87                 if (!(dev->quirks & MIDGARD_SFBD))
  88                         batch->framebuffer.gpu |= MALI_MFBD;
  89         }
  90
  91         postfix->shared_memory = batch->framebuffer.gpu;
  92 }
  93
  94 static void
  95 panfrost_vt_update_rasterizer(struct panfrost_context *ctx,
  96                               struct mali_vertex_tiler_prefix *prefix,
  97                               struct mali_vertex_tiler_postfix *postfix)
  98 {
  99         struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
 100
 101         postfix->gl_enables |= 0x7;
 102         SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
 103                 rasterizer && rasterizer->base.front_ccw);
 104         SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
 105                 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_FRONT));
 106         SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
 107                 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_BACK));
 108         SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
 109                 rasterizer && rasterizer->base.flatshade_first);
 110 }
 111
 112 void
 113 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
 114                                   struct mali_vertex_tiler_prefix *prefix,
 115                                   union midgard_primitive_size *primitive_size)
 116 {
 117         struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
 118
 119         if (!panfrost_writes_point_size(ctx)) {
 120                 bool points = prefix->draw_mode == MALI_POINTS;
 121                 float val = 0.0f;
 122
 123                 if (rasterizer)
 124                         val = points ?
 125                               rasterizer->base.point_size :
 126                               rasterizer->base.line_width;
 127
 128                 primitive_size->constant = val;
 129         }
 130 }
 131
 132 static void
 133 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
 134                                    struct mali_vertex_tiler_postfix *postfix)
 135 {
 136         SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
 137         if (ctx->occlusion_query)
 138                 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
 139         else
 140                 postfix->occlusion_counter = 0;
 141 }
 142
 143 void
 144 panfrost_vt_init(struct panfrost_context *ctx,
 145                  enum pipe_shader_type stage,
 146                  struct mali_vertex_tiler_prefix *prefix,
 147                  struct mali_vertex_tiler_postfix *postfix)
 148 {
 149         struct panfrost_device *device = pan_device(ctx->base.screen);
 150
 151         if (!ctx->shader[stage])
 152                 return;
 153
 154         memset(prefix, 0, sizeof(*prefix));
 155         memset(postfix, 0, sizeof(*postfix));
 156
 157         if (device->quirks & IS_BIFROST) {
 158                 postfix->gl_enables = 0x2;
 159                 panfrost_vt_emit_shared_memory(ctx, postfix);
 160         } else {
 161                 postfix->gl_enables = 0x6;
 162                 panfrost_vt_attach_framebuffer(ctx, postfix);
 163         }
 164
 165         if (stage == PIPE_SHADER_FRAGMENT) {
 166                 panfrost_vt_update_occlusion_query(ctx, postfix);
 167                 panfrost_vt_update_rasterizer(ctx, prefix, postfix);
 168         }
 169 }
 170
 171 static unsigned
 172 panfrost_translate_index_size(unsigned size)
 173 {
 174         switch (size) {
 175         case 1:
 176                 return MALI_DRAW_INDEXED_UINT8;
 177
 178         case 2:
 179                 return MALI_DRAW_INDEXED_UINT16;
 180
 181         case 4:
 182                 return MALI_DRAW_INDEXED_UINT32;
 183
 184         default:
 185                 unreachable("Invalid index size");
 186         }
 187 }
 188
 189 /* Gets a GPU address for the associated index buffer. Only gauranteed to be
 190  * good for the duration of the draw (transient), could last longer. Also get
 191  * the bounds on the index buffer for the range accessed by the draw. We do
 192  * these operations together because there are natural optimizations which
 193  * require them to be together. */
 194
 195 static mali_ptr
 196 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
 197                                   const struct pipe_draw_info *info,
 198                                   unsigned *min_index, unsigned *max_index)
 199 {
 200         struct panfrost_resource *rsrc = pan_resource(info->index.resource);
 201         struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
 202         off_t offset = info->start * info->index_size;
 203         bool needs_indices = true;
 204         mali_ptr out = 0;
 205
 206         if (info->max_index != ~0u) {
 207                 *min_index = info->min_index;
 208                 *max_index = info->max_index;
 209                 needs_indices = false;
 210         }
 211
 212         if (!info->has_user_indices) {
 213                 /* Only resources can be directly mapped */
 214                 panfrost_batch_add_bo(batch, rsrc->bo,
 215                                       PAN_BO_ACCESS_SHARED |
 216                                       PAN_BO_ACCESS_READ |
 217                                       PAN_BO_ACCESS_VERTEX_TILER);
 218                 out = rsrc->bo->gpu + offset;
 219
 220                 /* Check the cache */
 221                 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
 222                                                            info->start,
 223                                                            info->count,
 224                                                            min_index,
 225                                                            max_index);
 226         } else {
 227                 /* Otherwise, we need to upload to transient memory */
 228                 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
 229                 out = panfrost_upload_transient(batch, ibuf8 + offset,
 230                                                 info->count *
 231                                                 info->index_size);
 232         }
 233
 234         if (needs_indices) {
 235                 /* Fallback */
 236                 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
 237
 238                 if (!info->has_user_indices)
 239                         panfrost_minmax_cache_add(rsrc->index_cache,
 240                                                   info->start, info->count,
 241                                                   *min_index, *max_index);
 242         }
 243
 244         return out;
 245 }
 246
 247 void
 248 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
 249                           const struct pipe_draw_info *info,
 250                           enum mali_draw_mode draw_mode,
 251                           struct mali_vertex_tiler_postfix *vertex_postfix,
 252                           struct mali_vertex_tiler_prefix *tiler_prefix,
 253                           struct mali_vertex_tiler_postfix *tiler_postfix,
 254                           unsigned *vertex_count,
 255                           unsigned *padded_count)
 256 {
 257         tiler_prefix->draw_mode = draw_mode;
 258
 259         unsigned draw_flags = 0;
 260
 261         if (panfrost_writes_point_size(ctx))
 262                 draw_flags |= MALI_DRAW_VARYING_SIZE;
 263
 264         if (info->primitive_restart)
 265                 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
 266
 267         /* These doesn't make much sense */
 268
 269         draw_flags |= 0x3000;
 270
 271         if (info->index_size) {
 272                 unsigned min_index = 0, max_index = 0;
 273
 274                 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
 275                                                                        info,
 276                                                                        &min_index,
 277                                                                        &max_index);
 278
 279                 /* Use the corresponding values */
 280                 *vertex_count = max_index - min_index + 1;
 281                 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
 282                 tiler_prefix->offset_bias_correction = -min_index;
 283                 tiler_prefix->index_count = MALI_POSITIVE(info->count);
 284                 draw_flags |= panfrost_translate_index_size(info->index_size);
 285         } else {
 286                 tiler_prefix->indices = 0;
 287                 *vertex_count = ctx->vertex_count;
 288                 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
 289                 tiler_prefix->offset_bias_correction = 0;
 290                 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
 291         }
 292
 293         tiler_prefix->unknown_draw = draw_flags;
 294
 295         /* Encode the padded vertex count */
 296
 297         if (info->instance_count > 1) {
 298                 *padded_count = panfrost_padded_vertex_count(*vertex_count);
 299
 300                 unsigned shift = __builtin_ctz(ctx->padded_count);
 301                 unsigned k = ctx->padded_count >> (shift + 1);
 302
 303                 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
 304                 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
 305         } else {
 306                 *padded_count = *vertex_count;
 307
 308                 /* Reset instancing state */
 309                 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
 310                 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
 311         }
 312 }
 313
 314 static void
 315 panfrost_shader_meta_init(struct panfrost_context *ctx,
 316                           enum pipe_shader_type st,
 317                           struct mali_shader_meta *meta)
 318 {
 319         const struct panfrost_device *dev = pan_device(ctx->base.screen);
 320         struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
 321
 322         memset(meta, 0, sizeof(*meta));
 323         meta->shader = (ss->bo ? ss->bo->gpu : 0) | ss->first_tag;
 324         meta->attribute_count = ss->attribute_count;
 325         meta->varying_count = ss->varying_count;
 326         meta->texture_count = ctx->sampler_view_count[st];
 327         meta->sampler_count = ctx->sampler_count[st];
 328
 329         if (dev->quirks & IS_BIFROST) {
 330                 meta->bifrost1.unk1 = 0x800200;
 331                 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
 332                 meta->bifrost2.preload_regs = 0xC0;
 333                 meta->bifrost2.uniform_count = MIN2(ss->uniform_count,
 334                                                     ss->uniform_cutoff);
 335         } else {
 336                 meta->midgard1.uniform_count = MIN2(ss->uniform_count,
 337                                                     ss->uniform_cutoff);
 338                 meta->midgard1.work_count = ss->work_reg_count;
 339                 meta->midgard1.flags_hi = 0x8; /* XXX */
 340                 meta->midgard1.flags_lo = 0x220;
 341                 meta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
 342         }
 343
 344 }
 345
 346 static unsigned
 347 panfrost_translate_compare_func(enum pipe_compare_func in)
 348 {
 349         switch (in) {
 350         case PIPE_FUNC_NEVER:
 351                 return MALI_FUNC_NEVER;
 352
 353         case PIPE_FUNC_LESS:
 354                 return MALI_FUNC_LESS;
 355
 356         case PIPE_FUNC_EQUAL:
 357                 return MALI_FUNC_EQUAL;
 358
 359         case PIPE_FUNC_LEQUAL:
 360                 return MALI_FUNC_LEQUAL;
 361
 362         case PIPE_FUNC_GREATER:
 363                 return MALI_FUNC_GREATER;
 364
 365         case PIPE_FUNC_NOTEQUAL:
 366                 return MALI_FUNC_NOTEQUAL;
 367
 368         case PIPE_FUNC_GEQUAL:
 369                 return MALI_FUNC_GEQUAL;
 370
 371         case PIPE_FUNC_ALWAYS:
 372                 return MALI_FUNC_ALWAYS;
 373
 374         default:
 375                 unreachable("Invalid func");
 376         }
 377 }
 378
 379 static unsigned
 380 panfrost_translate_stencil_op(enum pipe_stencil_op in)
 381 {
 382         switch (in) {
 383         case PIPE_STENCIL_OP_KEEP:
 384                 return MALI_STENCIL_KEEP;
 385
 386         case PIPE_STENCIL_OP_ZERO:
 387                 return MALI_STENCIL_ZERO;
 388
 389         case PIPE_STENCIL_OP_REPLACE:
 390                return MALI_STENCIL_REPLACE;
 391
 392         case PIPE_STENCIL_OP_INCR:
 393                 return MALI_STENCIL_INCR;
 394
 395         case PIPE_STENCIL_OP_DECR:
 396                 return MALI_STENCIL_DECR;
 397
 398         case PIPE_STENCIL_OP_INCR_WRAP:
 399                 return MALI_STENCIL_INCR_WRAP;
 400
 401         case PIPE_STENCIL_OP_DECR_WRAP:
 402                 return MALI_STENCIL_DECR_WRAP;
 403
 404         case PIPE_STENCIL_OP_INVERT:
 405                 return MALI_STENCIL_INVERT;
 406
 407         default:
 408                 unreachable("Invalid stencil op");
 409         }
 410 }
 411
 412 static unsigned
 413 translate_tex_wrap(enum pipe_tex_wrap w)
 414 {
 415         switch (w) {
 416         case PIPE_TEX_WRAP_REPEAT:
 417                 return MALI_WRAP_REPEAT;
 418
 419         case PIPE_TEX_WRAP_CLAMP:
 420                 return MALI_WRAP_CLAMP;
 421
 422         case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
 423                 return MALI_WRAP_CLAMP_TO_EDGE;
 424
 425         case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
 426                 return MALI_WRAP_CLAMP_TO_BORDER;
 427
 428         case PIPE_TEX_WRAP_MIRROR_REPEAT:
 429                 return MALI_WRAP_MIRRORED_REPEAT;
 430
 431         case PIPE_TEX_WRAP_MIRROR_CLAMP:
 432                 return MALI_WRAP_MIRRORED_CLAMP;
 433
 434         case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
 435                 return MALI_WRAP_MIRRORED_CLAMP_TO_EDGE;
 436
 437         case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
 438                 return MALI_WRAP_MIRRORED_CLAMP_TO_BORDER;
 439
 440         default:
 441                 unreachable("Invalid wrap");
 442         }
 443 }
 444
 445 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
 446                                 struct mali_sampler_descriptor *hw)
 447 {
 448         unsigned func = panfrost_translate_compare_func(cso->compare_func);
 449         bool min_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
 450         bool mag_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
 451         bool mip_linear  = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR;
 452         unsigned min_filter = min_nearest ? MALI_SAMP_MIN_NEAREST : 0;
 453         unsigned mag_filter = mag_nearest ? MALI_SAMP_MAG_NEAREST : 0;
 454         unsigned mip_filter = mip_linear  ?
 455                               (MALI_SAMP_MIP_LINEAR_1 | MALI_SAMP_MIP_LINEAR_2) : 0;
 456         unsigned normalized = cso->normalized_coords ? MALI_SAMP_NORM_COORDS : 0;
 457
 458         *hw = (struct mali_sampler_descriptor) {
 459                 .filter_mode = min_filter | mag_filter | mip_filter |
 460                                normalized,
 461                 .wrap_s = translate_tex_wrap(cso->wrap_s),
 462                 .wrap_t = translate_tex_wrap(cso->wrap_t),
 463                 .wrap_r = translate_tex_wrap(cso->wrap_r),
 464                 .compare_func = panfrost_flip_compare_func(func),
 465                 .border_color = {
 466                         cso->border_color.f[0],
 467                         cso->border_color.f[1],
 468                         cso->border_color.f[2],
 469                         cso->border_color.f[3]
 470                 },
 471                 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
 472                 .max_lod = FIXED_16(cso->max_lod, false),
 473                 .lod_bias = FIXED_16(cso->lod_bias, true), /* can be negative */
 474                 .seamless_cube_map = cso->seamless_cube_map,
 475         };
 476
 477         /* If necessary, we disable mipmapping in the sampler descriptor by
 478          * clamping the LOD as tight as possible (from 0 to epsilon,
 479          * essentially -- remember these are fixed point numbers, so
 480          * epsilon=1/256) */
 481
 482         if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
 483                 hw->max_lod = hw->min_lod + 1;
 484 }
 485
 486 static void
 487 panfrost_make_stencil_state(const struct pipe_stencil_state *in,
 488                             struct mali_stencil_test *out)
 489 {
 490         out->ref = 0; /* Gallium gets it from elsewhere */
 491
 492         out->mask = in->valuemask;
 493         out->func = panfrost_translate_compare_func(in->func);
 494         out->sfail = panfrost_translate_stencil_op(in->fail_op);
 495         out->dpfail = panfrost_translate_stencil_op(in->zfail_op);
 496         out->dppass = panfrost_translate_stencil_op(in->zpass_op);
 497 }
 498
 499 static void
 500 panfrost_frag_meta_rasterizer_update(struct panfrost_context *ctx,
 501                                      struct mali_shader_meta *fragmeta)
 502 {
 503         if (!ctx->rasterizer) {
 504                 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, true);
 505                 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, false);
 506                 fragmeta->depth_units = 0.0f;
 507                 fragmeta->depth_factor = 0.0f;
 508                 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, false);
 509                 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, false);
 510                 return;
 511         }
 512
 513         bool msaa = ctx->rasterizer->base.multisample;
 514
 515         /* TODO: Sample size */
 516         SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
 517         SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
 518         fragmeta->depth_units = ctx->rasterizer->base.offset_units * 2.0f;
 519         fragmeta->depth_factor = ctx->rasterizer->base.offset_scale;
 520
 521         /* XXX: Which bit is which? Does this maybe allow offseting not-tri? */
 522
 523         SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A,
 524                 ctx->rasterizer->base.offset_tri);
 525         SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B,
 526                 ctx->rasterizer->base.offset_tri);
 527 }
 528
 529 static void
 530 panfrost_frag_meta_zsa_update(struct panfrost_context *ctx,
 531                               struct mali_shader_meta *fragmeta)
 532 {
 533         const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
 534         int zfunc = PIPE_FUNC_ALWAYS;
 535
 536         if (!zsa) {
 537                 struct pipe_stencil_state default_stencil = {
 538                         .enabled = 0,
 539                         .func = PIPE_FUNC_ALWAYS,
 540                         .fail_op = MALI_STENCIL_KEEP,
 541                         .zfail_op = MALI_STENCIL_KEEP,
 542                         .zpass_op = MALI_STENCIL_KEEP,
 543                         .writemask = 0xFF,
 544                         .valuemask = 0xFF
 545                 };
 546
 547                 panfrost_make_stencil_state(&default_stencil,
 548                                             &fragmeta->stencil_front);
 549                 fragmeta->stencil_mask_front = default_stencil.writemask;
 550                 fragmeta->stencil_back = fragmeta->stencil_front;
 551                 fragmeta->stencil_mask_back = default_stencil.writemask;
 552                 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST, false);
 553                 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK, false);
 554         } else {
 555                 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
 556                         zsa->stencil[0].enabled);
 557                 panfrost_make_stencil_state(&zsa->stencil[0],
 558                                             &fragmeta->stencil_front);
 559                 fragmeta->stencil_mask_front = zsa->stencil[0].writemask;
 560                 fragmeta->stencil_front.ref = ctx->stencil_ref.ref_value[0];
 561
 562                 /* If back-stencil is not enabled, use the front values */
 563
 564                 if (zsa->stencil[1].enabled) {
 565                         panfrost_make_stencil_state(&zsa->stencil[1],
 566                                                     &fragmeta->stencil_back);
 567                         fragmeta->stencil_mask_back = zsa->stencil[1].writemask;
 568                         fragmeta->stencil_back.ref = ctx->stencil_ref.ref_value[1];
 569                 } else {
 570                         fragmeta->stencil_back = fragmeta->stencil_front;
 571                         fragmeta->stencil_mask_back = fragmeta->stencil_mask_front;
 572                         fragmeta->stencil_back.ref = fragmeta->stencil_front.ref;
 573                 }
 574
 575                 if (zsa->depth.enabled)
 576                         zfunc = zsa->depth.func;
 577
 578                 /* Depth state (TODO: Refactor) */
 579
 580                 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
 581                         zsa->depth.writemask);
 582         }
 583
 584         fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
 585         fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(zfunc));
 586 }
 587
 588 static void
 589 panfrost_frag_meta_blend_update(struct panfrost_context *ctx,
 590                                 struct mali_shader_meta *fragmeta,
 591                                 struct midgard_blend_rt *rts)
 592 {
 593         const struct panfrost_device *dev = pan_device(ctx->base.screen);
 594
 595         SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
 596                 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
 597                 !ctx->blend->base.dither);
 598
 599         /* Get blending setup */
 600         unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
 601
 602         struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
 603         unsigned shader_offset = 0;
 604         struct panfrost_bo *shader_bo = NULL;
 605
 606         for (unsigned c = 0; c < rt_count; ++c)
 607                 blend[c] = panfrost_get_blend_for_context(ctx, c, &shader_bo,
 608                                                           &shader_offset);
 609
 610          /* If there is a blend shader, work registers are shared. XXX: opt */
 611
 612         for (unsigned c = 0; c < rt_count; ++c) {
 613                 if (blend[c].is_shader)
 614                         fragmeta->midgard1.work_count = 16;
 615         }
 616
 617         /* Even on MFBD, the shader descriptor gets blend shaders. It's *also*
 618          * copied to the blend_meta appended (by convention), but this is the
 619          * field actually read by the hardware. (Or maybe both are read...?).
 620          * Specify the last RTi with a blend shader. */
 621
 622         fragmeta->blend.shader = 0;
 623
 624         for (signed rt = (rt_count - 1); rt >= 0; --rt) {
 625                 if (!blend[rt].is_shader)
 626                         continue;
 627
 628                 fragmeta->blend.shader = blend[rt].shader.gpu |
 629                                          blend[rt].shader.first_tag;
 630                 break;
 631         }
 632
 633         if (dev->quirks & MIDGARD_SFBD) {
 634                 /* When only a single render target platform is used, the blend
 635                  * information is inside the shader meta itself. We additionally
 636                  * need to signal CAN_DISCARD for nontrivial blend modes (so
 637                  * we're able to read back the destination buffer) */
 638
 639                 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
 640                         blend[0].is_shader);
 641
 642                 if (!blend[0].is_shader) {
 643                         fragmeta->blend.equation = *blend[0].equation.equation;
 644                         fragmeta->blend.constant = blend[0].equation.constant;
 645                 }
 646
 647                 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
 648                         !blend[0].no_blending);
 649                 return;
 650         }
 651
 652         /* Additional blend descriptor tacked on for jobs using MFBD */
 653
 654         for (unsigned i = 0; i < rt_count; ++i) {
 655                 rts[i].flags = 0x200;
 656
 657                 bool is_srgb = (ctx->pipe_framebuffer.nr_cbufs > i) &&
 658                                (ctx->pipe_framebuffer.cbufs[i]) &&
 659                                util_format_is_srgb(ctx->pipe_framebuffer.cbufs[i]->format);
 660
 661                 SET_BIT(rts[i].flags, MALI_BLEND_MRT_SHADER, blend[i].is_shader);
 662                 SET_BIT(rts[i].flags, MALI_BLEND_LOAD_TIB, !blend[i].no_blending);
 663                 SET_BIT(rts[i].flags, MALI_BLEND_SRGB, is_srgb);
 664                 SET_BIT(rts[i].flags, MALI_BLEND_NO_DITHER, !ctx->blend->base.dither);
 665
 666                 if (blend[i].is_shader) {
 667                         rts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
 668                 } else {
 669                         rts[i].blend.equation = *blend[i].equation.equation;
 670                         rts[i].blend.constant = blend[i].equation.constant;
 671                 }
 672         }
 673 }
 674
 675 static void
 676 panfrost_frag_shader_meta_init(struct panfrost_context *ctx,
 677                                struct mali_shader_meta *fragmeta,
 678                                struct midgard_blend_rt *rts)
 679 {
 680         const struct panfrost_device *dev = pan_device(ctx->base.screen);
 681         struct panfrost_shader_state *fs;
 682
 683         fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
 684
 685         fragmeta->alpha_coverage = ~MALI_ALPHA_COVERAGE(0.000000);
 686         fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x3010;
 687         fragmeta->unknown2_4 = 0x4e0;
 688
 689         /* unknown2_4 has 0x10 bit set on T6XX and T720. We don't know why this
 690          * is required (independent of 32-bit/64-bit descriptors), or why it's
 691          * not used on later GPU revisions. Otherwise, all shader jobs fault on
 692          * these earlier chips (perhaps this is a chicken bit of some kind).
 693          * More investigation is needed. */
 694
 695         SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
 696
 697         /* Depending on whether it's legal to in the given shader, we try to
 698          * enable early-z testing (or forward-pixel kill?) */
 699
 700         SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
 701                 !fs->can_discard && !fs->writes_depth);
 702
 703         /* Add the writes Z/S flags if needed. */
 704         SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
 705         SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
 706
 707         /* Any time texturing is used, derivatives are implicitly calculated,
 708          * so we need to enable helper invocations */
 709
 710         SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
 711                 fs->helper_invocations);
 712
 713         /* CAN_DISCARD should be set if the fragment shader possibly contains a
 714          * 'discard' instruction. It is likely this is related to optimizations
 715          * related to forward-pixel kill, as per "Mali Performance 3: Is
 716          * EGL_BUFFER_PRESERVED a good thing?" by Peter Harris */
 717
 718         SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD, fs->can_discard);
 719         SET_BIT(fragmeta->midgard1.flags_lo, 0x400, fs->can_discard);
 720
 721         panfrost_frag_meta_rasterizer_update(ctx, fragmeta);
 722         panfrost_frag_meta_zsa_update(ctx, fragmeta);
 723         panfrost_frag_meta_blend_update(ctx, fragmeta, rts);
 724 }
 725
 726 void
 727 panfrost_emit_shader_meta(struct panfrost_batch *batch,
 728                           enum pipe_shader_type st,
 729                           struct mali_vertex_tiler_postfix *postfix)
 730 {
 731         struct panfrost_context *ctx = batch->ctx;
 732         struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
 733
 734         if (!ss) {
 735                 postfix->shader = 0;
 736                 return;
 737         }
 738
 739         struct mali_shader_meta meta;
 740
 741         panfrost_shader_meta_init(ctx, st, &meta);
 742
 743         /* Add the shader BO to the batch. */
 744         panfrost_batch_add_bo(batch, ss->bo,
 745                               PAN_BO_ACCESS_PRIVATE |
 746                               PAN_BO_ACCESS_READ |
 747                               panfrost_bo_access_for_stage(st));
 748
 749         mali_ptr shader_ptr;
 750
 751         if (st == PIPE_SHADER_FRAGMENT) {
 752                 struct panfrost_device *dev = pan_device(ctx->base.screen);
 753                 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
 754                 size_t desc_size = sizeof(meta);
 755                 struct midgard_blend_rt rts[4];
 756                 struct panfrost_transfer xfer;
 757
 758                 assert(rt_count <= ARRAY_SIZE(rts));
 759
 760                 panfrost_frag_shader_meta_init(ctx, &meta, rts);
 761
 762                 if (!(dev->quirks & MIDGARD_SFBD))
 763                         desc_size += sizeof(*rts) * rt_count;
 764
 765                 xfer = panfrost_allocate_transient(batch, desc_size);
 766
 767                 memcpy(xfer.cpu, &meta, sizeof(meta));
 768                 memcpy(xfer.cpu + sizeof(meta), rts, sizeof(*rts) * rt_count);
 769
 770                 shader_ptr = xfer.gpu;
 771         } else {
 772                 shader_ptr = panfrost_upload_transient(batch, &meta,
 773                                                        sizeof(meta));
 774         }
 775
 776         postfix->shader = shader_ptr;
 777 }
 778
 779 static void
 780 panfrost_mali_viewport_init(struct panfrost_context *ctx,
 781                             struct mali_viewport *mvp)
 782 {
 783         const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
 784
 785         /* Clip bounds are encoded as floats. The viewport itself is encoded as
 786          * (somewhat) asymmetric ints. */
 787
 788         const struct pipe_scissor_state *ss = &ctx->scissor;
 789
 790         memset(mvp, 0, sizeof(*mvp));
 791
 792         /* By default, do no viewport clipping, i.e. clip to (-inf, inf) in
 793          * each direction. Clipping to the viewport in theory should work, but
 794          * in practice causes issues when we're not explicitly trying to
 795          * scissor */
 796
 797         *mvp = (struct mali_viewport) {
 798                 .clip_minx = -INFINITY,
 799                 .clip_miny = -INFINITY,
 800                 .clip_maxx = INFINITY,
 801                 .clip_maxy = INFINITY,
 802         };
 803
 804         /* Always scissor to the viewport by default. */
 805         float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
 806         float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
 807
 808         float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
 809         float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
 810
 811         float minz = (vp->translate[2] - fabsf(vp->scale[2]));
 812         float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
 813
 814         /* Apply the scissor test */
 815
 816         unsigned minx, miny, maxx, maxy;
 817
 818         if (ss && ctx->rasterizer && ctx->rasterizer->base.scissor) {
 819                 minx = MAX2(ss->minx, vp_minx);
 820                 miny = MAX2(ss->miny, vp_miny);
 821                 maxx = MIN2(ss->maxx, vp_maxx);
 822                 maxy = MIN2(ss->maxy, vp_maxy);
 823         } else {
 824                 minx = vp_minx;
 825                 miny = vp_miny;
 826                 maxx = vp_maxx;
 827                 maxy = vp_maxy;
 828         }
 829
 830         /* Hardware needs the min/max to be strictly ordered, so flip if we
 831          * need to. The viewport transformation in the vertex shader will
 832          * handle the negatives if we don't */
 833
 834         if (miny > maxy) {
 835                 unsigned temp = miny;
 836                 miny = maxy;
 837                 maxy = temp;
 838         }
 839
 840         if (minx > maxx) {
 841                 unsigned temp = minx;
 842                 minx = maxx;
 843                 maxx = temp;
 844         }
 845
 846         if (minz > maxz) {
 847                 float temp = minz;
 848                 minz = maxz;
 849                 maxz = temp;
 850         }
 851
 852         /* Clamp to the framebuffer size as a last check */
 853
 854         minx = MIN2(ctx->pipe_framebuffer.width, minx);
 855         maxx = MIN2(ctx->pipe_framebuffer.width, maxx);
 856
 857         miny = MIN2(ctx->pipe_framebuffer.height, miny);
 858         maxy = MIN2(ctx->pipe_framebuffer.height, maxy);
 859
 860         /* Upload */
 861
 862         mvp->viewport0[0] = minx;
 863         mvp->viewport1[0] = MALI_POSITIVE(maxx);
 864
 865         mvp->viewport0[1] = miny;
 866         mvp->viewport1[1] = MALI_POSITIVE(maxy);
 867
 868         mvp->clip_minz = minz;
 869         mvp->clip_maxz = maxz;
 870 }
 871
 872 void
 873 panfrost_emit_viewport(struct panfrost_batch *batch,
 874                        struct mali_vertex_tiler_postfix *tiler_postfix)
 875 {
 876         struct panfrost_context *ctx = batch->ctx;
 877         struct mali_viewport mvp;
 878
 879         panfrost_mali_viewport_init(batch->ctx,  &mvp);
 880
 881         /* Update the job, unless we're doing wallpapering (whose lack of
 882          * scissor we can ignore, since if we "miss" a tile of wallpaper, it'll
 883          * just... be faster :) */
 884
 885         if (!ctx->wallpaper_batch)
 886                 panfrost_batch_union_scissor(batch, mvp.viewport0[0],
 887                                              mvp.viewport0[1],
 888                                              mvp.viewport1[0] + 1,
 889                                              mvp.viewport1[1] + 1);
 890
 891         tiler_postfix->viewport = panfrost_upload_transient(batch, &mvp,
 892                                                             sizeof(mvp));
 893 }
 894
 895 static mali_ptr
 896 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
 897                                  enum pipe_shader_type st,
 898                                  struct panfrost_constant_buffer *buf,
 899                                  unsigned index)
 900 {
 901         struct pipe_constant_buffer *cb = &buf->cb[index];
 902         struct panfrost_resource *rsrc = pan_resource(cb->buffer);
 903
 904         if (rsrc) {
 905                 panfrost_batch_add_bo(batch, rsrc->bo,
 906                                       PAN_BO_ACCESS_SHARED |
 907                                       PAN_BO_ACCESS_READ |
 908                                       panfrost_bo_access_for_stage(st));
 909
 910                 /* Alignment gauranteed by
 911                  * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
 912                 return rsrc->bo->gpu + cb->buffer_offset;
 913         } else if (cb->user_buffer) {
 914                 return panfrost_upload_transient(batch,
 915                                                  cb->user_buffer +
 916                                                  cb->buffer_offset,
 917                                                  cb->buffer_size);
 918         } else {
 919                 unreachable("No constant buffer");
 920         }
 921 }
 922
 923 struct sysval_uniform {
 924         union {
 925                 float f[4];
 926                 int32_t i[4];
 927                 uint32_t u[4];
 928                 uint64_t du[2];
 929         };
 930 };
 931
 932 static void
 933 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
 934                                       struct sysval_uniform *uniform)
 935 {
 936         struct panfrost_context *ctx = batch->ctx;
 937         const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
 938
 939         uniform->f[0] = vp->scale[0];
 940         uniform->f[1] = vp->scale[1];
 941         uniform->f[2] = vp->scale[2];
 942 }
 943
 944 static void
 945 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
 946                                        struct sysval_uniform *uniform)
 947 {
 948         struct panfrost_context *ctx = batch->ctx;
 949         const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
 950
 951         uniform->f[0] = vp->translate[0];
 952         uniform->f[1] = vp->translate[1];
 953         uniform->f[2] = vp->translate[2];
 954 }
 955
 956 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
 957                                        enum pipe_shader_type st,
 958                                        unsigned int sysvalid,
 959                                        struct sysval_uniform *uniform)
 960 {
 961         struct panfrost_context *ctx = batch->ctx;
 962         unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
 963         unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
 964         bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
 965         struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
 966
 967         assert(dim);
 968         uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
 969
 970         if (dim > 1)
 971                 uniform->i[1] = u_minify(tex->texture->height0,
 972                                          tex->u.tex.first_level);
 973
 974         if (dim > 2)
 975                 uniform->i[2] = u_minify(tex->texture->depth0,
 976                                          tex->u.tex.first_level);
 977
 978         if (is_array)
 979                 uniform->i[dim] = tex->texture->array_size;
 980 }
 981
 982 static void
 983 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
 984                             enum pipe_shader_type st,
 985                             unsigned ssbo_id,
 986                             struct sysval_uniform *uniform)
 987 {
 988         struct panfrost_context *ctx = batch->ctx;
 989
 990         assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
 991         struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
 992
 993         /* Compute address */
 994         struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
 995
 996         panfrost_batch_add_bo(batch, bo,
 997                               PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
 998                               panfrost_bo_access_for_stage(st));
 999
1000         /* Upload address and size as sysval */
1001         uniform->du[0] = bo->gpu + sb.buffer_offset;
1002         uniform->u[2] = sb.buffer_size;
1003 }
1004
1005 static void
1006 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1007                                enum pipe_shader_type st,
1008                                unsigned samp_idx,
1009                                struct sysval_uniform *uniform)
1010 {
1011         struct panfrost_context *ctx = batch->ctx;
1012         struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1013
1014         uniform->f[0] = sampl->min_lod;
1015         uniform->f[1] = sampl->max_lod;
1016         uniform->f[2] = sampl->lod_bias;
1017
1018         /* Even without any errata, Midgard represents "no mipmapping" as
1019          * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1020          * panfrost_create_sampler_state which also explains our choice of
1021          * epsilon value (again to keep behaviour consistent) */
1022
1023         if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1024                 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1025 }
1026
1027 static void
1028 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1029                                        struct sysval_uniform *uniform)
1030 {
1031         struct panfrost_context *ctx = batch->ctx;
1032
1033         uniform->u[0] = ctx->compute_grid->grid[0];
1034         uniform->u[1] = ctx->compute_grid->grid[1];
1035         uniform->u[2] = ctx->compute_grid->grid[2];
1036 }
1037
1038 static void
1039 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1040                         struct panfrost_shader_state *ss,
1041                         enum pipe_shader_type st)
1042 {
1043         struct sysval_uniform *uniforms = (void *)buf;
1044
1045         for (unsigned i = 0; i < ss->sysval_count; ++i) {
1046                 int sysval = ss->sysval[i];
1047
1048                 switch (PAN_SYSVAL_TYPE(sysval)) {
1049                 case PAN_SYSVAL_VIEWPORT_SCALE:
1050                         panfrost_upload_viewport_scale_sysval(batch,
1051                                                               &uniforms[i]);
1052                         break;
1053                 case PAN_SYSVAL_VIEWPORT_OFFSET:
1054                         panfrost_upload_viewport_offset_sysval(batch,
1055                                                                &uniforms[i]);
1056                         break;
1057                 case PAN_SYSVAL_TEXTURE_SIZE:
1058                         panfrost_upload_txs_sysval(batch, st,
1059                                                    PAN_SYSVAL_ID(sysval),
1060                                                    &uniforms[i]);
1061                         break;
1062                 case PAN_SYSVAL_SSBO:
1063                         panfrost_upload_ssbo_sysval(batch, st,
1064                                                     PAN_SYSVAL_ID(sysval),
1065                                                     &uniforms[i]);
1066                         break;
1067                 case PAN_SYSVAL_NUM_WORK_GROUPS:
1068                         panfrost_upload_num_work_groups_sysval(batch,
1069                                                                &uniforms[i]);
1070                         break;
1071                 case PAN_SYSVAL_SAMPLER:
1072                         panfrost_upload_sampler_sysval(batch, st,
1073                                                        PAN_SYSVAL_ID(sysval),
1074                                                        &uniforms[i]);
1075                         break;
1076                 default:
1077                         assert(0);
1078                 }
1079         }
1080 }
1081
1082 static const void *
1083 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1084                                  unsigned index)
1085 {
1086         struct pipe_constant_buffer *cb = &buf->cb[index];
1087         struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1088
1089         if (rsrc)
1090                 return rsrc->bo->cpu;
1091         else if (cb->user_buffer)
1092                 return cb->user_buffer;
1093         else
1094                 unreachable("No constant buffer");
1095 }
1096
1097 void
1098 panfrost_emit_const_buf(struct panfrost_batch *batch,
1099                         enum pipe_shader_type stage,
1100                         struct mali_vertex_tiler_postfix *postfix)
1101 {
1102         struct panfrost_context *ctx = batch->ctx;
1103         struct panfrost_shader_variants *all = ctx->shader[stage];
1104
1105         if (!all)
1106                 return;
1107
1108         struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1109
1110         struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1111
1112         /* Uniforms are implicitly UBO #0 */
1113         bool has_uniforms = buf->enabled_mask & (1 << 0);
1114
1115         /* Allocate room for the sysval and the uniforms */
1116         size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1117         size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1118         size_t size = sys_size + uniform_size;
1119         struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1120                                                                         size);
1121
1122         /* Upload sysvals requested by the shader */
1123         panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1124
1125         /* Upload uniforms */
1126         if (has_uniforms && uniform_size) {
1127                 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1128                 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1129         }
1130
1131         /* Next up, attach UBOs. UBO #0 is the uniforms we just
1132          * uploaded */
1133
1134         unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1135         assert(ubo_count >= 1);
1136
1137         size_t sz = sizeof(uint64_t) * ubo_count;
1138         uint64_t ubos[PAN_MAX_CONST_BUFFERS];
1139         int uniform_count = ss->uniform_count;
1140
1141         /* Upload uniforms as a UBO */
1142         ubos[0] = MALI_MAKE_UBO(2 + uniform_count, transfer.gpu);
1143
1144         /* The rest are honest-to-goodness UBOs */
1145
1146         for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1147                 size_t usz = buf->cb[ubo].buffer_size;
1148                 bool enabled = buf->enabled_mask & (1 << ubo);
1149                 bool empty = usz == 0;
1150
1151                 if (!enabled || empty) {
1152                         /* Stub out disabled UBOs to catch accesses */
1153                         ubos[ubo] = MALI_MAKE_UBO(0, 0xDEAD0000);
1154                         continue;
1155                 }
1156
1157                 mali_ptr gpu = panfrost_map_constant_buffer_gpu(batch, stage,
1158                                                                 buf, ubo);
1159
1160                 unsigned bytes_per_field = 16;
1161                 unsigned aligned = ALIGN_POT(usz, bytes_per_field);
1162                 ubos[ubo] = MALI_MAKE_UBO(aligned / bytes_per_field, gpu);
1163         }
1164
1165         mali_ptr ubufs = panfrost_upload_transient(batch, ubos, sz);
1166         postfix->uniforms = transfer.gpu;
1167         postfix->uniform_buffers = ubufs;
1168
1169         buf->dirty_mask = 0;
1170 }
1171
1172 void
1173 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1174                             const struct pipe_grid_info *info,
1175                             struct midgard_payload_vertex_tiler *vtp)
1176 {
1177         struct panfrost_context *ctx = batch->ctx;
1178         struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1179         struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1180         unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1181                                                            128));
1182         unsigned shared_size = single_size * info->grid[0] * info->grid[1] *
1183                                info->grid[2] * 4;
1184         struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1185                                                                   shared_size,
1186                                                                   1);
1187
1188         struct mali_shared_memory shared = {
1189                 .shared_memory = bo->gpu,
1190                 .shared_workgroup_count =
1191                         util_logbase2_ceil(info->grid[0]) +
1192                         util_logbase2_ceil(info->grid[1]) +
1193                         util_logbase2_ceil(info->grid[2]),
1194                 .shared_unk1 = 0x2,
1195                 .shared_shift = util_logbase2(single_size) - 1
1196         };
1197
1198         vtp->postfix.shared_memory = panfrost_upload_transient(batch, &shared,
1199                                                                sizeof(shared));
1200 }
1201
1202 static mali_ptr
1203 panfrost_get_tex_desc(struct panfrost_batch *batch,
1204                       enum pipe_shader_type st,
1205                       struct panfrost_sampler_view *view)
1206 {
1207         if (!view)
1208                 return (mali_ptr) 0;
1209
1210         struct pipe_sampler_view *pview = &view->base;
1211         struct panfrost_resource *rsrc = pan_resource(pview->texture);
1212
1213         /* Add the BO to the job so it's retained until the job is done. */
1214
1215         panfrost_batch_add_bo(batch, rsrc->bo,
1216                               PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1217                               panfrost_bo_access_for_stage(st));
1218
1219         panfrost_batch_add_bo(batch, view->bo,
1220                               PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1221                               panfrost_bo_access_for_stage(st));
1222
1223         return view->bo->gpu;
1224 }
1225
1226 void
1227 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1228                                   enum pipe_shader_type stage,
1229                                   struct mali_vertex_tiler_postfix *postfix)
1230 {
1231         struct panfrost_context *ctx = batch->ctx;
1232
1233         if (!ctx->sampler_view_count[stage])
1234                 return;
1235
1236         uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1237
1238          for (int i = 0; i < ctx->sampler_view_count[stage]; ++i)
1239                 trampolines[i] = panfrost_get_tex_desc(batch, stage,
1240                                                        ctx->sampler_views[stage][i]);
1241
1242          postfix->texture_trampoline = panfrost_upload_transient(batch,
1243                                                                  trampolines,
1244                                                                  sizeof(uint64_t) *
1245                                                                  ctx->sampler_view_count[stage]);
1246 }
1247
1248 void
1249 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1250                                   enum pipe_shader_type stage,
1251                                   struct mali_vertex_tiler_postfix *postfix)
1252 {
1253         struct panfrost_context *ctx = batch->ctx;
1254
1255         if (!ctx->sampler_count[stage])
1256                 return;
1257
1258         size_t desc_size = sizeof(struct mali_sampler_descriptor);
1259         size_t transfer_size = desc_size * ctx->sampler_count[stage];
1260         struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1261                                                                         transfer_size);
1262         struct mali_sampler_descriptor *desc = (struct mali_sampler_descriptor *)transfer.cpu;
1263
1264         for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1265                 desc[i] = ctx->samplers[stage][i]->hw;
1266
1267         postfix->sampler_descriptor = transfer.gpu;
1268 }
1269
1270 void
1271 panfrost_emit_vertex_attr_meta(struct panfrost_batch *batch,
1272                                struct mali_vertex_tiler_postfix *vertex_postfix)
1273 {
1274         struct panfrost_context *ctx = batch->ctx;
1275
1276         if (!ctx->vertex)
1277                 return;
1278
1279         struct panfrost_vertex_state *so = ctx->vertex;
1280
1281         panfrost_vertex_state_upd_attr_offs(ctx, vertex_postfix);
1282         vertex_postfix->attribute_meta = panfrost_upload_transient(batch, so->hw,
1283                                                                sizeof(*so->hw) *
1284                                                                PAN_MAX_ATTRIBUTE);
1285 }
1286
1287 void
1288 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1289                           struct mali_vertex_tiler_postfix *vertex_postfix)
1290 {
1291         struct panfrost_context *ctx = batch->ctx;
1292         struct panfrost_vertex_state *so = ctx->vertex;
1293
1294         /* Staged mali_attr, and index into them. i =/= k, depending on the
1295          * vertex buffer mask and instancing. Twice as much room is allocated,
1296          * for a worst case of NPOT_DIVIDEs which take up extra slot */
1297         union mali_attr attrs[PIPE_MAX_ATTRIBS * 2];
1298         unsigned k = 0;
1299
1300         for (unsigned i = 0; i < so->num_elements; ++i) {
1301                 /* We map a mali_attr to be 1:1 with the mali_attr_meta, which
1302                  * means duplicating some vertex buffers (who cares? aside from
1303                  * maybe some caching implications but I somehow doubt that
1304                  * matters) */
1305
1306                 struct pipe_vertex_element *elem = &so->pipe[i];
1307                 unsigned vbi = elem->vertex_buffer_index;
1308
1309                 /* The exception to 1:1 mapping is that we can have multiple
1310                  * entries (NPOT divisors), so we fixup anyways */
1311
1312                 so->hw[i].index = k;
1313
1314                 if (!(ctx->vb_mask & (1 << vbi)))
1315                         continue;
1316
1317                 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1318                 struct panfrost_resource *rsrc;
1319
1320                 rsrc = pan_resource(buf->buffer.resource);
1321                 if (!rsrc)
1322                         continue;
1323
1324                 /* Align to 64 bytes by masking off the lower bits. This
1325                  * will be adjusted back when we fixup the src_offset in
1326                  * mali_attr_meta */
1327
1328                 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1329                 mali_ptr addr = raw_addr & ~63;
1330                 unsigned chopped_addr = raw_addr - addr;
1331
1332                 /* Add a dependency of the batch on the vertex buffer */
1333                 panfrost_batch_add_bo(batch, rsrc->bo,
1334                                       PAN_BO_ACCESS_SHARED |
1335                                       PAN_BO_ACCESS_READ |
1336                                       PAN_BO_ACCESS_VERTEX_TILER);
1337
1338                 /* Set common fields */
1339                 attrs[k].elements = addr;
1340                 attrs[k].stride = buf->stride;
1341
1342                 /* Since we advanced the base pointer, we shrink the buffer
1343                  * size */
1344                 attrs[k].size = rsrc->base.width0 - buf->buffer_offset;
1345
1346                 /* We need to add the extra size we masked off (for
1347                  * correctness) so the data doesn't get clamped away */
1348                 attrs[k].size += chopped_addr;
1349
1350                 /* For non-instancing make sure we initialize */
1351                 attrs[k].shift = attrs[k].extra_flags = 0;
1352
1353                 /* Instancing uses a dramatically different code path than
1354                  * linear, so dispatch for the actual emission now that the
1355                  * common code is finished */
1356
1357                 unsigned divisor = elem->instance_divisor;
1358
1359                 if (divisor && ctx->instance_count == 1) {
1360                         /* Silly corner case where there's a divisor(=1) but
1361                          * there's no legitimate instancing. So we want *every*
1362                          * attribute to be the same. So set stride to zero so
1363                          * we don't go anywhere. */
1364
1365                         attrs[k].size = attrs[k].stride + chopped_addr;
1366                         attrs[k].stride = 0;
1367                         attrs[k++].elements |= MALI_ATTR_LINEAR;
1368                 } else if (ctx->instance_count <= 1) {
1369                         /* Normal, non-instanced attributes */
1370                         attrs[k++].elements |= MALI_ATTR_LINEAR;
1371                 } else {
1372                         unsigned instance_shift = vertex_postfix->instance_shift;
1373                         unsigned instance_odd = vertex_postfix->instance_odd;
1374
1375                         k += panfrost_vertex_instanced(ctx->padded_count,
1376                                                        instance_shift,
1377                                                        instance_odd,
1378                                                        divisor, &attrs[k]);
1379                 }
1380         }
1381
1382         /* Add special gl_VertexID/gl_InstanceID buffers */
1383
1384         panfrost_vertex_id(ctx->padded_count, &attrs[k]);
1385         so->hw[PAN_VERTEX_ID].index = k++;
1386         panfrost_instance_id(ctx->padded_count, &attrs[k]);
1387         so->hw[PAN_INSTANCE_ID].index = k++;
1388
1389         /* Upload whatever we emitted and go */
1390
1391         vertex_postfix->attributes = panfrost_upload_transient(batch, attrs,
1392                                                            k * sizeof(*attrs));
1393 }
1394
1395 static mali_ptr
1396 panfrost_emit_varyings(struct panfrost_batch *batch, union mali_attr *slot,
1397                        unsigned stride, unsigned count)
1398 {
1399         /* Fill out the descriptor */
1400         slot->stride = stride;
1401         slot->size = stride * count;
1402         slot->shift = slot->extra_flags = 0;
1403
1404         struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1405                                                                         slot->size);
1406
1407         slot->elements = transfer.gpu | MALI_ATTR_LINEAR;
1408
1409         return transfer.gpu;
1410 }
1411
1412 static void
1413 panfrost_emit_streamout(struct panfrost_batch *batch, union mali_attr *slot,
1414                         unsigned stride, unsigned offset, unsigned count,
1415                         struct pipe_stream_output_target *target)
1416 {
1417         /* Fill out the descriptor */
1418         slot->stride = stride * 4;
1419         slot->shift = slot->extra_flags = 0;
1420
1421         unsigned max_size = target->buffer_size;
1422         unsigned expected_size = slot->stride * count;
1423
1424         slot->size = MIN2(max_size, expected_size);
1425
1426         /* Grab the BO and bind it to the batch */
1427         struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1428
1429         /* Varyings are WRITE from the perspective of the VERTEX but READ from
1430          * the perspective of the TILER and FRAGMENT.
1431          */
1432         panfrost_batch_add_bo(batch, bo,
1433                               PAN_BO_ACCESS_SHARED |
1434                               PAN_BO_ACCESS_RW |
1435                               PAN_BO_ACCESS_VERTEX_TILER |
1436                               PAN_BO_ACCESS_FRAGMENT);
1437
1438         mali_ptr addr = bo->gpu + target->buffer_offset + (offset * slot->stride);
1439         slot->elements = addr;
1440 }
1441
1442 /* Given a shader and buffer indices, link varying metadata together */
1443
1444 static bool
1445 is_special_varying(gl_varying_slot loc)
1446 {
1447         switch (loc) {
1448         case VARYING_SLOT_POS:
1449         case VARYING_SLOT_PSIZ:
1450         case VARYING_SLOT_PNTC:
1451         case VARYING_SLOT_FACE:
1452                 return true;
1453         default:
1454                 return false;
1455         }
1456 }
1457
1458 static void
1459 panfrost_emit_varying_meta(void *outptr, struct panfrost_shader_state *ss,
1460                            signed general, signed gl_Position,
1461                            signed gl_PointSize, signed gl_PointCoord,
1462                            signed gl_FrontFacing)
1463 {
1464         struct mali_attr_meta *out = (struct mali_attr_meta *) outptr;
1465
1466         for (unsigned i = 0; i < ss->varying_count; ++i) {
1467                 gl_varying_slot location = ss->varyings_loc[i];
1468                 int index = -1;
1469
1470                 switch (location) {
1471                 case VARYING_SLOT_POS:
1472                         index = gl_Position;
1473                         break;
1474                 case VARYING_SLOT_PSIZ:
1475                         index = gl_PointSize;
1476                         break;
1477                 case VARYING_SLOT_PNTC:
1478                         index = gl_PointCoord;
1479                         break;
1480                 case VARYING_SLOT_FACE:
1481                         index = gl_FrontFacing;
1482                         break;
1483                 default:
1484                         index = general;
1485                         break;
1486                 }
1487
1488                 assert(index >= 0);
1489                 out[i].index = index;
1490         }
1491 }
1492
1493 static bool
1494 has_point_coord(unsigned mask, gl_varying_slot loc)
1495 {
1496         if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1497                 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1498         else if (loc == VARYING_SLOT_PNTC)
1499                 return (mask & (1 << 8));
1500         else
1501                 return false;
1502 }
1503
1504 /* Helpers for manipulating stream out information so we can pack varyings
1505  * accordingly. Compute the src_offset for a given captured varying */
1506
1507 static struct pipe_stream_output *
1508 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1509 {
1510         for (unsigned i = 0; i < info->num_outputs; ++i) {
1511                 if (info->output[i].register_index == loc)
1512                         return &info->output[i];
1513         }
1514
1515         unreachable("Varying not captured");
1516 }
1517
1518 /* TODO: Integers */
1519 static enum mali_format
1520 pan_xfb_format(unsigned nr_components)
1521 {
1522         switch (nr_components) {
1523                 case 1: return MALI_R32F;
1524                 case 2: return MALI_RG32F;
1525                 case 3: return MALI_RGB32F;
1526                 case 4: return MALI_RGBA32F;
1527                 default: unreachable("Invalid format");
1528         }
1529 }
1530
1531 void
1532 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1533                                  unsigned vertex_count,
1534                                  struct mali_vertex_tiler_postfix *vertex_postfix,
1535                                  struct mali_vertex_tiler_postfix *tiler_postfix,
1536                                  union midgard_primitive_size *primitive_size)
1537 {
1538         /* Load the shaders */
1539         struct panfrost_context *ctx = batch->ctx;
1540         struct panfrost_shader_state *vs, *fs;
1541         unsigned int num_gen_varyings = 0;
1542         size_t vs_size, fs_size;
1543
1544         /* Allocate the varying descriptor */
1545
1546         vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1547         fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1548         vs_size = sizeof(struct mali_attr_meta) * vs->varying_count;
1549         fs_size = sizeof(struct mali_attr_meta) * fs->varying_count;
1550
1551         struct panfrost_transfer trans = panfrost_allocate_transient(batch,
1552                                                                      vs_size +
1553                                                                      fs_size);
1554
1555         struct pipe_stream_output_info *so = &vs->stream_output;
1556
1557         /* Check if this varying is linked by us. This is the case for
1558          * general-purpose, non-captured varyings. If it is, link it. If it's
1559          * not, use the provided stream out information to determine the
1560          * offset, since it was already linked for us. */
1561
1562         for (unsigned i = 0; i < vs->varying_count; i++) {
1563                 gl_varying_slot loc = vs->varyings_loc[i];
1564
1565                 bool special = is_special_varying(loc);
1566                 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1567
1568                 if (captured) {
1569                         struct pipe_stream_output *o = pan_get_so(so, loc);
1570
1571                         unsigned dst_offset = o->dst_offset * 4; /* dwords */
1572                         vs->varyings[i].src_offset = dst_offset;
1573                 } else if (!special) {
1574                         vs->varyings[i].src_offset = 16 * (num_gen_varyings++);
1575                 }
1576         }
1577
1578         /* Conversely, we need to set src_offset for the captured varyings.
1579          * Here, the layout is defined by the stream out info, not us */
1580
1581         /* Link up with fragment varyings */
1582         bool reads_point_coord = fs->reads_point_coord;
1583
1584         for (unsigned i = 0; i < fs->varying_count; i++) {
1585                 gl_varying_slot loc = fs->varyings_loc[i];
1586                 unsigned src_offset;
1587                 signed vs_idx = -1;
1588
1589                 /* Link up */
1590                 for (unsigned j = 0; j < vs->varying_count; ++j) {
1591                         if (vs->varyings_loc[j] == loc) {
1592                                 vs_idx = j;
1593                                 break;
1594                         }
1595                 }
1596
1597                 /* Either assign or reuse */
1598                 if (vs_idx >= 0)
1599                         src_offset = vs->varyings[vs_idx].src_offset;
1600                 else
1601                         src_offset = 16 * (num_gen_varyings++);
1602
1603                 fs->varyings[i].src_offset = src_offset;
1604
1605                 if (has_point_coord(fs->point_sprite_mask, loc))
1606                         reads_point_coord = true;
1607         }
1608
1609         memcpy(trans.cpu, vs->varyings, vs_size);
1610         memcpy(trans.cpu + vs_size, fs->varyings, fs_size);
1611
1612         union mali_attr varyings[PIPE_MAX_ATTRIBS] = {0};
1613
1614         /* Figure out how many streamout buffers could be bound */
1615         unsigned so_count = ctx->streamout.num_targets;
1616         for (unsigned i = 0; i < vs->varying_count; i++) {
1617                 gl_varying_slot loc = vs->varyings_loc[i];
1618
1619                 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1620                 if (!captured) continue;
1621
1622                 struct pipe_stream_output *o = pan_get_so(so, loc);
1623                 so_count = MAX2(so_count, o->output_buffer + 1);
1624         }
1625
1626         signed idx = so_count;
1627         signed general = idx++;
1628         signed gl_Position = idx++;
1629         signed gl_PointSize = vs->writes_point_size ? (idx++) : -1;
1630         signed gl_PointCoord = reads_point_coord ? (idx++) : -1;
1631         signed gl_FrontFacing = fs->reads_face ? (idx++) : -1;
1632         signed gl_FragCoord = fs->reads_frag_coord ? (idx++) : -1;
1633
1634         /* Emit the stream out buffers */
1635
1636         unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1637                                                            ctx->vertex_count);
1638
1639         for (unsigned i = 0; i < so_count; ++i) {
1640                 if (i < ctx->streamout.num_targets) {
1641                         panfrost_emit_streamout(batch, &varyings[i],
1642                                                 so->stride[i],
1643                                                 ctx->streamout.offsets[i],
1644                                                 out_count,
1645                                                 ctx->streamout.targets[i]);
1646                 } else {
1647                         /* Emit a dummy buffer */
1648                         panfrost_emit_varyings(batch, &varyings[i],
1649                                                so->stride[i] * 4,
1650                                                out_count);
1651
1652                         /* Clear the attribute type */
1653                         varyings[i].elements &= ~0xF;
1654                 }
1655         }
1656
1657         panfrost_emit_varyings(batch, &varyings[general],
1658                                num_gen_varyings * 16,
1659                                vertex_count);
1660
1661         mali_ptr varyings_p;
1662
1663         /* fp32 vec4 gl_Position */
1664         varyings_p = panfrost_emit_varyings(batch, &varyings[gl_Position],
1665                                             sizeof(float) * 4, vertex_count);
1666         tiler_postfix->position_varying = varyings_p;
1667
1668
1669         if (panfrost_writes_point_size(ctx)) {
1670                 varyings_p = panfrost_emit_varyings(batch,
1671                                                     &varyings[gl_PointSize],
1672                                                     2, vertex_count);
1673                 primitive_size->pointer = varyings_p;
1674         }
1675
1676         if (reads_point_coord)
1677                 varyings[gl_PointCoord].elements = MALI_VARYING_POINT_COORD;
1678
1679         if (fs->reads_face)
1680                 varyings[gl_FrontFacing].elements = MALI_VARYING_FRONT_FACING;
1681
1682         if (fs->reads_frag_coord)
1683                 varyings[gl_FragCoord].elements = MALI_VARYING_FRAG_COORD;
1684
1685         /* Let's go ahead and link varying meta to the buffer in question, now
1686          * that that information is available. VARYING_SLOT_POS is mapped to
1687          * gl_FragCoord for fragment shaders but gl_Positionf or vertex shaders
1688          * */
1689
1690         panfrost_emit_varying_meta(trans.cpu, vs, general, gl_Position,
1691                                    gl_PointSize, gl_PointCoord,
1692                                    gl_FrontFacing);
1693
1694         panfrost_emit_varying_meta(trans.cpu + vs_size, fs, general,
1695                                    gl_FragCoord, gl_PointSize,
1696                                    gl_PointCoord, gl_FrontFacing);
1697
1698         /* Replace streamout */
1699
1700         struct mali_attr_meta *ovs = (struct mali_attr_meta *)trans.cpu;
1701         struct mali_attr_meta *ofs = ovs + vs->varying_count;
1702
1703         for (unsigned i = 0; i < vs->varying_count; i++) {
1704                 gl_varying_slot loc = vs->varyings_loc[i];
1705
1706                 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1707                 if (!captured)
1708                         continue;
1709
1710                 struct pipe_stream_output *o = pan_get_so(so, loc);
1711                 ovs[i].index = o->output_buffer;
1712
1713                 /* Set the type appropriately. TODO: Integer varyings XXX */
1714                 assert(o->stream == 0);
1715                 ovs[i].format = pan_xfb_format(o->num_components);
1716                 ovs[i].swizzle = panfrost_get_default_swizzle(o->num_components);
1717
1718                 /* Link to the fragment */
1719                 signed fs_idx = -1;
1720
1721                 /* Link up */
1722                 for (unsigned j = 0; j < fs->varying_count; ++j) {
1723                         if (fs->varyings_loc[j] == loc) {
1724                                 fs_idx = j;
1725                                 break;
1726                         }
1727                 }
1728
1729                 if (fs_idx >= 0) {
1730                         ofs[fs_idx].index = ovs[i].index;
1731                         ofs[fs_idx].format = ovs[i].format;
1732                         ofs[fs_idx].swizzle = ovs[i].swizzle;
1733                 }
1734         }
1735
1736         /* Replace point sprite */
1737         for (unsigned i = 0; i < fs->varying_count; i++) {
1738                 /* If we have a point sprite replacement, handle that here. We
1739                  * have to translate location first.  TODO: Flip y in shader.
1740                  * We're already keying ... just time crunch .. */
1741
1742                 if (has_point_coord(fs->point_sprite_mask,
1743                                     fs->varyings_loc[i])) {
1744                         ofs[i].index = gl_PointCoord;
1745
1746                         /* Swizzle out the z/w to 0/1 */
1747                         ofs[i].format = MALI_RG16F;
1748                         ofs[i].swizzle = panfrost_get_default_swizzle(2);
1749                 }
1750         }
1751
1752         /* Fix up unaligned addresses */
1753         for (unsigned i = 0; i < so_count; ++i) {
1754                 if (varyings[i].elements < MALI_RECORD_SPECIAL)
1755                         continue;
1756
1757                 unsigned align = (varyings[i].elements & 63);
1758
1759                 /* While we're at it, the SO buffers are linear */
1760
1761                 if (!align) {
1762                         varyings[i].elements |= MALI_ATTR_LINEAR;
1763                         continue;
1764                 }
1765
1766                 /* We need to adjust alignment */
1767                 varyings[i].elements &= ~63;
1768                 varyings[i].elements |= MALI_ATTR_LINEAR;
1769                 varyings[i].size += align;
1770
1771                 for (unsigned v = 0; v < vs->varying_count; ++v) {
1772                         if (ovs[v].index != i)
1773                                 continue;
1774
1775                         ovs[v].src_offset = vs->varyings[v].src_offset + align;
1776                 }
1777
1778                 for (unsigned f = 0; f < fs->varying_count; ++f) {
1779                         if (ofs[f].index != i)
1780                                 continue;
1781
1782                         ofs[f].src_offset = fs->varyings[f].src_offset + align;
1783                 }
1784         }
1785
1786         varyings_p = panfrost_upload_transient(batch, varyings,
1787                                                idx * sizeof(*varyings));
1788         vertex_postfix->varyings = varyings_p;
1789         tiler_postfix->varyings = varyings_p;
1790
1791         vertex_postfix->varying_meta = trans.gpu;
1792         tiler_postfix->varying_meta = trans.gpu + vs_size;
1793 }
1794
1795 void
1796 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
1797                                 struct mali_vertex_tiler_prefix *vertex_prefix,
1798                                 struct mali_vertex_tiler_postfix *vertex_postfix,
1799                                 struct mali_vertex_tiler_prefix *tiler_prefix,
1800                                 struct mali_vertex_tiler_postfix *tiler_postfix,
1801                                 union midgard_primitive_size *primitive_size)
1802 {
1803         struct panfrost_context *ctx = batch->ctx;
1804         struct panfrost_device *device = pan_device(ctx->base.screen);
1805         bool wallpapering = ctx->wallpaper_batch && batch->tiler_dep;
1806         struct bifrost_payload_vertex bifrost_vertex = {0,};
1807         struct bifrost_payload_tiler bifrost_tiler = {0,};
1808         struct midgard_payload_vertex_tiler midgard_vertex = {0,};
1809         struct midgard_payload_vertex_tiler midgard_tiler = {0,};
1810         void *vp, *tp;
1811         size_t vp_size, tp_size;
1812
1813         if (device->quirks & IS_BIFROST) {
1814                 bifrost_vertex.prefix = *vertex_prefix;
1815                 bifrost_vertex.postfix = *vertex_postfix;
1816                 vp = &bifrost_vertex;
1817                 vp_size = sizeof(bifrost_vertex);
1818
1819                 bifrost_tiler.prefix = *tiler_prefix;
1820                 bifrost_tiler.tiler.primitive_size = *primitive_size;
1821                 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
1822                 bifrost_tiler.postfix = *tiler_postfix;
1823                 tp = &bifrost_tiler;
1824                 tp_size = sizeof(bifrost_tiler);
1825         } else {
1826                 midgard_vertex.prefix = *vertex_prefix;
1827                 midgard_vertex.postfix = *vertex_postfix;
1828                 vp = &midgard_vertex;
1829                 vp_size = sizeof(midgard_vertex);
1830
1831                 midgard_tiler.prefix = *tiler_prefix;
1832                 midgard_tiler.postfix = *tiler_postfix;
1833                 midgard_tiler.primitive_size = *primitive_size;
1834                 tp = &midgard_tiler;
1835                 tp_size = sizeof(midgard_tiler);
1836         }
1837
1838         if (wallpapering) {
1839                 /* Inject in reverse order, with "predicted" job indices.
1840                  * THIS IS A HACK XXX */
1841                 panfrost_new_job(batch, JOB_TYPE_TILER, false,
1842                                  batch->job_index + 2, tp, tp_size, true);
1843                 panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
1844                                  vp, vp_size, true);
1845                 return;
1846         }
1847
1848         /* If rasterizer discard is enable, only submit the vertex */
1849
1850         bool rasterizer_discard = ctx->rasterizer &&
1851                                   ctx->rasterizer->base.rasterizer_discard;
1852
1853         unsigned vertex = panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
1854                                            vp, vp_size, false);
1855
1856         if (rasterizer_discard)
1857                 return;
1858
1859         panfrost_new_job(batch, JOB_TYPE_TILER, false, vertex, tp, tp_size,
1860                          false);
1861 }