panfrost: Fix transform feedback types
[mesa.git] src/gallium/drivers/panfrost/pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_allocate.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
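/* Bifrost path: point the postfix's shared_memory field at a transient
 * mali_shared_memory descriptor that references the batch's scratchpad BO
 * (sized from the batch's stack requirements). */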
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 unsigned shift = panfrost_get_stack_shift(batch->stack_size);
62 struct mali_shared_memory shared = {
63 .stack_shift = shift,
64 .scratchpad = panfrost_batch_get_scratchpad(batch, shift, dev->thread_tls_alloc, dev->core_count)->gpu,
65 .shared_workgroup_count = ~0,
66 };
67 postfix->shared_memory = panfrost_upload_transient(batch, &shared, sizeof(shared));
68 }
69
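/* Midgard path: the same postfix field instead carries a pointer to the
 * framebuffer descriptor (SFBD or MFBD). Transient space for it is reserved
 * on first use; MFBD pointers are tagged, presumably so the descriptor type
 * can be distinguished later. */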
70 static void
71 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
72 struct mali_vertex_tiler_postfix *postfix)
73 {
74 struct panfrost_device *dev = pan_device(ctx->base.screen);
75 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
76
77 /* If we haven't already, reserve space for the framebuffer */
78
79 if (!batch->framebuffer.gpu) {
80 unsigned size = (dev->quirks & MIDGARD_SFBD) ?
81 sizeof(struct mali_single_framebuffer) :
82 sizeof(struct mali_framebuffer);
83
84 batch->framebuffer = panfrost_allocate_transient(batch, size);
85
86 /* Tag the pointer */
87 if (!(dev->quirks & MIDGARD_SFBD))
88 batch->framebuffer.gpu |= MALI_MFBD;
89 }
90
91 postfix->shared_memory = batch->framebuffer.gpu;
92 }
93
94 static void
95 panfrost_vt_update_rasterizer(struct panfrost_context *ctx,
96 struct mali_vertex_tiler_prefix *prefix,
97 struct mali_vertex_tiler_postfix *postfix)
98 {
99 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
100
101 postfix->gl_enables |= 0x7;
102 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
103 rasterizer && rasterizer->base.front_ccw);
104 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
105 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_FRONT));
106 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
107 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_BACK));
108 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
109 rasterizer && rasterizer->base.flatshade_first);
110 }
111
112 void
113 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
114 struct mali_vertex_tiler_prefix *prefix,
115 union midgard_primitive_size *primitive_size)
116 {
117 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
118
119 if (!panfrost_writes_point_size(ctx)) {
120 bool points = prefix->draw_mode == MALI_POINTS;
121 float val = 0.0f;
122
123 if (rasterizer)
124 val = points ?
125 rasterizer->base.point_size :
126 rasterizer->base.line_width;
127
128 primitive_size->constant = val;
129 }
130 }
131
132 static void
133 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
134 struct mali_vertex_tiler_postfix *postfix)
135 {
136 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
137 if (ctx->occlusion_query)
138 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
139 else
140 postfix->occlusion_counter = 0;
141 }
142
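/* Common setup for the vertex/tiler prefix and postfix of a job: zero both
 * structures, set the gl_enables baseline, and attach either shared memory
 * (Bifrost) or the framebuffer pointer (Midgard). Rasterizer and occlusion
 * query state is only filled in when called for the fragment stage. */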
143 void
144 panfrost_vt_init(struct panfrost_context *ctx,
145 enum pipe_shader_type stage,
146 struct mali_vertex_tiler_prefix *prefix,
147 struct mali_vertex_tiler_postfix *postfix)
148 {
149 struct panfrost_device *device = pan_device(ctx->base.screen);
150
151 if (!ctx->shader[stage])
152 return;
153
154 memset(prefix, 0, sizeof(*prefix));
155 memset(postfix, 0, sizeof(*postfix));
156
157 if (device->quirks & IS_BIFROST) {
158 postfix->gl_enables = 0x2;
159 panfrost_vt_emit_shared_memory(ctx, postfix);
160 } else {
161 postfix->gl_enables = 0x6;
162 panfrost_vt_attach_framebuffer(ctx, postfix);
163 }
164
165 if (stage == PIPE_SHADER_FRAGMENT) {
166 panfrost_vt_update_occlusion_query(ctx, postfix);
167 panfrost_vt_update_rasterizer(ctx, prefix, postfix);
168 }
169 }
170
171 static unsigned
172 panfrost_translate_index_size(unsigned size)
173 {
174 switch (size) {
175 case 1:
176 return MALI_DRAW_INDEXED_UINT8;
177
178 case 2:
179 return MALI_DRAW_INDEXED_UINT16;
180
181 case 4:
182 return MALI_DRAW_INDEXED_UINT32;
183
184 default:
185 unreachable("Invalid index size");
186 }
187 }
188
189 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
190 * good for the duration of the draw (transient), though it may last longer.
191 * Also get the bounds on the index buffer for the range accessed by the
192 * draw. We do these operations together because there are natural
193 * optimizations which require them to be together. */
194
195 static mali_ptr
196 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
197 const struct pipe_draw_info *info,
198 unsigned *min_index, unsigned *max_index)
199 {
200 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
201 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
202 off_t offset = info->start * info->index_size;
203 bool needs_indices = true;
204 mali_ptr out = 0;
205
206 if (info->max_index != ~0u) {
207 *min_index = info->min_index;
208 *max_index = info->max_index;
209 needs_indices = false;
210 }
211
212 if (!info->has_user_indices) {
213 /* Only resources can be directly mapped */
214 panfrost_batch_add_bo(batch, rsrc->bo,
215 PAN_BO_ACCESS_SHARED |
216 PAN_BO_ACCESS_READ |
217 PAN_BO_ACCESS_VERTEX_TILER);
218 out = rsrc->bo->gpu + offset;
219
220 /* Check the cache */
221 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
222 info->start,
223 info->count,
224 min_index,
225 max_index);
226 } else {
227 /* Otherwise, we need to upload to transient memory */
228 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
229 out = panfrost_upload_transient(batch, ibuf8 + offset,
230 info->count *
231 info->index_size);
232 }
233
234 if (needs_indices) {
235 /* Fallback */
236 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
237
238 if (!info->has_user_indices)
239 panfrost_minmax_cache_add(rsrc->index_cache,
240 info->start, info->count,
241 *min_index, *max_index);
242 }
243
244 return out;
245 }
246
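/* Fill in the draw-related fields of the vertex/tiler prefix and postfix:
 * draw mode and flags, the index buffer (with min/max bounds) for indexed
 * draws, offset_start and the offset bias correction, and the padded vertex
 * count encoded as an instance shift/odd pair for instanced draws. */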
247 void
248 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
249 const struct pipe_draw_info *info,
250 enum mali_draw_mode draw_mode,
251 struct mali_vertex_tiler_postfix *vertex_postfix,
252 struct mali_vertex_tiler_prefix *tiler_prefix,
253 struct mali_vertex_tiler_postfix *tiler_postfix,
254 unsigned *vertex_count,
255 unsigned *padded_count)
256 {
257 tiler_prefix->draw_mode = draw_mode;
258
259 unsigned draw_flags = 0;
260
261 if (panfrost_writes_point_size(ctx))
262 draw_flags |= MALI_DRAW_VARYING_SIZE;
263
264 if (info->primitive_restart)
265 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
266
267 /* These don't make much sense */
268
269 draw_flags |= 0x3000;
270
271 if (info->index_size) {
272 unsigned min_index = 0, max_index = 0;
273
274 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
275 info,
276 &min_index,
277 &max_index);
278
279 /* Use the corresponding values */
280 *vertex_count = max_index - min_index + 1;
281 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
282 tiler_prefix->offset_bias_correction = -min_index;
283 tiler_prefix->index_count = MALI_POSITIVE(info->count);
284 draw_flags |= panfrost_translate_index_size(info->index_size);
285 } else {
286 tiler_prefix->indices = 0;
287 *vertex_count = ctx->vertex_count;
288 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
289 tiler_prefix->offset_bias_correction = 0;
290 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
291 }
292
293 tiler_prefix->unknown_draw = draw_flags;
294
295 /* Encode the padded vertex count */
296
297 if (info->instance_count > 1) {
298 *padded_count = panfrost_padded_vertex_count(*vertex_count);
299
300 unsigned shift = __builtin_ctz(ctx->padded_count);
301 unsigned k = ctx->padded_count >> (shift + 1);
302
303 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
304 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
305 } else {
306 *padded_count = *vertex_count;
307
308 /* Reset instancing state */
309 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
310 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
311 }
312 }
313
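/* Initialise the shader descriptor from the compiled shader state: the
 * entrypoint (ORed with its first tag), attribute/varying/texture/sampler
 * counts, and per-architecture fields whose exact meaning is not fully
 * understood (hence the unkN/magic constants below). */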
314 static void
315 panfrost_shader_meta_init(struct panfrost_context *ctx,
316 enum pipe_shader_type st,
317 struct mali_shader_meta *meta)
318 {
319 const struct panfrost_device *dev = pan_device(ctx->base.screen);
320 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
321
322 memset(meta, 0, sizeof(*meta));
323 meta->shader = (ss->bo ? ss->bo->gpu : 0) | ss->first_tag;
324 meta->attribute_count = ss->attribute_count;
325 meta->varying_count = ss->varying_count;
326 meta->texture_count = ctx->sampler_view_count[st];
327 meta->sampler_count = ctx->sampler_count[st];
328
329 if (dev->quirks & IS_BIFROST) {
330 if (st == PIPE_SHADER_VERTEX)
331 meta->bifrost1.unk1 = 0x800000;
332 else {
333 /* First clause ATEST |= 0x4000000.
334 * Less than 32 regs |= 0x200 */
335 meta->bifrost1.unk1 = 0x958020;
336 }
337
338 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
339 if (st == PIPE_SHADER_VERTEX)
340 meta->bifrost2.preload_regs = 0xC0;
341 else
342 meta->bifrost2.preload_regs = 0x1;
343 meta->bifrost2.uniform_count = MIN2(ss->uniform_count,
344 ss->uniform_cutoff);
345 } else {
346 meta->midgard1.uniform_count = MIN2(ss->uniform_count,
347 ss->uniform_cutoff);
348 meta->midgard1.work_count = ss->work_reg_count;
349 meta->midgard1.flags_hi = 0x8; /* XXX */
350 meta->midgard1.flags_lo = 0x220;
351 meta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
352 }
353 }
354
355 static unsigned
356 panfrost_translate_compare_func(enum pipe_compare_func in)
357 {
358 switch (in) {
359 case PIPE_FUNC_NEVER:
360 return MALI_FUNC_NEVER;
361
362 case PIPE_FUNC_LESS:
363 return MALI_FUNC_LESS;
364
365 case PIPE_FUNC_EQUAL:
366 return MALI_FUNC_EQUAL;
367
368 case PIPE_FUNC_LEQUAL:
369 return MALI_FUNC_LEQUAL;
370
371 case PIPE_FUNC_GREATER:
372 return MALI_FUNC_GREATER;
373
374 case PIPE_FUNC_NOTEQUAL:
375 return MALI_FUNC_NOTEQUAL;
376
377 case PIPE_FUNC_GEQUAL:
378 return MALI_FUNC_GEQUAL;
379
380 case PIPE_FUNC_ALWAYS:
381 return MALI_FUNC_ALWAYS;
382
383 default:
384 unreachable("Invalid func");
385 }
386 }
387
388 static unsigned
389 panfrost_translate_stencil_op(enum pipe_stencil_op in)
390 {
391 switch (in) {
392 case PIPE_STENCIL_OP_KEEP:
393 return MALI_STENCIL_KEEP;
394
395 case PIPE_STENCIL_OP_ZERO:
396 return MALI_STENCIL_ZERO;
397
398 case PIPE_STENCIL_OP_REPLACE:
399 return MALI_STENCIL_REPLACE;
400
401 case PIPE_STENCIL_OP_INCR:
402 return MALI_STENCIL_INCR;
403
404 case PIPE_STENCIL_OP_DECR:
405 return MALI_STENCIL_DECR;
406
407 case PIPE_STENCIL_OP_INCR_WRAP:
408 return MALI_STENCIL_INCR_WRAP;
409
410 case PIPE_STENCIL_OP_DECR_WRAP:
411 return MALI_STENCIL_DECR_WRAP;
412
413 case PIPE_STENCIL_OP_INVERT:
414 return MALI_STENCIL_INVERT;
415
416 default:
417 unreachable("Invalid stencil op");
418 }
419 }
420
421 static unsigned
422 translate_tex_wrap(enum pipe_tex_wrap w)
423 {
424 switch (w) {
425 case PIPE_TEX_WRAP_REPEAT:
426 return MALI_WRAP_REPEAT;
427
428 case PIPE_TEX_WRAP_CLAMP:
429 return MALI_WRAP_CLAMP;
430
431 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
432 return MALI_WRAP_CLAMP_TO_EDGE;
433
434 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
435 return MALI_WRAP_CLAMP_TO_BORDER;
436
437 case PIPE_TEX_WRAP_MIRROR_REPEAT:
438 return MALI_WRAP_MIRRORED_REPEAT;
439
440 case PIPE_TEX_WRAP_MIRROR_CLAMP:
441 return MALI_WRAP_MIRRORED_CLAMP;
442
443 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
444 return MALI_WRAP_MIRRORED_CLAMP_TO_EDGE;
445
446 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
447 return MALI_WRAP_MIRRORED_CLAMP_TO_BORDER;
448
449 default:
450 unreachable("Invalid wrap");
451 }
452 }
453
454 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
455 struct mali_sampler_descriptor *hw)
456 {
457 unsigned func = panfrost_translate_compare_func(cso->compare_func);
458 bool min_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
459 bool mag_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
460 bool mip_linear = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR;
461 unsigned min_filter = min_nearest ? MALI_SAMP_MIN_NEAREST : 0;
462 unsigned mag_filter = mag_nearest ? MALI_SAMP_MAG_NEAREST : 0;
463 unsigned mip_filter = mip_linear ?
464 (MALI_SAMP_MIP_LINEAR_1 | MALI_SAMP_MIP_LINEAR_2) : 0;
465 unsigned normalized = cso->normalized_coords ? MALI_SAMP_NORM_COORDS : 0;
466
467 *hw = (struct mali_sampler_descriptor) {
468 .filter_mode = min_filter | mag_filter | mip_filter |
469 normalized,
470 .wrap_s = translate_tex_wrap(cso->wrap_s),
471 .wrap_t = translate_tex_wrap(cso->wrap_t),
472 .wrap_r = translate_tex_wrap(cso->wrap_r),
473 .compare_func = panfrost_flip_compare_func(func),
474 .border_color = {
475 cso->border_color.f[0],
476 cso->border_color.f[1],
477 cso->border_color.f[2],
478 cso->border_color.f[3]
479 },
480 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
481 .max_lod = FIXED_16(cso->max_lod, false),
482 .lod_bias = FIXED_16(cso->lod_bias, true), /* can be negative */
483 .seamless_cube_map = cso->seamless_cube_map,
484 };
485
486 /* If necessary, we disable mipmapping in the sampler descriptor by
487 * clamping the LOD as tight as possible (from 0 to epsilon,
488 * essentially -- remember these are fixed point numbers, so
489 * epsilon=1/256) */
490
491 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
492 hw->max_lod = hw->min_lod + 1;
493 }
494
495 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
496 struct bifrost_sampler_descriptor *hw)
497 {
498 *hw = (struct bifrost_sampler_descriptor) {
499 .unk1 = 0x1,
500 .wrap_s = translate_tex_wrap(cso->wrap_s),
501 .wrap_t = translate_tex_wrap(cso->wrap_t),
502 .wrap_r = translate_tex_wrap(cso->wrap_r),
503 .unk8 = 0x8,
504 .min_filter = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST,
505 .norm_coords = cso->normalized_coords,
506 .mip_filter = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR,
507 .mag_filter = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR,
508 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
509 .max_lod = FIXED_16(cso->max_lod, false),
510 };
511
512 /* If necessary, we disable mipmapping in the sampler descriptor by
513 * clamping the LOD as tight as possible (from 0 to epsilon,
514 * essentially -- remember these are fixed point numbers, so
515 * epsilon=1/256) */
516
517 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
518 hw->max_lod = hw->min_lod + 1;
519 }
520
521 static void
522 panfrost_make_stencil_state(const struct pipe_stencil_state *in,
523 struct mali_stencil_test *out)
524 {
525 out->ref = 0; /* Gallium gets it from elsewhere */
526
527 out->mask = in->valuemask;
528 out->func = panfrost_translate_compare_func(in->func);
529 out->sfail = panfrost_translate_stencil_op(in->fail_op);
530 out->dpfail = panfrost_translate_stencil_op(in->zfail_op);
531 out->dppass = panfrost_translate_stencil_op(in->zpass_op);
532 }
533
534 static void
535 panfrost_frag_meta_rasterizer_update(struct panfrost_context *ctx,
536 struct mali_shader_meta *fragmeta)
537 {
538 if (!ctx->rasterizer) {
539 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, true);
540 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, false);
541 fragmeta->depth_units = 0.0f;
542 fragmeta->depth_factor = 0.0f;
543 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, false);
544 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, false);
545 return;
546 }
547
548 bool msaa = ctx->rasterizer->base.multisample;
549
550 /* TODO: Sample size */
551 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
552 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
553 fragmeta->depth_units = ctx->rasterizer->base.offset_units * 2.0f;
554 fragmeta->depth_factor = ctx->rasterizer->base.offset_scale;
555
556 /* XXX: Which bit is which? Does this maybe allow offsetting non-tris? */
557
558 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A,
559 ctx->rasterizer->base.offset_tri);
560 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B,
561 ctx->rasterizer->base.offset_tri);
562 }
563
564 static void
565 panfrost_frag_meta_zsa_update(struct panfrost_context *ctx,
566 struct mali_shader_meta *fragmeta)
567 {
568 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
569 int zfunc = PIPE_FUNC_ALWAYS;
570
571 if (!zsa) {
572 struct pipe_stencil_state default_stencil = {
573 .enabled = 0,
574 .func = PIPE_FUNC_ALWAYS,
575 .fail_op = MALI_STENCIL_KEEP,
576 .zfail_op = MALI_STENCIL_KEEP,
577 .zpass_op = MALI_STENCIL_KEEP,
578 .writemask = 0xFF,
579 .valuemask = 0xFF
580 };
581
582 panfrost_make_stencil_state(&default_stencil,
583 &fragmeta->stencil_front);
584 fragmeta->stencil_mask_front = default_stencil.writemask;
585 fragmeta->stencil_back = fragmeta->stencil_front;
586 fragmeta->stencil_mask_back = default_stencil.writemask;
587 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST, false);
588 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK, false);
589 } else {
590 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
591 zsa->stencil[0].enabled);
592 panfrost_make_stencil_state(&zsa->stencil[0],
593 &fragmeta->stencil_front);
594 fragmeta->stencil_mask_front = zsa->stencil[0].writemask;
595 fragmeta->stencil_front.ref = ctx->stencil_ref.ref_value[0];
596
597 /* If back-stencil is not enabled, use the front values */
598
599 if (zsa->stencil[1].enabled) {
600 panfrost_make_stencil_state(&zsa->stencil[1],
601 &fragmeta->stencil_back);
602 fragmeta->stencil_mask_back = zsa->stencil[1].writemask;
603 fragmeta->stencil_back.ref = ctx->stencil_ref.ref_value[1];
604 } else {
605 fragmeta->stencil_back = fragmeta->stencil_front;
606 fragmeta->stencil_mask_back = fragmeta->stencil_mask_front;
607 fragmeta->stencil_back.ref = fragmeta->stencil_front.ref;
608 }
609
610 if (zsa->depth.enabled)
611 zfunc = zsa->depth.func;
612
613 /* Depth state (TODO: Refactor) */
614
615 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
616 zsa->depth.writemask);
617 }
618
619 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
620 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(zfunc));
621 }
622
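/* Does the fragment shader actually need to run? It does if it has side
 * effects, if any render target's colour is written, or if it writes
 * depth/stencil; otherwise it may be skipped (see MIDGARD_SHADERLESS). */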
623 static bool
624 panfrost_fs_required(
625 struct panfrost_shader_state *fs,
626 struct panfrost_blend_final *blend,
627 unsigned rt_count)
628 {
629 /* If we generally have side effects */
630 if (fs->fs_sidefx)
631 return true;
632
633 /* If colour is written we need to execute */
634 for (unsigned i = 0; i < rt_count; ++i) {
635 if (!blend[i].no_colour)
636 return true;
637 }
638
639 /* If depth is written and not implied we need to execute.
640 * TODO: Predicate on Z/S writes being enabled */
641 return (fs->writes_depth || fs->writes_stencil);
642 }
643
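/* Gather per-render-target blend state and encode it into the fragment
 * shader descriptor. On SFBD the single blend equation/shader lives inside
 * the shader meta; on MFBD and Bifrost it goes into the blend RT descriptors
 * appended after it (the rts argument). Also nulls out the shader fields
 * when panfrost_fs_required says execution can be skipped. */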
644 static void
645 panfrost_frag_meta_blend_update(struct panfrost_context *ctx,
646 struct mali_shader_meta *fragmeta,
647 void *rts)
648 {
649 const struct panfrost_device *dev = pan_device(ctx->base.screen);
650 struct panfrost_shader_state *fs;
651 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
652
653 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
654 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
655 !ctx->blend->base.dither);
656
657 /* Get blending setup */
658 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
659
660 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
661 unsigned shader_offset = 0;
662 struct panfrost_bo *shader_bo = NULL;
663
664 for (unsigned c = 0; c < rt_count; ++c)
665 blend[c] = panfrost_get_blend_for_context(ctx, c, &shader_bo,
666 &shader_offset);
667
668 /* Disable shader execution if we can */
669 if (dev->quirks & MIDGARD_SHADERLESS
670 && !panfrost_fs_required(fs, blend, rt_count)) {
671 fragmeta->shader = 0;
672 fragmeta->attribute_count = 0;
673 fragmeta->varying_count = 0;
674 fragmeta->texture_count = 0;
675 fragmeta->sampler_count = 0;
676
677 /* This feature is not known to work on Bifrost */
678 fragmeta->midgard1.work_count = 1;
679 fragmeta->midgard1.uniform_count = 0;
680 fragmeta->midgard1.uniform_buffer_count = 0;
681 }
682
683 /* If there is a blend shader, work registers are shared. We impose 8
684 * work registers as a limit for blend shaders. Should be lower XXX */
685
686 if (!(dev->quirks & IS_BIFROST)) {
687 for (unsigned c = 0; c < rt_count; ++c) {
688 if (blend[c].is_shader) {
689 fragmeta->midgard1.work_count =
690 MAX2(fragmeta->midgard1.work_count, 8);
691 }
692 }
693 }
694
695 /* Even on MFBD, the shader descriptor gets blend shaders. It's *also*
696 * copied to the blend_meta appended (by convention), but this is the
697 * field actually read by the hardware. (Or maybe both are read...?).
698 * Specify the last RTi with a blend shader. */
699
700 fragmeta->blend.shader = 0;
701
702 for (signed rt = (rt_count - 1); rt >= 0; --rt) {
703 if (!blend[rt].is_shader)
704 continue;
705
706 fragmeta->blend.shader = blend[rt].shader.gpu |
707 blend[rt].shader.first_tag;
708 break;
709 }
710
711 if (dev->quirks & MIDGARD_SFBD) {
712 /* On platforms with only a single render target (SFBD), the blend
713 * information is inside the shader meta itself. We additionally
714 * need to signal CAN_DISCARD for nontrivial blend modes (so
715 * we're able to read back the destination buffer) */
716
717 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
718 blend[0].is_shader);
719
720 if (!blend[0].is_shader) {
721 fragmeta->blend.equation = *blend[0].equation.equation;
722 fragmeta->blend.constant = blend[0].equation.constant;
723 }
724
725 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
726 !blend[0].no_blending || fs->can_discard);
727 return;
728 }
729
730 /* Additional blend descriptor tacked on for jobs using MFBD */
731
732 for (unsigned i = 0; i < rt_count; ++i) {
733 if (dev->quirks & IS_BIFROST) {
734 struct bifrost_blend_rt *brts = rts;
735
736 brts[i].flags = 0x200;
737 if (blend[i].is_shader) {
738 /* The blend shader's address needs to have
739 * the same top 32 bits as the fragment shader's.
740 * TODO: Ensure that's always the case.
741 */
742 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
743 (fs->bo->gpu & (0xffffffffull << 32)));
744 brts[i].shader = blend[i].shader.gpu;
745 brts[i].unk2 = 0x0;
746 } else {
747 enum pipe_format format = ctx->pipe_framebuffer.cbufs[i]->format;
748 const struct util_format_description *format_desc;
749 format_desc = util_format_description(format);
750
751 brts[i].equation = *blend[i].equation.equation;
752
753 /* TODO: this is a bit more complicated */
754 brts[i].constant = blend[i].equation.constant;
755
756 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
757 brts[i].unk2 = 0x19;
758
759 brts[i].shader_type = fs->blend_types[i];
760 }
761 } else {
762 struct midgard_blend_rt *mrts = rts;
763
764 if (!blend[i].no_colour) {
765 mrts[i].flags = 0x200;
766
767 bool is_srgb = (ctx->pipe_framebuffer.nr_cbufs > i) &&
768 (ctx->pipe_framebuffer.cbufs[i]) &&
769 util_format_is_srgb(ctx->pipe_framebuffer.cbufs[i]->format);
770
771 SET_BIT(mrts[i].flags, MALI_BLEND_MRT_SHADER, blend[i].is_shader);
772 SET_BIT(mrts[i].flags, MALI_BLEND_LOAD_TIB, !blend[i].no_blending);
773 SET_BIT(mrts[i].flags, MALI_BLEND_SRGB, is_srgb);
774 SET_BIT(mrts[i].flags, MALI_BLEND_NO_DITHER, !ctx->blend->base.dither);
775 }
776
777 if (blend[i].is_shader) {
778 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
779 } else {
780 mrts[i].blend.equation = *blend[i].equation.equation;
781 mrts[i].blend.constant = blend[i].equation.constant;
782 }
783 }
784 }
785 }
786
787 static void
788 panfrost_frag_shader_meta_init(struct panfrost_context *ctx,
789 struct mali_shader_meta *fragmeta,
790 void *rts)
791 {
792 const struct panfrost_device *dev = pan_device(ctx->base.screen);
793 struct panfrost_shader_state *fs;
794
795 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
796
797 fragmeta->alpha_coverage = ~MALI_ALPHA_COVERAGE(0.000000);
798 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x3010;
799 fragmeta->unknown2_4 = 0x4e0;
800
801 /* unknown2_4 has the 0x10 bit set on T6XX and T720. We don't know why this
802 * is required (independent of 32-bit/64-bit descriptors), or why it's
803 * not used on later GPU revisions. Otherwise, all shader jobs fault on
804 * these earlier chips (perhaps this is a chicken bit of some kind).
805 * More investigation is needed. */
806
807 SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
808
809 if (dev->quirks & IS_BIFROST) {
810 /* TODO */
811 } else {
812 /* Depending on whether it's legal in the given shader, we try to
813 * enable early-z testing (or forward-pixel kill?) */
814
815 SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
816 !fs->can_discard && !fs->writes_depth);
817
818 /* Add the writes Z/S flags if needed. */
819 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
820 SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
821
822 /* Any time texturing is used, derivatives are implicitly calculated,
823 * so we need to enable helper invocations */
824
825 SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
826 fs->helper_invocations);
827
828 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
829
830 bool depth_enabled = fs->writes_depth ||
831 (zsa && zsa->depth.enabled && zsa->depth.func != PIPE_FUNC_ALWAYS);
832
833 SET_BIT(fragmeta->midgard1.flags_lo, 0x400, !depth_enabled && fs->can_discard);
834 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_ZS, depth_enabled && fs->can_discard);
835 }
836
837 panfrost_frag_meta_rasterizer_update(ctx, fragmeta);
838 panfrost_frag_meta_zsa_update(ctx, fragmeta);
839 panfrost_frag_meta_blend_update(ctx, fragmeta, rts);
840 }
841
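/* Upload the shader descriptor for a stage and point the postfix at it. For
 * fragment shaders, the per-render-target blend descriptors (Midgard MFBD or
 * Bifrost) are appended immediately after the mali_shader_meta in the same
 * transient allocation. */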
842 void
843 panfrost_emit_shader_meta(struct panfrost_batch *batch,
844 enum pipe_shader_type st,
845 struct mali_vertex_tiler_postfix *postfix)
846 {
847 struct panfrost_context *ctx = batch->ctx;
848 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
849
850 if (!ss) {
851 postfix->shader = 0;
852 return;
853 }
854
855 struct mali_shader_meta meta;
856
857 panfrost_shader_meta_init(ctx, st, &meta);
858
859 /* Add the shader BO to the batch. */
860 panfrost_batch_add_bo(batch, ss->bo,
861 PAN_BO_ACCESS_PRIVATE |
862 PAN_BO_ACCESS_READ |
863 panfrost_bo_access_for_stage(st));
864
865 mali_ptr shader_ptr;
866
867 if (st == PIPE_SHADER_FRAGMENT) {
868 struct panfrost_device *dev = pan_device(ctx->base.screen);
869 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
870 size_t desc_size = sizeof(meta);
871 void *rts = NULL;
872 struct panfrost_transfer xfer;
873 unsigned rt_size;
874
875 if (dev->quirks & MIDGARD_SFBD)
876 rt_size = 0;
877 else if (dev->quirks & IS_BIFROST)
878 rt_size = sizeof(struct bifrost_blend_rt);
879 else
880 rt_size = sizeof(struct midgard_blend_rt);
881
882 desc_size += rt_size * rt_count;
883
884 if (rt_size)
885 rts = rzalloc_size(ctx, rt_size * rt_count);
886
887 panfrost_frag_shader_meta_init(ctx, &meta, rts);
888
889 xfer = panfrost_allocate_transient(batch, desc_size);
890
891 memcpy(xfer.cpu, &meta, sizeof(meta));
892 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
893
894 if (rt_size)
895 ralloc_free(rts);
896
897 shader_ptr = xfer.gpu;
898 } else {
899 shader_ptr = panfrost_upload_transient(batch, &meta,
900 sizeof(meta));
901 }
902
903 postfix->shader = shader_ptr;
904 }
905
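/* Build a mali_viewport from Gallium viewport/scissor state: clip bounds are
 * left at +/- infinity, while the integer viewport rectangle is the viewport
 * intersected with the scissor (when rasterizer scissoring is enabled),
 * reordered so min <= max and clamped to the framebuffer size. */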
906 static void
907 panfrost_mali_viewport_init(struct panfrost_context *ctx,
908 struct mali_viewport *mvp)
909 {
910 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
911
912 /* Clip bounds are encoded as floats. The viewport itself is encoded as
913 * (somewhat) asymmetric ints. */
914
915 const struct pipe_scissor_state *ss = &ctx->scissor;
916
917 memset(mvp, 0, sizeof(*mvp));
918
919 /* By default, do no viewport clipping, i.e. clip to (-inf, inf) in
920 * each direction. Clipping to the viewport in theory should work, but
921 * in practice causes issues when we're not explicitly trying to
922 * scissor */
923
924 *mvp = (struct mali_viewport) {
925 .clip_minx = -INFINITY,
926 .clip_miny = -INFINITY,
927 .clip_maxx = INFINITY,
928 .clip_maxy = INFINITY,
929 };
930
931 /* Always scissor to the viewport by default. */
932 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
933 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
934
935 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
936 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
937
938 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
939 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
940
941 /* Apply the scissor test */
942
943 unsigned minx, miny, maxx, maxy;
944
945 if (ss && ctx->rasterizer && ctx->rasterizer->base.scissor) {
946 minx = MAX2(ss->minx, vp_minx);
947 miny = MAX2(ss->miny, vp_miny);
948 maxx = MIN2(ss->maxx, vp_maxx);
949 maxy = MIN2(ss->maxy, vp_maxy);
950 } else {
951 minx = vp_minx;
952 miny = vp_miny;
953 maxx = vp_maxx;
954 maxy = vp_maxy;
955 }
956
957 /* Hardware needs the min/max to be strictly ordered, so flip if we
958 * need to. The viewport transformation in the vertex shader will
959 * handle the negatives if we don't */
960
961 if (miny > maxy) {
962 unsigned temp = miny;
963 miny = maxy;
964 maxy = temp;
965 }
966
967 if (minx > maxx) {
968 unsigned temp = minx;
969 minx = maxx;
970 maxx = temp;
971 }
972
973 if (minz > maxz) {
974 float temp = minz;
975 minz = maxz;
976 maxz = temp;
977 }
978
979 /* Clamp to the framebuffer size as a last check */
980
981 minx = MIN2(ctx->pipe_framebuffer.width, minx);
982 maxx = MIN2(ctx->pipe_framebuffer.width, maxx);
983
984 miny = MIN2(ctx->pipe_framebuffer.height, miny);
985 maxy = MIN2(ctx->pipe_framebuffer.height, maxy);
986
987 /* Upload */
988
989 mvp->viewport0[0] = minx;
990 mvp->viewport1[0] = MALI_POSITIVE(maxx);
991
992 mvp->viewport0[1] = miny;
993 mvp->viewport1[1] = MALI_POSITIVE(maxy);
994
995 mvp->clip_minz = minz;
996 mvp->clip_maxz = maxz;
997 }
998
999 void
1000 panfrost_emit_viewport(struct panfrost_batch *batch,
1001 struct mali_vertex_tiler_postfix *tiler_postfix)
1002 {
1003 struct panfrost_context *ctx = batch->ctx;
1004 struct mali_viewport mvp;
1005
1006 panfrost_mali_viewport_init(batch->ctx, &mvp);
1007
1008 /* Update the job, unless we're doing wallpapering (whose lack of
1009 * scissor we can ignore, since if we "miss" a tile of wallpaper, it'll
1010 * just... be faster :) */
1011
1012 if (!ctx->wallpaper_batch)
1013 panfrost_batch_union_scissor(batch, mvp.viewport0[0],
1014 mvp.viewport0[1],
1015 mvp.viewport1[0] + 1,
1016 mvp.viewport1[1] + 1);
1017
1018 tiler_postfix->viewport = panfrost_upload_transient(batch, &mvp,
1019 sizeof(mvp));
1020 }
1021
1022 static mali_ptr
1023 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
1024 enum pipe_shader_type st,
1025 struct panfrost_constant_buffer *buf,
1026 unsigned index)
1027 {
1028 struct pipe_constant_buffer *cb = &buf->cb[index];
1029 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1030
1031 if (rsrc) {
1032 panfrost_batch_add_bo(batch, rsrc->bo,
1033 PAN_BO_ACCESS_SHARED |
1034 PAN_BO_ACCESS_READ |
1035 panfrost_bo_access_for_stage(st));
1036
1037 /* Alignment guaranteed by
1038 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
1039 return rsrc->bo->gpu + cb->buffer_offset;
1040 } else if (cb->user_buffer) {
1041 return panfrost_upload_transient(batch,
1042 cb->user_buffer +
1043 cb->buffer_offset,
1044 cb->buffer_size);
1045 } else {
1046 unreachable("No constant buffer");
1047 }
1048 }
1049
1050 struct sysval_uniform {
1051 union {
1052 float f[4];
1053 int32_t i[4];
1054 uint32_t u[4];
1055 uint64_t du[2];
1056 };
1057 };
1058
1059 static void
1060 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
1061 struct sysval_uniform *uniform)
1062 {
1063 struct panfrost_context *ctx = batch->ctx;
1064 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1065
1066 uniform->f[0] = vp->scale[0];
1067 uniform->f[1] = vp->scale[1];
1068 uniform->f[2] = vp->scale[2];
1069 }
1070
1071 static void
1072 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
1073 struct sysval_uniform *uniform)
1074 {
1075 struct panfrost_context *ctx = batch->ctx;
1076 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1077
1078 uniform->f[0] = vp->translate[0];
1079 uniform->f[1] = vp->translate[1];
1080 uniform->f[2] = vp->translate[2];
1081 }
1082
1083 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
1084 enum pipe_shader_type st,
1085 unsigned int sysvalid,
1086 struct sysval_uniform *uniform)
1087 {
1088 struct panfrost_context *ctx = batch->ctx;
1089 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
1090 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
1091 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
1092 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
1093
1094 assert(dim);
1095 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
1096
1097 if (dim > 1)
1098 uniform->i[1] = u_minify(tex->texture->height0,
1099 tex->u.tex.first_level);
1100
1101 if (dim > 2)
1102 uniform->i[2] = u_minify(tex->texture->depth0,
1103 tex->u.tex.first_level);
1104
1105 if (is_array)
1106 uniform->i[dim] = tex->texture->array_size;
1107 }
1108
1109 static void
1110 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
1111 enum pipe_shader_type st,
1112 unsigned ssbo_id,
1113 struct sysval_uniform *uniform)
1114 {
1115 struct panfrost_context *ctx = batch->ctx;
1116
1117 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
1118 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
1119
1120 /* Compute address */
1121 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
1122
1123 panfrost_batch_add_bo(batch, bo,
1124 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
1125 panfrost_bo_access_for_stage(st));
1126
1127 /* Upload address and size as sysval */
1128 uniform->du[0] = bo->gpu + sb.buffer_offset;
1129 uniform->u[2] = sb.buffer_size;
1130 }
1131
1132 static void
1133 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1134 enum pipe_shader_type st,
1135 unsigned samp_idx,
1136 struct sysval_uniform *uniform)
1137 {
1138 struct panfrost_context *ctx = batch->ctx;
1139 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1140
1141 uniform->f[0] = sampl->min_lod;
1142 uniform->f[1] = sampl->max_lod;
1143 uniform->f[2] = sampl->lod_bias;
1144
1145 /* Even without any errata, Midgard represents "no mipmapping" as
1146 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1147 * panfrost_create_sampler_state which also explains our choice of
1148 * epsilon value (again to keep behaviour consistent) */
1149
1150 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1151 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1152 }
1153
1154 static void
1155 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1156 struct sysval_uniform *uniform)
1157 {
1158 struct panfrost_context *ctx = batch->ctx;
1159
1160 uniform->u[0] = ctx->compute_grid->grid[0];
1161 uniform->u[1] = ctx->compute_grid->grid[1];
1162 uniform->u[2] = ctx->compute_grid->grid[2];
1163 }
1164
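/* Walk the shader's sysval table and fill one 16-byte uniform slot per
 * sysval (viewport scale/offset, texture sizes, SSBO address/size, workgroup
 * counts, sampler LOD parameters). These slots precede the user uniforms in
 * the buffer built by panfrost_emit_const_buf. */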
1165 static void
1166 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1167 struct panfrost_shader_state *ss,
1168 enum pipe_shader_type st)
1169 {
1170 struct sysval_uniform *uniforms = (void *)buf;
1171
1172 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1173 int sysval = ss->sysval[i];
1174
1175 switch (PAN_SYSVAL_TYPE(sysval)) {
1176 case PAN_SYSVAL_VIEWPORT_SCALE:
1177 panfrost_upload_viewport_scale_sysval(batch,
1178 &uniforms[i]);
1179 break;
1180 case PAN_SYSVAL_VIEWPORT_OFFSET:
1181 panfrost_upload_viewport_offset_sysval(batch,
1182 &uniforms[i]);
1183 break;
1184 case PAN_SYSVAL_TEXTURE_SIZE:
1185 panfrost_upload_txs_sysval(batch, st,
1186 PAN_SYSVAL_ID(sysval),
1187 &uniforms[i]);
1188 break;
1189 case PAN_SYSVAL_SSBO:
1190 panfrost_upload_ssbo_sysval(batch, st,
1191 PAN_SYSVAL_ID(sysval),
1192 &uniforms[i]);
1193 break;
1194 case PAN_SYSVAL_NUM_WORK_GROUPS:
1195 panfrost_upload_num_work_groups_sysval(batch,
1196 &uniforms[i]);
1197 break;
1198 case PAN_SYSVAL_SAMPLER:
1199 panfrost_upload_sampler_sysval(batch, st,
1200 PAN_SYSVAL_ID(sysval),
1201 &uniforms[i]);
1202 break;
1203 default:
1204 assert(0);
1205 }
1206 }
1207 }
1208
1209 static const void *
1210 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1211 unsigned index)
1212 {
1213 struct pipe_constant_buffer *cb = &buf->cb[index];
1214 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1215
1216 if (rsrc)
1217 return rsrc->bo->cpu;
1218 else if (cb->user_buffer)
1219 return cb->user_buffer;
1220 else
1221 unreachable("No constant buffer");
1222 }
1223
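/* Emit uniforms and uniform buffers for a stage. A transient buffer holding
 * sysvals followed by the contents of constant buffer 0 is exposed to the
 * shader as UBO #0; the remaining UBOs point directly at their backing
 * storage, with disabled or empty slots stubbed out to a poison address. */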
1224 void
1225 panfrost_emit_const_buf(struct panfrost_batch *batch,
1226 enum pipe_shader_type stage,
1227 struct mali_vertex_tiler_postfix *postfix)
1228 {
1229 struct panfrost_context *ctx = batch->ctx;
1230 struct panfrost_shader_variants *all = ctx->shader[stage];
1231
1232 if (!all)
1233 return;
1234
1235 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1236
1237 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1238
1239 /* Uniforms are implicitly UBO #0 */
1240 bool has_uniforms = buf->enabled_mask & (1 << 0);
1241
1242 /* Allocate room for the sysvals and the uniforms */
1243 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1244 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1245 size_t size = sys_size + uniform_size;
1246 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1247 size);
1248
1249 /* Upload sysvals requested by the shader */
1250 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1251
1252 /* Upload uniforms */
1253 if (has_uniforms && uniform_size) {
1254 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1255 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1256 }
1257
1258 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1259 * uploaded */
1260
1261 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1262 assert(ubo_count >= 1);
1263
1264 size_t sz = sizeof(uint64_t) * ubo_count;
1265 uint64_t ubos[PAN_MAX_CONST_BUFFERS];
1266 int uniform_count = ss->uniform_count;
1267
1268 /* Upload uniforms as a UBO */
1269 ubos[0] = MALI_MAKE_UBO(2 + uniform_count, transfer.gpu);
1270
1271 /* The rest are honest-to-goodness UBOs */
1272
1273 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1274 size_t usz = buf->cb[ubo].buffer_size;
1275 bool enabled = buf->enabled_mask & (1 << ubo);
1276 bool empty = usz == 0;
1277
1278 if (!enabled || empty) {
1279 /* Stub out disabled UBOs to catch accesses */
1280 ubos[ubo] = MALI_MAKE_UBO(0, 0xDEAD0000);
1281 continue;
1282 }
1283
1284 mali_ptr gpu = panfrost_map_constant_buffer_gpu(batch, stage,
1285 buf, ubo);
1286
1287 unsigned bytes_per_field = 16;
1288 unsigned aligned = ALIGN_POT(usz, bytes_per_field);
1289 ubos[ubo] = MALI_MAKE_UBO(aligned / bytes_per_field, gpu);
1290 }
1291
1292 mali_ptr ubufs = panfrost_upload_transient(batch, ubos, sz);
1293 postfix->uniforms = transfer.gpu;
1294 postfix->uniform_buffers = ubufs;
1295
1296 buf->dirty_mask = 0;
1297 }
1298
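/* Emit the shared memory descriptor for a compute dispatch: the per-workgroup
 * size is rounded up to a power of two (at least 128 bytes) and encoded as a
 * shift, backed by a batch-shared BO sized for the whole grid. */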
1299 void
1300 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1301 const struct pipe_grid_info *info,
1302 struct midgard_payload_vertex_tiler *vtp)
1303 {
1304 struct panfrost_context *ctx = batch->ctx;
1305 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1306 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1307 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1308 128));
1309 unsigned shared_size = single_size * info->grid[0] * info->grid[1] *
1310 info->grid[2] * 4;
1311 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1312 shared_size,
1313 1);
1314
1315 struct mali_shared_memory shared = {
1316 .shared_memory = bo->gpu,
1317 .shared_workgroup_count =
1318 util_logbase2_ceil(info->grid[0]) +
1319 util_logbase2_ceil(info->grid[1]) +
1320 util_logbase2_ceil(info->grid[2]),
1321 .shared_unk1 = 0x2,
1322 .shared_shift = util_logbase2(single_size) - 1
1323 };
1324
1325 vtp->postfix.shared_memory = panfrost_upload_transient(batch, &shared,
1326 sizeof(shared));
1327 }
1328
1329 static mali_ptr
1330 panfrost_get_tex_desc(struct panfrost_batch *batch,
1331 enum pipe_shader_type st,
1332 struct panfrost_sampler_view *view)
1333 {
1334 if (!view)
1335 return (mali_ptr) 0;
1336
1337 struct pipe_sampler_view *pview = &view->base;
1338 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1339
1340 /* Add the BO to the job so it's retained until the job is done. */
1341
1342 panfrost_batch_add_bo(batch, rsrc->bo,
1343 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1344 panfrost_bo_access_for_stage(st));
1345
1346 panfrost_batch_add_bo(batch, view->midgard_bo,
1347 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1348 panfrost_bo_access_for_stage(st));
1349
1350 return view->midgard_bo->gpu;
1351 }
1352
1353 void
1354 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1355 enum pipe_shader_type stage,
1356 struct mali_vertex_tiler_postfix *postfix)
1357 {
1358 struct panfrost_context *ctx = batch->ctx;
1359 struct panfrost_device *device = pan_device(ctx->base.screen);
1360
1361 if (!ctx->sampler_view_count[stage])
1362 return;
1363
1364 if (device->quirks & IS_BIFROST) {
1365 struct bifrost_texture_descriptor *descriptors;
1366
1367 descriptors = malloc(sizeof(struct bifrost_texture_descriptor) *
1368 ctx->sampler_view_count[stage]);
1369
1370 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1371 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1372 struct pipe_sampler_view *pview = &view->base;
1373 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1374
1375 /* Add the BOs to the job so they are retained until the job is done. */
1376
1377 panfrost_batch_add_bo(batch, rsrc->bo,
1378 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1379 panfrost_bo_access_for_stage(stage));
1380
1381 panfrost_batch_add_bo(batch, view->bifrost_bo,
1382 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1383 panfrost_bo_access_for_stage(stage));
1384
1385 memcpy(&descriptors[i], view->bifrost_descriptor, sizeof(*view->bifrost_descriptor));
1386 }
1387
1388 postfix->textures = panfrost_upload_transient(batch,
1389 descriptors,
1390 sizeof(struct bifrost_texture_descriptor) *
1391 ctx->sampler_view_count[stage]);
1392
1393 free(descriptors);
1394 } else {
1395 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1396
1397 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i)
1398 trampolines[i] = panfrost_get_tex_desc(batch, stage,
1399 ctx->sampler_views[stage][i]);
1400
1401 postfix->textures = panfrost_upload_transient(batch,
1402 trampolines,
1403 sizeof(uint64_t) *
1404 ctx->sampler_view_count[stage]);
1405 }
1406 }
1407
1408 void
1409 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1410 enum pipe_shader_type stage,
1411 struct mali_vertex_tiler_postfix *postfix)
1412 {
1413 struct panfrost_context *ctx = batch->ctx;
1414 struct panfrost_device *device = pan_device(ctx->base.screen);
1415
1416 if (!ctx->sampler_count[stage])
1417 return;
1418
1419 if (device->quirks & IS_BIFROST) {
1420 size_t desc_size = sizeof(struct bifrost_sampler_descriptor);
1421 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1422 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1423 transfer_size);
1424 struct bifrost_sampler_descriptor *desc = (struct bifrost_sampler_descriptor *)transfer.cpu;
1425
1426 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1427 desc[i] = ctx->samplers[stage][i]->bifrost_hw;
1428
1429 postfix->sampler_descriptor = transfer.gpu;
1430 } else {
1431 size_t desc_size = sizeof(struct mali_sampler_descriptor);
1432 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1433 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1434 transfer_size);
1435 struct mali_sampler_descriptor *desc = (struct mali_sampler_descriptor *)transfer.cpu;
1436
1437 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1438 desc[i] = ctx->samplers[stage][i]->midgard_hw;
1439
1440 postfix->sampler_descriptor = transfer.gpu;
1441 }
1442 }
1443
1444 void
1445 panfrost_emit_vertex_attr_meta(struct panfrost_batch *batch,
1446 struct mali_vertex_tiler_postfix *vertex_postfix)
1447 {
1448 struct panfrost_context *ctx = batch->ctx;
1449
1450 if (!ctx->vertex)
1451 return;
1452
1453 struct panfrost_vertex_state *so = ctx->vertex;
1454
1455 panfrost_vertex_state_upd_attr_offs(ctx, vertex_postfix);
1456 vertex_postfix->attribute_meta = panfrost_upload_transient(batch, so->hw,
1457 sizeof(*so->hw) *
1458 PAN_MAX_ATTRIBUTE);
1459 }
1460
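/* Emit the vertex buffer (mali_attr) records themselves. Buffer addresses are
 * aligned down to 64 bytes (the chopped-off offset is added back through the
 * attribute meta src_offset fixup), instanced attributes get the divisor
 * shift/odd encoding, and synthetic gl_VertexID / gl_InstanceID buffers are
 * appended at the end. */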
1461 void
1462 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1463 struct mali_vertex_tiler_postfix *vertex_postfix)
1464 {
1465 struct panfrost_context *ctx = batch->ctx;
1466 struct panfrost_vertex_state *so = ctx->vertex;
1467
1468 /* Staged mali_attr, and index into them. i =/= k, depending on the
1469 * vertex buffer mask and instancing. Twice as much room is allocated,
1470 * for a worst case of NPOT_DIVIDEs, which take up an extra slot */
1471 union mali_attr attrs[PIPE_MAX_ATTRIBS * 2];
1472 unsigned k = 0;
1473
1474 for (unsigned i = 0; i < so->num_elements; ++i) {
1475 /* We map a mali_attr to be 1:1 with the mali_attr_meta, which
1476 * means duplicating some vertex buffers (who cares? aside from
1477 * maybe some caching implications but I somehow doubt that
1478 * matters) */
1479
1480 struct pipe_vertex_element *elem = &so->pipe[i];
1481 unsigned vbi = elem->vertex_buffer_index;
1482
1483 /* The exception to 1:1 mapping is that we can have multiple
1484 * entries (NPOT divisors), so we fix up anyway */
1485
1486 so->hw[i].index = k;
1487
1488 if (!(ctx->vb_mask & (1 << vbi)))
1489 continue;
1490
1491 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1492 struct panfrost_resource *rsrc;
1493
1494 rsrc = pan_resource(buf->buffer.resource);
1495 if (!rsrc)
1496 continue;
1497
1498 /* Align to 64 bytes by masking off the lower bits. This
1499 * will be adjusted back when we fixup the src_offset in
1500 * mali_attr_meta */
1501
1502 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1503 mali_ptr addr = raw_addr & ~63;
1504 unsigned chopped_addr = raw_addr - addr;
1505
1506 /* Add a dependency of the batch on the vertex buffer */
1507 panfrost_batch_add_bo(batch, rsrc->bo,
1508 PAN_BO_ACCESS_SHARED |
1509 PAN_BO_ACCESS_READ |
1510 PAN_BO_ACCESS_VERTEX_TILER);
1511
1512 /* Set common fields */
1513 attrs[k].elements = addr;
1514 attrs[k].stride = buf->stride;
1515
1516 /* Since we advanced the base pointer, we shrink the buffer
1517 * size */
1518 attrs[k].size = rsrc->base.width0 - buf->buffer_offset;
1519
1520 /* We need to add the extra size we masked off (for
1521 * correctness) so the data doesn't get clamped away */
1522 attrs[k].size += chopped_addr;
1523
1524 /* For non-instancing make sure we initialize */
1525 attrs[k].shift = attrs[k].extra_flags = 0;
1526
1527 /* Instancing uses a dramatically different code path than
1528 * linear, so dispatch for the actual emission now that the
1529 * common code is finished */
1530
1531 unsigned divisor = elem->instance_divisor;
1532
1533 if (divisor && ctx->instance_count == 1) {
1534 /* Silly corner case where there's a divisor(=1) but
1535 * there's no legitimate instancing. So we want *every*
1536 * attribute to be the same. So set stride to zero so
1537 * we don't go anywhere. */
1538
1539 attrs[k].size = attrs[k].stride + chopped_addr;
1540 attrs[k].stride = 0;
1541 attrs[k++].elements |= MALI_ATTR_LINEAR;
1542 } else if (ctx->instance_count <= 1) {
1543 /* Normal, non-instanced attributes */
1544 attrs[k++].elements |= MALI_ATTR_LINEAR;
1545 } else {
1546 unsigned instance_shift = vertex_postfix->instance_shift;
1547 unsigned instance_odd = vertex_postfix->instance_odd;
1548
1549 k += panfrost_vertex_instanced(ctx->padded_count,
1550 instance_shift,
1551 instance_odd,
1552 divisor, &attrs[k]);
1553 }
1554 }
1555
1556 /* Add special gl_VertexID/gl_InstanceID buffers */
1557
1558 panfrost_vertex_id(ctx->padded_count, &attrs[k]);
1559 so->hw[PAN_VERTEX_ID].index = k++;
1560 panfrost_instance_id(ctx->padded_count, &attrs[k]);
1561 so->hw[PAN_INSTANCE_ID].index = k++;
1562
1563 /* Upload whatever we emitted and go */
1564
1565 vertex_postfix->attributes = panfrost_upload_transient(batch, attrs,
1566 k * sizeof(*attrs));
1567 }
1568
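/* Allocate transient storage for one varying stream (stride * count bytes),
 * fill in its mali_attr record and return the GPU address of the storage. */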
1569 static mali_ptr
1570 panfrost_emit_varyings(struct panfrost_batch *batch, union mali_attr *slot,
1571 unsigned stride, unsigned count)
1572 {
1573 /* Fill out the descriptor */
1574 slot->stride = stride;
1575 slot->size = stride * count;
1576 slot->shift = slot->extra_flags = 0;
1577
1578 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1579 slot->size);
1580
1581 slot->elements = transfer.gpu | MALI_ATTR_LINEAR;
1582
1583 return transfer.gpu;
1584 }
1585
1586 static void
1587 panfrost_emit_streamout(struct panfrost_batch *batch, union mali_attr *slot,
1588 unsigned stride, unsigned offset, unsigned count,
1589 struct pipe_stream_output_target *target)
1590 {
1591 /* Fill out the descriptor */
1592 slot->stride = stride * 4;
1593 slot->shift = slot->extra_flags = 0;
1594
1595 unsigned max_size = target->buffer_size;
1596 unsigned expected_size = slot->stride * count;
1597
1598 slot->size = MIN2(max_size, expected_size);
1599
1600 /* Grab the BO and bind it to the batch */
1601 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1602
1603 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1604 * the perspective of the TILER and FRAGMENT.
1605 */
1606 panfrost_batch_add_bo(batch, bo,
1607 PAN_BO_ACCESS_SHARED |
1608 PAN_BO_ACCESS_RW |
1609 PAN_BO_ACCESS_VERTEX_TILER |
1610 PAN_BO_ACCESS_FRAGMENT);
1611
1612 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * slot->stride);
1613 slot->elements = addr;
1614 }
1615
1616 /* Given a shader and buffer indices, link varying metadata together */
1617
1618 static bool
1619 is_special_varying(gl_varying_slot loc)
1620 {
1621 switch (loc) {
1622 case VARYING_SLOT_POS:
1623 case VARYING_SLOT_PSIZ:
1624 case VARYING_SLOT_PNTC:
1625 case VARYING_SLOT_FACE:
1626 return true;
1627 default:
1628 return false;
1629 }
1630 }
1631
1632 static void
1633 panfrost_emit_varying_meta(void *outptr, struct panfrost_shader_state *ss,
1634 signed general, signed gl_Position,
1635 signed gl_PointSize, signed gl_PointCoord,
1636 signed gl_FrontFacing)
1637 {
1638 struct mali_attr_meta *out = (struct mali_attr_meta *) outptr;
1639
1640 for (unsigned i = 0; i < ss->varying_count; ++i) {
1641 gl_varying_slot location = ss->varyings_loc[i];
1642 int index = -1;
1643
1644 switch (location) {
1645 case VARYING_SLOT_POS:
1646 index = gl_Position;
1647 break;
1648 case VARYING_SLOT_PSIZ:
1649 index = gl_PointSize;
1650 break;
1651 case VARYING_SLOT_PNTC:
1652 index = gl_PointCoord;
1653 break;
1654 case VARYING_SLOT_FACE:
1655 index = gl_FrontFacing;
1656 break;
1657 default:
1658 index = general;
1659 break;
1660 }
1661
1662 assert(index >= 0);
1663 out[i].index = index;
1664 }
1665 }
1666
1667 static bool
1668 has_point_coord(unsigned mask, gl_varying_slot loc)
1669 {
1670 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1671 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1672 else if (loc == VARYING_SLOT_PNTC)
1673 return (mask & (1 << 8));
1674 else
1675 return false;
1676 }
1677
1678 /* Helpers for manipulating stream out information so we can pack varyings
1679 * accordingly. Compute the src_offset for a given captured varying */
1680
1681 static struct pipe_stream_output *
1682 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1683 {
1684 for (unsigned i = 0; i < info->num_outputs; ++i) {
1685 if (info->output[i].register_index == loc)
1686 return &info->output[i];
1687 }
1688
1689 unreachable("Varying not captured");
1690 }
1691
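/* Emit the varying buffers and the per-shader varying metadata. Buffer
 * indices are assigned in order: one slot per streamout target, then the
 * general varying buffer, gl_Position, and optional gl_PointSize,
 * gl_PointCoord, gl_FrontFacing and gl_FragCoord slots. Vertex and fragment
 * metadata are linked by matching varying locations so both agree on each
 * varying's src_offset. */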
1692 void
1693 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1694 unsigned vertex_count,
1695 struct mali_vertex_tiler_postfix *vertex_postfix,
1696 struct mali_vertex_tiler_postfix *tiler_postfix,
1697 union midgard_primitive_size *primitive_size)
1698 {
1699 /* Load the shaders */
1700 struct panfrost_context *ctx = batch->ctx;
1701 struct panfrost_shader_state *vs, *fs;
1702 unsigned int num_gen_varyings = 0;
1703 size_t vs_size, fs_size;
1704
1705 /* Allocate the varying descriptor */
1706
1707 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1708 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1709 vs_size = sizeof(struct mali_attr_meta) * vs->varying_count;
1710 fs_size = sizeof(struct mali_attr_meta) * fs->varying_count;
1711
1712 struct panfrost_transfer trans = panfrost_allocate_transient(batch,
1713 vs_size +
1714 fs_size);
1715
1716 struct pipe_stream_output_info *so = &vs->stream_output;
1717
1718 /* Check if this varying is linked by us. This is the case for
1719 * general-purpose, non-captured varyings. If it is, link it. If it's
1720 * not, use the provided stream out information to determine the
1721 * offset, since it was already linked for us. */
1722
1723 for (unsigned i = 0; i < vs->varying_count; i++) {
1724 gl_varying_slot loc = vs->varyings_loc[i];
1725
1726 bool special = is_special_varying(loc);
1727 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1728
1729 if (captured) {
1730 struct pipe_stream_output *o = pan_get_so(so, loc);
1731
1732 unsigned dst_offset = o->dst_offset * 4; /* dwords */
1733 vs->varyings[i].src_offset = dst_offset;
1734 } else if (!special) {
1735 vs->varyings[i].src_offset = 16 * (num_gen_varyings++);
1736 }
1737 }
1738
1739 /* Conversely, we need to set src_offset for the captured varyings.
1740 * Here, the layout is defined by the stream out info, not us */
1741
1742 /* Link up with fragment varyings */
1743 bool reads_point_coord = fs->reads_point_coord;
1744
1745 for (unsigned i = 0; i < fs->varying_count; i++) {
1746 gl_varying_slot loc = fs->varyings_loc[i];
1747 unsigned src_offset;
1748 signed vs_idx = -1;
1749
1750 /* Link up */
1751 for (unsigned j = 0; j < vs->varying_count; ++j) {
1752 if (vs->varyings_loc[j] == loc) {
1753 vs_idx = j;
1754 break;
1755 }
1756 }
1757
1758 /* Either assign or reuse */
1759 if (vs_idx >= 0)
1760 src_offset = vs->varyings[vs_idx].src_offset;
1761 else
1762 src_offset = 16 * (num_gen_varyings++);
1763
1764 fs->varyings[i].src_offset = src_offset;
1765
1766 if (has_point_coord(fs->point_sprite_mask, loc))
1767 reads_point_coord = true;
1768 }
1769
1770 memcpy(trans.cpu, vs->varyings, vs_size);
1771 memcpy(trans.cpu + vs_size, fs->varyings, fs_size);
1772
1773 union mali_attr varyings[PIPE_MAX_ATTRIBS] = {0};
1774
1775 /* Figure out how many streamout buffers could be bound */
1776 unsigned so_count = ctx->streamout.num_targets;
1777 for (unsigned i = 0; i < vs->varying_count; i++) {
1778 gl_varying_slot loc = vs->varyings_loc[i];
1779
1780 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1781 if (!captured) continue;
1782
1783 struct pipe_stream_output *o = pan_get_so(so, loc);
1784 so_count = MAX2(so_count, o->output_buffer + 1);
1785 }
1786
1787 signed idx = so_count;
1788 signed general = idx++;
1789 signed gl_Position = idx++;
1790 signed gl_PointSize = vs->writes_point_size ? (idx++) : -1;
1791 signed gl_PointCoord = reads_point_coord ? (idx++) : -1;
1792 signed gl_FrontFacing = fs->reads_face ? (idx++) : -1;
1793 signed gl_FragCoord = fs->reads_frag_coord ? (idx++) : -1;
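/* Illustrative layout of the buffer indices assigned above, assuming two
 * bound streamout targets, a vertex shader that writes gl_PointSize and a
 * fragment shader that reads gl_PointCoord and gl_FrontFacing but not
 * gl_FragCoord:
 *
 *     0..1 : streamout buffers
 *     2    : general varyings
 *     3    : gl_Position
 *     4    : gl_PointSize
 *     5    : gl_PointCoord
 *     6    : gl_FrontFacing
 *            (gl_FragCoord = -1, i.e. no buffer)
 *
 * idx would end up at 7, which is also the number of mali_attr records
 * uploaded at the end of this function. */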
1794
1795 /* Emit the stream out buffers */
1796
1797 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1798 ctx->vertex_count);
1799
1800 for (unsigned i = 0; i < so_count; ++i) {
1801 if (i < ctx->streamout.num_targets) {
1802 panfrost_emit_streamout(batch, &varyings[i],
1803 so->stride[i],
1804 ctx->streamout.offsets[i],
1805 out_count,
1806 ctx->streamout.targets[i]);
1807 } else {
1808 /* Emit a dummy buffer */
1809 panfrost_emit_varyings(batch, &varyings[i],
1810 so->stride[i] * 4,
1811 out_count);
1812
1813 /* Clear the attribute type */
1814 varyings[i].elements &= ~0xF;
1815 }
1816 }
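/* Note on the dummy path above: when a captured output references a buffer
 * index with no bound streamout target, a transient buffer sized for
 * out_count vertices at the captured stride in bytes (the stride is stored
 * in dwords, hence the * 4) is still allocated so the writes land in
 * scratch memory, and masking off the low four bits clears the attribute
 * type, turning the record back into a plain buffer. */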
1817
1818 panfrost_emit_varyings(batch, &varyings[general],
1819 num_gen_varyings * 16,
1820 vertex_count);
1821
1822 mali_ptr varyings_p;
1823
1824 /* fp32 vec4 gl_Position */
1825 varyings_p = panfrost_emit_varyings(batch, &varyings[gl_Position],
1826 sizeof(float) * 4, vertex_count);
1827 tiler_postfix->position_varying = varyings_p;
1828
1830 if (panfrost_writes_point_size(ctx)) {
1831 varyings_p = panfrost_emit_varyings(batch,
1832 &varyings[gl_PointSize],
1833 2, vertex_count);
1834 primitive_size->pointer = varyings_p;
1835 }
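/* The 2 bytes per vertex allocated above presumably hold a half-float point
 * size; primitive_size->pointer hands that per-vertex buffer to the tiler in
 * place of a constant size. */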
1836
1837 if (reads_point_coord)
1838 varyings[gl_PointCoord].elements = MALI_VARYING_POINT_COORD;
1839
1840 if (fs->reads_face)
1841 varyings[gl_FrontFacing].elements = MALI_VARYING_FRONT_FACING;
1842
1843 if (fs->reads_frag_coord)
1844 varyings[gl_FragCoord].elements = MALI_VARYING_FRAG_COORD;
1845
1846 struct panfrost_device *device = pan_device(ctx->base.screen);
1847 assert(!(device->quirks & IS_BIFROST) || !(reads_point_coord));
1848
1849 /* Let's go ahead and link varying meta to the buffer in question, now
1850 * that that information is available. VARYING_SLOT_POS is mapped to
1851 * gl_FragCoord for fragment shaders but gl_Position for vertex shaders
1852 * */
1853
1854 panfrost_emit_varying_meta(trans.cpu, vs, general, gl_Position,
1855 gl_PointSize, gl_PointCoord,
1856 gl_FrontFacing);
1857
1858 panfrost_emit_varying_meta(trans.cpu + vs_size, fs, general,
1859 gl_FragCoord, gl_PointSize,
1860 gl_PointCoord, gl_FrontFacing);
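/* Note the argument order in the second call: the fragment side passes
 * gl_FragCoord where the vertex side passed gl_Position, which is what maps
 * VARYING_SLOT_POS to the gl_FragCoord buffer for fragment shaders as
 * described above. */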
1861
1862 /* Replace streamout */
1863
1864 struct mali_attr_meta *ovs = (struct mali_attr_meta *)trans.cpu;
1865 struct mali_attr_meta *ofs = ovs + vs->varying_count;
1866
1867 for (unsigned i = 0; i < vs->varying_count; i++) {
1868 gl_varying_slot loc = vs->varyings_loc[i];
1869
1870 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1871 if (!captured)
1872 continue;
1873
1874 struct pipe_stream_output *o = pan_get_so(so, loc);
1875 ovs[i].index = o->output_buffer;
1876
1877 assert(o->stream == 0);
1878 ovs[i].format = (vs->varyings[i].format & ~MALI_NR_CHANNELS(4))
1879 | MALI_NR_CHANNELS(o->num_components);
1880
1881 if (device->quirks & HAS_SWIZZLES)
1882 ovs[i].swizzle = panfrost_get_default_swizzle(o->num_components);
1883 else
1884 ovs[i].swizzle = panfrost_bifrost_swizzle(o->num_components);
1885
1886 /* Link to the fragment */
1887 signed fs_idx = -1;
1888
1889 /* Link up */
1890 for (unsigned j = 0; j < fs->varying_count; ++j) {
1891 if (fs->varyings_loc[j] == loc) {
1892 fs_idx = j;
1893 break;
1894 }
1895 }
1896
1897 if (fs_idx >= 0) {
1898 ofs[fs_idx].index = ovs[i].index;
1899 ofs[fs_idx].format = ovs[i].format;
1900 ofs[fs_idx].swizzle = ovs[i].swizzle;
1901 }
1902 }
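/* Illustrative example: if a captured output has num_components == 3, the
 * rewrite above clears the channel-count field of the vertex record
 * (MALI_NR_CHANNELS(4) presumably being that field's all-ones value),
 * re-inserts MALI_NR_CHANNELS(3), picks a 3-component identity swizzle for
 * the GPU family, and copies index/format/swizzle to the matching fragment
 * record so both stages agree on the packed streamout layout. */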
1903
1904 /* Replace point sprite */
1905 for (unsigned i = 0; i < fs->varying_count; i++) {
1906 /* If we have a point sprite replacement, handle that here. We
1907 * have to translate the location first. TODO: flip y in the
1908 * shader; we already key shader variants, it's just a time crunch */
1909
1910 if (has_point_coord(fs->point_sprite_mask,
1911 fs->varyings_loc[i])) {
1912 ofs[i].index = gl_PointCoord;
1913
1914 /* Swizzle out the z/w to 0/1 */
1915 ofs[i].format = MALI_RG16F;
1916 ofs[i].swizzle = panfrost_get_default_swizzle(2);
1917 }
1918 }
1919
1920 /* Fix up unaligned addresses */
1921 for (unsigned i = 0; i < so_count; ++i) {
1922 if (varyings[i].elements < MALI_RECORD_SPECIAL)
1923 continue;
1924
1925 unsigned align = (varyings[i].elements & 63);
1926
1927 /* While we're at it, the SO buffers are linear */
1928
1929 if (!align) {
1930 varyings[i].elements |= MALI_ATTR_LINEAR;
1931 continue;
1932 }
1933
1934 /* We need to adjust alignment */
1935 varyings[i].elements &= ~63;
1936 varyings[i].elements |= MALI_ATTR_LINEAR;
1937 varyings[i].size += align;
1938
1939 for (unsigned v = 0; v < vs->varying_count; ++v) {
1940 if (ovs[v].index != i)
1941 continue;
1942
1943 ovs[v].src_offset = vs->varyings[v].src_offset + align;
1944 }
1945
1946 for (unsigned f = 0; f < fs->varying_count; ++f) {
1947 if (ofs[f].index != i)
1948 continue;
1949
1950 ofs[f].src_offset = fs->varyings[f].src_offset + align;
1951 }
1952 }
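/* Worked example (illustrative): suppose a streamout record ends up with an
 * address whose low six bits are 36. Then align = 36, the record is rebased
 * down to the 64-byte boundary and marked MALI_ATTR_LINEAR, its size grows
 * by 36 bytes so the original range stays covered, and every vertex/fragment
 * attr_meta pointing at that buffer gets src_offset += 36 so reads still hit
 * the intended byte. */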
1953
1954 varyings_p = panfrost_upload_transient(batch, varyings,
1955 idx * sizeof(*varyings));
1956 vertex_postfix->varyings = varyings_p;
1957 tiler_postfix->varyings = varyings_p;
1958
1959 vertex_postfix->varying_meta = trans.gpu;
1960 tiler_postfix->varying_meta = trans.gpu + vs_size;
1961 }
1962
1963 void
1964 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
1965 struct mali_vertex_tiler_prefix *vertex_prefix,
1966 struct mali_vertex_tiler_postfix *vertex_postfix,
1967 struct mali_vertex_tiler_prefix *tiler_prefix,
1968 struct mali_vertex_tiler_postfix *tiler_postfix,
1969 union midgard_primitive_size *primitive_size)
1970 {
1971 struct panfrost_context *ctx = batch->ctx;
1972 struct panfrost_device *device = pan_device(ctx->base.screen);
1973 bool wallpapering = ctx->wallpaper_batch && batch->tiler_dep;
1974 struct bifrost_payload_vertex bifrost_vertex = {0,};
1975 struct bifrost_payload_tiler bifrost_tiler = {0,};
1976 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
1977 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
1978 void *vp, *tp;
1979 size_t vp_size, tp_size;
1980
1981 if (device->quirks & IS_BIFROST) {
1982 bifrost_vertex.prefix = *vertex_prefix;
1983 bifrost_vertex.postfix = *vertex_postfix;
1984 vp = &bifrost_vertex;
1985 vp_size = sizeof(bifrost_vertex);
1986
1987 bifrost_tiler.prefix = *tiler_prefix;
1988 bifrost_tiler.tiler.primitive_size = *primitive_size;
1989 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
1990 bifrost_tiler.postfix = *tiler_postfix;
1991 tp = &bifrost_tiler;
1992 tp_size = sizeof(bifrost_tiler);
1993 } else {
1994 midgard_vertex.prefix = *vertex_prefix;
1995 midgard_vertex.postfix = *vertex_postfix;
1996 vp = &midgard_vertex;
1997 vp_size = sizeof(midgard_vertex);
1998
1999 midgard_tiler.prefix = *tiler_prefix;
2000 midgard_tiler.postfix = *tiler_postfix;
2001 midgard_tiler.primitive_size = *primitive_size;
2002 tp = &midgard_tiler;
2003 tp_size = sizeof(midgard_tiler);
2004 }
2005
2006 if (wallpapering) {
2007 /* Inject in reverse order, with "predicted" job indices.
2008 * THIS IS A HACK XXX */
2009 panfrost_new_job(batch, JOB_TYPE_TILER, false,
2010 batch->job_index + 2, tp, tp_size, true);
2011 panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
2012 vp, vp_size, true);
2013 return;
2014 }
2015
2016 /* If rasterizer discard is enabled, only submit the vertex job */
2017
2018 bool rasterizer_discard = ctx->rasterizer &&
2019 ctx->rasterizer->base.rasterizer_discard;
2020
2021 unsigned vertex = panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
2022 vp, vp_size, false);
2023
2024 if (rasterizer_discard)
2025 return;
2026
2027 panfrost_new_job(batch, JOB_TYPE_TILER, false, vertex, tp, tp_size,
2028 false);
2029 }
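/* In the normal (non-wallpaper) path above, the vertex job's index is passed
 * in the tiler job's dependency slot, so tiling for the draw only starts once
 * vertex shading has finished; with rasterizer discard only the vertex job is
 * submitted since nothing will be rasterized. */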
2030
2031 /* TODO: stop hardcoding this */
2032 mali_ptr
2033 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2034 {
2035 uint16_t locations[] = {
2036 128, 128,
2037 0, 256,
2038 0, 256,
2039 0, 256,
2040 0, 256,
2041 0, 256,
2042 0, 256,
2043 0, 256,
2044 0, 256,
2045 0, 256,
2046 0, 256,
2047 0, 256,
2048 0, 256,
2049 0, 256,
2050 0, 256,
2051 0, 256,
2052 0, 256,
2053 0, 256,
2054 0, 256,
2055 0, 256,
2056 0, 256,
2057 0, 256,
2058 0, 256,
2059 0, 256,
2060 0, 256,
2061 0, 256,
2062 0, 256,
2063 0, 256,
2064 0, 256,
2065 0, 256,
2066 0, 256,
2067 0, 256,
2068 128, 128,
2069 0, 0,
2070 0, 0,
2071 0, 0,
2072 0, 0,
2073 0, 0,
2074 0, 0,
2075 0, 0,
2076 0, 0,
2077 0, 0,
2078 0, 0,
2079 0, 0,
2080 0, 0,
2081 0, 0,
2082 0, 0,
2083 0, 0,
2084 };
2085
2086 return panfrost_upload_transient(batch, locations, 96 * sizeof(uint16_t));
2087 }