panfrost: Fix Bifrost blending with depth-only FBO
[mesa.git] / src / gallium / drivers / panfrost / pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_allocate.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
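/* Bifrost vertex/tiler jobs point at a shared memory descriptor rather than
 * a framebuffer: size the scratchpad (thread-local stack) from the batch's
 * stack requirements, and leave shared_workgroup_count as ~0, presumably
 * meaning no workgroups for graphics jobs. */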
61 unsigned shift = panfrost_get_stack_shift(batch->stack_size);
62 struct mali_shared_memory shared = {
63 .stack_shift = shift,
64 .scratchpad = panfrost_batch_get_scratchpad(batch, shift, dev->thread_tls_alloc, dev->core_count)->gpu,
65 .shared_workgroup_count = ~0,
66 };
67 postfix->shared_memory = panfrost_upload_transient(batch, &shared, sizeof(shared));
68 }
69
70 static void
71 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
72 struct mali_vertex_tiler_postfix *postfix)
73 {
74 struct panfrost_device *dev = pan_device(ctx->base.screen);
75 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
76
77 /* If we haven't already, reserve space for the framebuffer */
78
79 if (!batch->framebuffer.gpu) {
80 unsigned size = (dev->quirks & MIDGARD_SFBD) ?
81 sizeof(struct mali_single_framebuffer) :
82 sizeof(struct mali_framebuffer);
83
84 batch->framebuffer = panfrost_allocate_transient(batch, size);
85
86 /* Tag the pointer */
87 if (!(dev->quirks & MIDGARD_SFBD))
88 batch->framebuffer.gpu |= MALI_MFBD;
89 }
90
91 postfix->shared_memory = batch->framebuffer.gpu;
92 }
93
94 static void
95 panfrost_vt_update_rasterizer(struct panfrost_context *ctx,
96 struct mali_vertex_tiler_prefix *prefix,
97 struct mali_vertex_tiler_postfix *postfix)
98 {
99 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
100
101 postfix->gl_enables |= 0x7;
102 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
103 rasterizer && rasterizer->base.front_ccw);
104 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
105 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_FRONT));
106 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
107 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_BACK));
108 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
109 rasterizer && rasterizer->base.flatshade_first);
110 }
111
112 void
113 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
114 struct mali_vertex_tiler_prefix *prefix,
115 union midgard_primitive_size *primitive_size)
116 {
117 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
118
119 if (!panfrost_writes_point_size(ctx)) {
120 bool points = prefix->draw_mode == MALI_POINTS;
121 float val = 0.0f;
122
123 if (rasterizer)
124 val = points ?
125 rasterizer->base.point_size :
126 rasterizer->base.line_width;
127
128 primitive_size->constant = val;
129 }
130 }
131
132 static void
133 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
134 struct mali_vertex_tiler_postfix *postfix)
135 {
136 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
137 if (ctx->occlusion_query)
138 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
139 else
140 postfix->occlusion_counter = 0;
141 }
142
143 void
144 panfrost_vt_init(struct panfrost_context *ctx,
145 enum pipe_shader_type stage,
146 struct mali_vertex_tiler_prefix *prefix,
147 struct mali_vertex_tiler_postfix *postfix)
148 {
149 struct panfrost_device *device = pan_device(ctx->base.screen);
150
151 if (!ctx->shader[stage])
152 return;
153
154 memset(prefix, 0, sizeof(*prefix));
155 memset(postfix, 0, sizeof(*postfix));
156
157 if (device->quirks & IS_BIFROST) {
158 postfix->gl_enables = 0x2;
159 panfrost_vt_emit_shared_memory(ctx, postfix);
160 } else {
161 postfix->gl_enables = 0x6;
162 panfrost_vt_attach_framebuffer(ctx, postfix);
163 }
164
165 if (stage == PIPE_SHADER_FRAGMENT) {
166 panfrost_vt_update_occlusion_query(ctx, postfix);
167 panfrost_vt_update_rasterizer(ctx, prefix, postfix);
168 }
169 }
170
171 static unsigned
172 panfrost_translate_index_size(unsigned size)
173 {
174 switch (size) {
175 case 1:
176 return MALI_DRAW_INDEXED_UINT8;
177
178 case 2:
179 return MALI_DRAW_INDEXED_UINT16;
180
181 case 4:
182 return MALI_DRAW_INDEXED_UINT32;
183
184 default:
185 unreachable("Invalid index size");
186 }
187 }
188
189 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
190 * good for the duration of the draw (transient), though it could last longer. Also get
191 * the bounds on the index buffer for the range accessed by the draw. We do
192 * these operations together because there are natural optimizations which
193 * require them to be together. */
194
195 static mali_ptr
196 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
197 const struct pipe_draw_info *info,
198 unsigned *min_index, unsigned *max_index)
199 {
200 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
201 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
202 off_t offset = info->start * info->index_size;
203 bool needs_indices = true;
204 mali_ptr out = 0;
205
206 if (info->max_index != ~0u) {
207 *min_index = info->min_index;
208 *max_index = info->max_index;
209 needs_indices = false;
210 }
211
212 if (!info->has_user_indices) {
213 /* Only resources can be directly mapped */
214 panfrost_batch_add_bo(batch, rsrc->bo,
215 PAN_BO_ACCESS_SHARED |
216 PAN_BO_ACCESS_READ |
217 PAN_BO_ACCESS_VERTEX_TILER);
218 out = rsrc->bo->gpu + offset;
219
220 /* Check the cache */
221 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
222 info->start,
223 info->count,
224 min_index,
225 max_index);
226 } else {
227 /* Otherwise, we need to upload to transient memory */
228 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
229 out = panfrost_upload_transient(batch, ibuf8 + offset,
230 info->count *
231 info->index_size);
232 }
233
234 if (needs_indices) {
235 /* Fallback */
236 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
237
238 if (!info->has_user_indices)
239 panfrost_minmax_cache_add(rsrc->index_cache,
240 info->start, info->count,
241 *min_index, *max_index);
242 }
243
244 return out;
245 }
246
247 void
248 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
249 const struct pipe_draw_info *info,
250 enum mali_draw_mode draw_mode,
251 struct mali_vertex_tiler_postfix *vertex_postfix,
252 struct mali_vertex_tiler_prefix *tiler_prefix,
253 struct mali_vertex_tiler_postfix *tiler_postfix,
254 unsigned *vertex_count,
255 unsigned *padded_count)
256 {
257 tiler_prefix->draw_mode = draw_mode;
258
259 unsigned draw_flags = 0;
260
261 if (panfrost_writes_point_size(ctx))
262 draw_flags |= MALI_DRAW_VARYING_SIZE;
263
264 if (info->primitive_restart)
265 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
266
267 /* These don't make much sense */
268
269 draw_flags |= 0x3000;
270
271 if (info->index_size) {
272 unsigned min_index = 0, max_index = 0;
273
274 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
275 info,
276 &min_index,
277 &max_index);
278
279 /* Use the corresponding values */
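/* Rebase the draw on min_index so the attribute buffers only need to cover
 * the [min_index, max_index] window: offset_start bakes in min_index (plus
 * the index bias), while offset_bias_correction = -min_index compensates on
 * the index fetch side. */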
280 *vertex_count = max_index - min_index + 1;
281 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
282 tiler_prefix->offset_bias_correction = -min_index;
283 tiler_prefix->index_count = MALI_POSITIVE(info->count);
284 draw_flags |= panfrost_translate_index_size(info->index_size);
285 } else {
286 tiler_prefix->indices = 0;
287 *vertex_count = ctx->vertex_count;
288 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
289 tiler_prefix->offset_bias_correction = 0;
290 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
291 }
292
293 tiler_prefix->unknown_draw = draw_flags;
294
295 /* Encode the padded vertex count */
296
297 if (info->instance_count > 1) {
298 *padded_count = panfrost_padded_vertex_count(*vertex_count);
299
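/* The instancing fields want the padded count decomposed into an odd factor
 * and a power-of-two shift: ctz extracts the shift, and the remaining odd
 * factor n is encoded as instance_odd = (n - 1) / 2. */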
300 unsigned shift = __builtin_ctz(ctx->padded_count);
301 unsigned k = ctx->padded_count >> (shift + 1);
302
303 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
304 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
305 } else {
306 *padded_count = *vertex_count;
307
308 /* Reset instancing state */
309 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
310 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
311 }
312 }
313
314 static void
315 panfrost_shader_meta_init(struct panfrost_context *ctx,
316 enum pipe_shader_type st,
317 struct mali_shader_meta *meta)
318 {
319 const struct panfrost_device *dev = pan_device(ctx->base.screen);
320 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
321
322 memset(meta, 0, sizeof(*meta));
323 meta->shader = (ss->bo ? ss->bo->gpu : 0) | ss->first_tag;
324 meta->attribute_count = ss->attribute_count;
325 meta->varying_count = ss->varying_count;
326 meta->texture_count = ctx->sampler_view_count[st];
327 meta->sampler_count = ctx->sampler_count[st];
328
329 if (dev->quirks & IS_BIFROST) {
330 if (st == PIPE_SHADER_VERTEX)
331 meta->bifrost1.unk1 = 0x800000;
332 else {
333 /* First clause ATEST |= 0x4000000.
334 * Less than 32 regs |= 0x200 */
335 meta->bifrost1.unk1 = 0x958020;
336 }
337
338 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
339 if (st == PIPE_SHADER_VERTEX)
340 meta->bifrost2.preload_regs = 0xC0;
341 else
342 meta->bifrost2.preload_regs = 0x1;
343 meta->bifrost2.uniform_count = MIN2(ss->uniform_count,
344 ss->uniform_cutoff);
345 } else {
346 meta->midgard1.uniform_count = MIN2(ss->uniform_count,
347 ss->uniform_cutoff);
348 meta->midgard1.work_count = ss->work_reg_count;
349 meta->midgard1.flags_hi = 0x8; /* XXX */
350 meta->midgard1.flags_lo = 0x220;
351 meta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
352 }
353 }
354
355 static unsigned
356 panfrost_translate_compare_func(enum pipe_compare_func in)
357 {
358 switch (in) {
359 case PIPE_FUNC_NEVER:
360 return MALI_FUNC_NEVER;
361
362 case PIPE_FUNC_LESS:
363 return MALI_FUNC_LESS;
364
365 case PIPE_FUNC_EQUAL:
366 return MALI_FUNC_EQUAL;
367
368 case PIPE_FUNC_LEQUAL:
369 return MALI_FUNC_LEQUAL;
370
371 case PIPE_FUNC_GREATER:
372 return MALI_FUNC_GREATER;
373
374 case PIPE_FUNC_NOTEQUAL:
375 return MALI_FUNC_NOTEQUAL;
376
377 case PIPE_FUNC_GEQUAL:
378 return MALI_FUNC_GEQUAL;
379
380 case PIPE_FUNC_ALWAYS:
381 return MALI_FUNC_ALWAYS;
382
383 default:
384 unreachable("Invalid func");
385 }
386 }
387
388 static unsigned
389 panfrost_translate_stencil_op(enum pipe_stencil_op in)
390 {
391 switch (in) {
392 case PIPE_STENCIL_OP_KEEP:
393 return MALI_STENCIL_KEEP;
394
395 case PIPE_STENCIL_OP_ZERO:
396 return MALI_STENCIL_ZERO;
397
398 case PIPE_STENCIL_OP_REPLACE:
399 return MALI_STENCIL_REPLACE;
400
401 case PIPE_STENCIL_OP_INCR:
402 return MALI_STENCIL_INCR;
403
404 case PIPE_STENCIL_OP_DECR:
405 return MALI_STENCIL_DECR;
406
407 case PIPE_STENCIL_OP_INCR_WRAP:
408 return MALI_STENCIL_INCR_WRAP;
409
410 case PIPE_STENCIL_OP_DECR_WRAP:
411 return MALI_STENCIL_DECR_WRAP;
412
413 case PIPE_STENCIL_OP_INVERT:
414 return MALI_STENCIL_INVERT;
415
416 default:
417 unreachable("Invalid stencil op");
418 }
419 }
420
421 static unsigned
422 translate_tex_wrap(enum pipe_tex_wrap w)
423 {
424 switch (w) {
425 case PIPE_TEX_WRAP_REPEAT:
426 return MALI_WRAP_REPEAT;
427
428 case PIPE_TEX_WRAP_CLAMP:
429 return MALI_WRAP_CLAMP;
430
431 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
432 return MALI_WRAP_CLAMP_TO_EDGE;
433
434 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
435 return MALI_WRAP_CLAMP_TO_BORDER;
436
437 case PIPE_TEX_WRAP_MIRROR_REPEAT:
438 return MALI_WRAP_MIRRORED_REPEAT;
439
440 case PIPE_TEX_WRAP_MIRROR_CLAMP:
441 return MALI_WRAP_MIRRORED_CLAMP;
442
443 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
444 return MALI_WRAP_MIRRORED_CLAMP_TO_EDGE;
445
446 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
447 return MALI_WRAP_MIRRORED_CLAMP_TO_BORDER;
448
449 default:
450 unreachable("Invalid wrap");
451 }
452 }
453
454 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
455 struct mali_sampler_descriptor *hw)
456 {
457 unsigned func = panfrost_translate_compare_func(cso->compare_func);
458 bool min_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
459 bool mag_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
460 bool mip_linear = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR;
461 unsigned min_filter = min_nearest ? MALI_SAMP_MIN_NEAREST : 0;
462 unsigned mag_filter = mag_nearest ? MALI_SAMP_MAG_NEAREST : 0;
463 unsigned mip_filter = mip_linear ?
464 (MALI_SAMP_MIP_LINEAR_1 | MALI_SAMP_MIP_LINEAR_2) : 0;
465 unsigned normalized = cso->normalized_coords ? MALI_SAMP_NORM_COORDS : 0;
466
467 *hw = (struct mali_sampler_descriptor) {
468 .filter_mode = min_filter | mag_filter | mip_filter |
469 normalized,
470 .wrap_s = translate_tex_wrap(cso->wrap_s),
471 .wrap_t = translate_tex_wrap(cso->wrap_t),
472 .wrap_r = translate_tex_wrap(cso->wrap_r),
473 .compare_func = panfrost_flip_compare_func(func),
474 .border_color = {
475 cso->border_color.f[0],
476 cso->border_color.f[1],
477 cso->border_color.f[2],
478 cso->border_color.f[3]
479 },
480 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
481 .max_lod = FIXED_16(cso->max_lod, false),
482 .lod_bias = FIXED_16(cso->lod_bias, true), /* can be negative */
483 .seamless_cube_map = cso->seamless_cube_map,
484 };
485
486 /* If necessary, we disable mipmapping in the sampler descriptor by
487 * clamping the LOD as tight as possible (from 0 to epsilon,
488 * essentially -- remember these are fixed point numbers, so
489 * epsilon=1/256) */
490
491 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
492 hw->max_lod = hw->min_lod + 1;
493 }
494
495 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
496 struct bifrost_sampler_descriptor *hw)
497 {
498 *hw = (struct bifrost_sampler_descriptor) {
499 .unk1 = 0x1,
500 .wrap_s = translate_tex_wrap(cso->wrap_s),
501 .wrap_t = translate_tex_wrap(cso->wrap_t),
502 .wrap_r = translate_tex_wrap(cso->wrap_r),
503 .unk8 = 0x8,
504 .min_filter = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST,
505 .norm_coords = cso->normalized_coords,
506 .mip_filter = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR,
507 .mag_filter = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR,
508 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
509 .max_lod = FIXED_16(cso->max_lod, false),
510 };
511
512 /* If necessary, we disable mipmapping in the sampler descriptor by
513 * clamping the LOD as tight as possible (from 0 to epsilon,
514 * essentially -- remember these are fixed point numbers, so
515 * epsilon=1/256) */
516
517 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
518 hw->max_lod = hw->min_lod + 1;
519 }
520
521 static void
522 panfrost_make_stencil_state(const struct pipe_stencil_state *in,
523 struct mali_stencil_test *out)
524 {
525 out->ref = 0; /* Gallium gets it from elsewhere */
526
527 out->mask = in->valuemask;
528 out->func = panfrost_translate_compare_func(in->func);
529 out->sfail = panfrost_translate_stencil_op(in->fail_op);
530 out->dpfail = panfrost_translate_stencil_op(in->zfail_op);
531 out->dppass = panfrost_translate_stencil_op(in->zpass_op);
532 }
533
534 static void
535 panfrost_frag_meta_rasterizer_update(struct panfrost_context *ctx,
536 struct mali_shader_meta *fragmeta)
537 {
538 if (!ctx->rasterizer) {
539 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, true);
540 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, false);
541 fragmeta->depth_units = 0.0f;
542 fragmeta->depth_factor = 0.0f;
543 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, false);
544 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, false);
545 return;
546 }
547
548 bool msaa = ctx->rasterizer->base.multisample;
549
550 /* TODO: Sample size */
551 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
552 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
553 fragmeta->depth_units = ctx->rasterizer->base.offset_units * 2.0f;
554 fragmeta->depth_factor = ctx->rasterizer->base.offset_scale;
555
556 /* XXX: Which bit is which? Does this maybe allow offsetting not-tri? */
557
558 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A,
559 ctx->rasterizer->base.offset_tri);
560 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B,
561 ctx->rasterizer->base.offset_tri);
562 }
563
564 static void
565 panfrost_frag_meta_zsa_update(struct panfrost_context *ctx,
566 struct mali_shader_meta *fragmeta)
567 {
568 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
569 int zfunc = PIPE_FUNC_ALWAYS;
570
571 if (!zsa) {
572 struct pipe_stencil_state default_stencil = {
573 .enabled = 0,
574 .func = PIPE_FUNC_ALWAYS,
575 .fail_op = MALI_STENCIL_KEEP,
576 .zfail_op = MALI_STENCIL_KEEP,
577 .zpass_op = MALI_STENCIL_KEEP,
578 .writemask = 0xFF,
579 .valuemask = 0xFF
580 };
581
582 panfrost_make_stencil_state(&default_stencil,
583 &fragmeta->stencil_front);
584 fragmeta->stencil_mask_front = default_stencil.writemask;
585 fragmeta->stencil_back = fragmeta->stencil_front;
586 fragmeta->stencil_mask_back = default_stencil.writemask;
587 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST, false);
588 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK, false);
589 } else {
590 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
591 zsa->stencil[0].enabled);
592 panfrost_make_stencil_state(&zsa->stencil[0],
593 &fragmeta->stencil_front);
594 fragmeta->stencil_mask_front = zsa->stencil[0].writemask;
595 fragmeta->stencil_front.ref = ctx->stencil_ref.ref_value[0];
596
597 /* If back-stencil is not enabled, use the front values */
598
599 if (zsa->stencil[1].enabled) {
600 panfrost_make_stencil_state(&zsa->stencil[1],
601 &fragmeta->stencil_back);
602 fragmeta->stencil_mask_back = zsa->stencil[1].writemask;
603 fragmeta->stencil_back.ref = ctx->stencil_ref.ref_value[1];
604 } else {
605 fragmeta->stencil_back = fragmeta->stencil_front;
606 fragmeta->stencil_mask_back = fragmeta->stencil_mask_front;
607 fragmeta->stencil_back.ref = fragmeta->stencil_front.ref;
608 }
609
610 if (zsa->depth.enabled)
611 zfunc = zsa->depth.func;
612
613 /* Depth state (TODO: Refactor) */
614
615 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
616 zsa->depth.writemask);
617 }
618
619 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
620 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(zfunc));
621 }
622
623 static bool
624 panfrost_fs_required(
625 struct panfrost_shader_state *fs,
626 struct panfrost_blend_final *blend,
627 unsigned rt_count)
628 {
629 /* If we generally have side effects */
630 if (fs->fs_sidefx)
631 return true;
632
633 /* If colour is written we need to execute */
634 for (unsigned i = 0; i < rt_count; ++i) {
635 if (!blend[i].no_colour)
636 return true;
637 }
638
639 /* If depth is written and not implied we need to execute.
640 * TODO: Predicate on Z/S writes being enabled */
641 return (fs->writes_depth || fs->writes_stencil);
642 }
643
644 static void
645 panfrost_frag_meta_blend_update(struct panfrost_context *ctx,
646 struct mali_shader_meta *fragmeta,
647 void *rts)
648 {
649 const struct panfrost_device *dev = pan_device(ctx->base.screen);
650 struct panfrost_shader_state *fs;
651 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
652
653 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
654 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
655 !ctx->blend->base.dither);
656
657 /* Get blending setup */
658 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
659
660 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
661 unsigned shader_offset = 0;
662 struct panfrost_bo *shader_bo = NULL;
663
664 for (unsigned c = 0; c < rt_count; ++c)
665 blend[c] = panfrost_get_blend_for_context(ctx, c, &shader_bo,
666 &shader_offset);
667
668 /* Disable shader execution if we can */
669 if (dev->quirks & MIDGARD_SHADERLESS
670 && !panfrost_fs_required(fs, blend, rt_count)) {
671 fragmeta->shader = 0;
672 fragmeta->attribute_count = 0;
673 fragmeta->varying_count = 0;
674 fragmeta->texture_count = 0;
675 fragmeta->sampler_count = 0;
676
677 /* This feature is not known to work on Bifrost */
678 fragmeta->midgard1.work_count = 1;
679 fragmeta->midgard1.uniform_count = 0;
680 fragmeta->midgard1.uniform_buffer_count = 0;
681 }
682
683 /* If there is a blend shader, work registers are shared. Blend shaders may
684 * use up to 8 work registers, so reserve at least that many. Should be lower XXX */
685
686 if (!(dev->quirks & IS_BIFROST)) {
687 for (unsigned c = 0; c < rt_count; ++c) {
688 if (blend[c].is_shader) {
689 fragmeta->midgard1.work_count =
690 MAX2(fragmeta->midgard1.work_count, 8);
691 }
692 }
693 }
694
695 /* Even on MFBD, the shader descriptor gets blend shaders. It's *also*
696 * copied to the blend_meta appended (by convention), but this is the
697 * field actually read by the hardware. (Or maybe both are read...?).
698 * Specify the last RTi with a blend shader. */
699
700 fragmeta->blend.shader = 0;
701
702 for (signed rt = (rt_count - 1); rt >= 0; --rt) {
703 if (!blend[rt].is_shader)
704 continue;
705
706 fragmeta->blend.shader = blend[rt].shader.gpu |
707 blend[rt].shader.first_tag;
708 break;
709 }
710
711 if (dev->quirks & MIDGARD_SFBD) {
712 /* On single render target (SFBD) hardware, the blend
713 * information is inside the shader meta itself. We additionally
714 * need to signal CAN_DISCARD for nontrivial blend modes (so
715 * we're able to read back the destination buffer) */
716
717 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
718 blend[0].is_shader);
719
720 if (!blend[0].is_shader) {
721 fragmeta->blend.equation = *blend[0].equation.equation;
722 fragmeta->blend.constant = blend[0].equation.constant;
723 }
724
725 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
726 !blend[0].no_blending || fs->can_discard);
727 return;
728 }
729
730 /* Additional blend descriptor tacked on for jobs using MFBD */
731
732 for (unsigned i = 0; i < rt_count; ++i) {
733 if (dev->quirks & IS_BIFROST) {
734 struct bifrost_blend_rt *brts = rts;
735
736 if (blend[i].is_shader) {
737 /* The blend shader's address needs to be at
738 * the same top 32 bit as the fragment shader.
739 * TODO: Ensure that's always the case.
740 */
741 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
742 (fs->bo->gpu & (0xffffffffull << 32)));
743 brts[i].shader = blend[i].shader.gpu;
744 brts[i].unk2 = 0x0;
745 brts[i].flags = 0x200;
746 } else if (ctx->pipe_framebuffer.nr_cbufs > i) {
747 enum pipe_format format = ctx->pipe_framebuffer.cbufs[i]->format;
748 const struct util_format_description *format_desc;
749 format_desc = util_format_description(format);
750
751 brts[i].equation = *blend[i].equation.equation;
752
753 /* TODO: this is a bit more complicated */
754 brts[i].constant = blend[i].equation.constant;
755
756 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
757 brts[i].unk2 = 0x19;
758
759 brts[i].shader_type = fs->blend_types[i];
760 brts[i].flags = 0x200;
761 } else {
762 /* Dummy attachment for depth-only */
763 brts[i].unk2 = 0x3;
764 brts[i].shader_type = fs->blend_types[i];
765 }
766 } else {
767 struct midgard_blend_rt *mrts = rts;
768
769 if (!blend[i].no_colour) {
770 mrts[i].flags = 0x200;
771
772 bool is_srgb = (ctx->pipe_framebuffer.nr_cbufs > i) &&
773 (ctx->pipe_framebuffer.cbufs[i]) &&
774 util_format_is_srgb(ctx->pipe_framebuffer.cbufs[i]->format);
775
776 SET_BIT(mrts[i].flags, MALI_BLEND_MRT_SHADER, blend[i].is_shader);
777 SET_BIT(mrts[i].flags, MALI_BLEND_LOAD_TIB, !blend[i].no_blending);
778 SET_BIT(mrts[i].flags, MALI_BLEND_SRGB, is_srgb);
779 SET_BIT(mrts[i].flags, MALI_BLEND_NO_DITHER, !ctx->blend->base.dither);
780 }
781
782 if (blend[i].is_shader) {
783 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
784 } else {
785 mrts[i].blend.equation = *blend[i].equation.equation;
786 mrts[i].blend.constant = blend[i].equation.constant;
787 }
788 }
789 }
790 }
791
792 static void
793 panfrost_frag_shader_meta_init(struct panfrost_context *ctx,
794 struct mali_shader_meta *fragmeta,
795 void *rts)
796 {
797 const struct panfrost_device *dev = pan_device(ctx->base.screen);
798 struct panfrost_shader_state *fs;
799
800 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
801
802 fragmeta->alpha_coverage = ~MALI_ALPHA_COVERAGE(0.000000);
803 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x3010;
804 fragmeta->unknown2_4 = 0x4e0;
805
806 /* unknown2_4 has 0x10 bit set on T6XX and T720. We don't know why this
807 * is required (independent of 32-bit/64-bit descriptors), or why it's
808 * not used on later GPU revisions. Otherwise, all shader jobs fault on
809 * these earlier chips (perhaps this is a chicken bit of some kind).
810 * More investigation is needed. */
811
812 SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
813
814 if (dev->quirks & IS_BIFROST) {
815 /* TODO */
816 } else {
817 /* Depending on whether it's legal in the given shader, we try to
818 * enable early-z testing (or forward-pixel kill?) */
819
820 SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
821 !fs->can_discard && !fs->writes_depth);
822
823 /* Add the writes Z/S flags if needed. */
824 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
825 SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
826
827 /* Any time texturing is used, derivatives are implicitly calculated,
828 * so we need to enable helper invocations */
829
830 SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
831 fs->helper_invocations);
832
833 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
834
835 bool depth_enabled = fs->writes_depth ||
836 (zsa && zsa->depth.enabled && zsa->depth.func != PIPE_FUNC_ALWAYS);
837
838 SET_BIT(fragmeta->midgard1.flags_lo, 0x400, !depth_enabled && fs->can_discard);
839 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_ZS, depth_enabled && fs->can_discard);
840 }
841
842 panfrost_frag_meta_rasterizer_update(ctx, fragmeta);
843 panfrost_frag_meta_zsa_update(ctx, fragmeta);
844 panfrost_frag_meta_blend_update(ctx, fragmeta, rts);
845 }
846
847 void
848 panfrost_emit_shader_meta(struct panfrost_batch *batch,
849 enum pipe_shader_type st,
850 struct mali_vertex_tiler_postfix *postfix)
851 {
852 struct panfrost_context *ctx = batch->ctx;
853 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
854
855 if (!ss) {
856 postfix->shader = 0;
857 return;
858 }
859
860 struct mali_shader_meta meta;
861
862 panfrost_shader_meta_init(ctx, st, &meta);
863
864 /* Add the shader BO to the batch. */
865 panfrost_batch_add_bo(batch, ss->bo,
866 PAN_BO_ACCESS_PRIVATE |
867 PAN_BO_ACCESS_READ |
868 panfrost_bo_access_for_stage(st));
869
870 mali_ptr shader_ptr;
871
872 if (st == PIPE_SHADER_FRAGMENT) {
873 struct panfrost_device *dev = pan_device(ctx->base.screen);
874 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
875 size_t desc_size = sizeof(meta);
876 void *rts = NULL;
877 struct panfrost_transfer xfer;
878 unsigned rt_size;
879
880 if (dev->quirks & MIDGARD_SFBD)
881 rt_size = 0;
882 else if (dev->quirks & IS_BIFROST)
883 rt_size = sizeof(struct bifrost_blend_rt);
884 else
885 rt_size = sizeof(struct midgard_blend_rt);
886
887 desc_size += rt_size * rt_count;
888
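/* The per-RT blend descriptors are built CPU-side first and then copied
 * into the same transient allocation immediately after the shader
 * descriptor. */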
889 if (rt_size)
890 rts = rzalloc_size(ctx, rt_size * rt_count);
891
892 panfrost_frag_shader_meta_init(ctx, &meta, rts);
893
894 xfer = panfrost_allocate_transient(batch, desc_size);
895
896 memcpy(xfer.cpu, &meta, sizeof(meta));
897 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
898
899 if (rt_size)
900 ralloc_free(rts);
901
902 shader_ptr = xfer.gpu;
903 } else {
904 shader_ptr = panfrost_upload_transient(batch, &meta,
905 sizeof(meta));
906 }
907
908 postfix->shader = shader_ptr;
909 }
910
911 static void
912 panfrost_mali_viewport_init(struct panfrost_context *ctx,
913 struct mali_viewport *mvp)
914 {
915 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
916
917 /* Clip bounds are encoded as floats. The viewport itself is encoded as
918 * (somewhat) asymmetric ints. */
919
920 const struct pipe_scissor_state *ss = &ctx->scissor;
921
922 memset(mvp, 0, sizeof(*mvp));
923
924 /* By default, do no viewport clipping, i.e. clip to (-inf, inf) in
925 * each direction. Clipping to the viewport in theory should work, but
926 * in practice causes issues when we're not explicitly trying to
927 * scissor */
928
929 *mvp = (struct mali_viewport) {
930 .clip_minx = -INFINITY,
931 .clip_miny = -INFINITY,
932 .clip_maxx = INFINITY,
933 .clip_maxy = INFINITY,
934 };
935
936 /* Always scissor to the viewport by default. */
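/* Gallium viewport state maps NDC [-1, 1] to
 * [translate - |scale|, translate + |scale|] on each axis, which yields the
 * screen-space bounds computed below. */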
937 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
938 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
939
940 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
941 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
942
943 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
944 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
945
946 /* Apply the scissor test */
947
948 unsigned minx, miny, maxx, maxy;
949
950 if (ss && ctx->rasterizer && ctx->rasterizer->base.scissor) {
951 minx = MAX2(ss->minx, vp_minx);
952 miny = MAX2(ss->miny, vp_miny);
953 maxx = MIN2(ss->maxx, vp_maxx);
954 maxy = MIN2(ss->maxy, vp_maxy);
955 } else {
956 minx = vp_minx;
957 miny = vp_miny;
958 maxx = vp_maxx;
959 maxy = vp_maxy;
960 }
961
962 /* Hardware needs the min/max to be strictly ordered, so flip if we
963 * need to. The viewport transformation in the vertex shader will
964 * handle the negatives if we don't */
965
966 if (miny > maxy) {
967 unsigned temp = miny;
968 miny = maxy;
969 maxy = temp;
970 }
971
972 if (minx > maxx) {
973 unsigned temp = minx;
974 minx = maxx;
975 maxx = temp;
976 }
977
978 if (minz > maxz) {
979 float temp = minz;
980 minz = maxz;
981 maxz = temp;
982 }
983
984 /* Clamp to the framebuffer size as a last check */
985
986 minx = MIN2(ctx->pipe_framebuffer.width, minx);
987 maxx = MIN2(ctx->pipe_framebuffer.width, maxx);
988
989 miny = MIN2(ctx->pipe_framebuffer.height, miny);
990 maxy = MIN2(ctx->pipe_framebuffer.height, maxy);
991
992 /* Upload */
993
994 mvp->viewport0[0] = minx;
995 mvp->viewport1[0] = MALI_POSITIVE(maxx);
996
997 mvp->viewport0[1] = miny;
998 mvp->viewport1[1] = MALI_POSITIVE(maxy);
999
1000 mvp->clip_minz = minz;
1001 mvp->clip_maxz = maxz;
1002 }
1003
1004 void
1005 panfrost_emit_viewport(struct panfrost_batch *batch,
1006 struct mali_vertex_tiler_postfix *tiler_postfix)
1007 {
1008 struct panfrost_context *ctx = batch->ctx;
1009 struct mali_viewport mvp;
1010
1011 panfrost_mali_viewport_init(batch->ctx, &mvp);
1012
1013 /* Update the job, unless we're doing wallpapering (whose lack of
1014 * scissor we can ignore, since if we "miss" a tile of wallpaper, it'll
1015 * just... be faster :) */
1016
1017 if (!ctx->wallpaper_batch)
1018 panfrost_batch_union_scissor(batch, mvp.viewport0[0],
1019 mvp.viewport0[1],
1020 mvp.viewport1[0] + 1,
1021 mvp.viewport1[1] + 1);
1022
1023 tiler_postfix->viewport = panfrost_upload_transient(batch, &mvp,
1024 sizeof(mvp));
1025 }
1026
1027 static mali_ptr
1028 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
1029 enum pipe_shader_type st,
1030 struct panfrost_constant_buffer *buf,
1031 unsigned index)
1032 {
1033 struct pipe_constant_buffer *cb = &buf->cb[index];
1034 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1035
1036 if (rsrc) {
1037 panfrost_batch_add_bo(batch, rsrc->bo,
1038 PAN_BO_ACCESS_SHARED |
1039 PAN_BO_ACCESS_READ |
1040 panfrost_bo_access_for_stage(st));
1041
1042 /* Alignment guaranteed by
1043 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
1044 return rsrc->bo->gpu + cb->buffer_offset;
1045 } else if (cb->user_buffer) {
1046 return panfrost_upload_transient(batch,
1047 cb->user_buffer +
1048 cb->buffer_offset,
1049 cb->buffer_size);
1050 } else {
1051 unreachable("No constant buffer");
1052 }
1053 }
1054
1055 struct sysval_uniform {
1056 union {
1057 float f[4];
1058 int32_t i[4];
1059 uint32_t u[4];
1060 uint64_t du[2];
1061 };
1062 };
1063
1064 static void
1065 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
1066 struct sysval_uniform *uniform)
1067 {
1068 struct panfrost_context *ctx = batch->ctx;
1069 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1070
1071 uniform->f[0] = vp->scale[0];
1072 uniform->f[1] = vp->scale[1];
1073 uniform->f[2] = vp->scale[2];
1074 }
1075
1076 static void
1077 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
1078 struct sysval_uniform *uniform)
1079 {
1080 struct panfrost_context *ctx = batch->ctx;
1081 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1082
1083 uniform->f[0] = vp->translate[0];
1084 uniform->f[1] = vp->translate[1];
1085 uniform->f[2] = vp->translate[2];
1086 }
1087
1088 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
1089 enum pipe_shader_type st,
1090 unsigned int sysvalid,
1091 struct sysval_uniform *uniform)
1092 {
1093 struct panfrost_context *ctx = batch->ctx;
1094 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
1095 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
1096 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
1097 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
1098
1099 assert(dim);
1100 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
1101
1102 if (dim > 1)
1103 uniform->i[1] = u_minify(tex->texture->height0,
1104 tex->u.tex.first_level);
1105
1106 if (dim > 2)
1107 uniform->i[2] = u_minify(tex->texture->depth0,
1108 tex->u.tex.first_level);
1109
1110 if (is_array)
1111 uniform->i[dim] = tex->texture->array_size;
1112 }
1113
1114 static void
1115 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
1116 enum pipe_shader_type st,
1117 unsigned ssbo_id,
1118 struct sysval_uniform *uniform)
1119 {
1120 struct panfrost_context *ctx = batch->ctx;
1121
1122 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
1123 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
1124
1125 /* Compute address */
1126 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
1127
1128 panfrost_batch_add_bo(batch, bo,
1129 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
1130 panfrost_bo_access_for_stage(st));
1131
1132 /* Upload address and size as sysval */
1133 uniform->du[0] = bo->gpu + sb.buffer_offset;
1134 uniform->u[2] = sb.buffer_size;
1135 }
1136
1137 static void
1138 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1139 enum pipe_shader_type st,
1140 unsigned samp_idx,
1141 struct sysval_uniform *uniform)
1142 {
1143 struct panfrost_context *ctx = batch->ctx;
1144 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1145
1146 uniform->f[0] = sampl->min_lod;
1147 uniform->f[1] = sampl->max_lod;
1148 uniform->f[2] = sampl->lod_bias;
1149
1150 /* Even without any errata, Midgard represents "no mipmapping" as
1151 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1152 * panfrost_create_sampler_state which also explains our choice of
1153 * epsilon value (again to keep behaviour consistent) */
1154
1155 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1156 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1157 }
1158
1159 static void
1160 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1161 struct sysval_uniform *uniform)
1162 {
1163 struct panfrost_context *ctx = batch->ctx;
1164
1165 uniform->u[0] = ctx->compute_grid->grid[0];
1166 uniform->u[1] = ctx->compute_grid->grid[1];
1167 uniform->u[2] = ctx->compute_grid->grid[2];
1168 }
1169
1170 static void
1171 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1172 struct panfrost_shader_state *ss,
1173 enum pipe_shader_type st)
1174 {
1175 struct sysval_uniform *uniforms = (void *)buf;
1176
1177 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1178 int sysval = ss->sysval[i];
1179
1180 switch (PAN_SYSVAL_TYPE(sysval)) {
1181 case PAN_SYSVAL_VIEWPORT_SCALE:
1182 panfrost_upload_viewport_scale_sysval(batch,
1183 &uniforms[i]);
1184 break;
1185 case PAN_SYSVAL_VIEWPORT_OFFSET:
1186 panfrost_upload_viewport_offset_sysval(batch,
1187 &uniforms[i]);
1188 break;
1189 case PAN_SYSVAL_TEXTURE_SIZE:
1190 panfrost_upload_txs_sysval(batch, st,
1191 PAN_SYSVAL_ID(sysval),
1192 &uniforms[i]);
1193 break;
1194 case PAN_SYSVAL_SSBO:
1195 panfrost_upload_ssbo_sysval(batch, st,
1196 PAN_SYSVAL_ID(sysval),
1197 &uniforms[i]);
1198 break;
1199 case PAN_SYSVAL_NUM_WORK_GROUPS:
1200 panfrost_upload_num_work_groups_sysval(batch,
1201 &uniforms[i]);
1202 break;
1203 case PAN_SYSVAL_SAMPLER:
1204 panfrost_upload_sampler_sysval(batch, st,
1205 PAN_SYSVAL_ID(sysval),
1206 &uniforms[i]);
1207 break;
1208 default:
1209 assert(0);
1210 }
1211 }
1212 }
1213
1214 static const void *
1215 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1216 unsigned index)
1217 {
1218 struct pipe_constant_buffer *cb = &buf->cb[index];
1219 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1220
1221 if (rsrc)
1222 return rsrc->bo->cpu;
1223 else if (cb->user_buffer)
1224 return cb->user_buffer;
1225 else
1226 unreachable("No constant buffer");
1227 }
1228
1229 void
1230 panfrost_emit_const_buf(struct panfrost_batch *batch,
1231 enum pipe_shader_type stage,
1232 struct mali_vertex_tiler_postfix *postfix)
1233 {
1234 struct panfrost_context *ctx = batch->ctx;
1235 struct panfrost_shader_variants *all = ctx->shader[stage];
1236
1237 if (!all)
1238 return;
1239
1240 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1241
1242 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1243
1244 /* Uniforms are implicitly UBO #0 */
1245 bool has_uniforms = buf->enabled_mask & (1 << 0);
1246
1247 /* Allocate room for the sysvals and the uniforms */
1248 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1249 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1250 size_t size = sys_size + uniform_size;
1251 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1252 size);
1253
1254 /* Upload sysvals requested by the shader */
1255 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1256
1257 /* Upload uniforms */
1258 if (has_uniforms && uniform_size) {
1259 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1260 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1261 }
1262
1263 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1264 * uploaded */
1265
1266 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1267 assert(ubo_count >= 1);
1268
1269 size_t sz = sizeof(uint64_t) * ubo_count;
1270 uint64_t ubos[PAN_MAX_CONST_BUFFERS];
1271 int uniform_count = ss->uniform_count;
1272
1273 /* Upload uniforms as a UBO */
1274 ubos[0] = MALI_MAKE_UBO(2 + uniform_count, transfer.gpu);
1275
1276 /* The rest are honest-to-goodness UBOs */
1277
1278 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1279 size_t usz = buf->cb[ubo].buffer_size;
1280 bool enabled = buf->enabled_mask & (1 << ubo);
1281 bool empty = usz == 0;
1282
1283 if (!enabled || empty) {
1284 /* Stub out disabled UBOs to catch accesses */
1285 ubos[ubo] = MALI_MAKE_UBO(0, 0xDEAD0000);
1286 continue;
1287 }
1288
1289 mali_ptr gpu = panfrost_map_constant_buffer_gpu(batch, stage,
1290 buf, ubo);
1291
1292 unsigned bytes_per_field = 16;
1293 unsigned aligned = ALIGN_POT(usz, bytes_per_field);
1294 ubos[ubo] = MALI_MAKE_UBO(aligned / bytes_per_field, gpu);
1295 }
1296
1297 mali_ptr ubufs = panfrost_upload_transient(batch, ubos, sz);
1298 postfix->uniforms = transfer.gpu;
1299 postfix->uniform_buffers = ubufs;
1300
1301 buf->dirty_mask = 0;
1302 }
1303
1304 void
1305 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1306 const struct pipe_grid_info *info,
1307 struct midgard_payload_vertex_tiler *vtp)
1308 {
1309 struct panfrost_context *ctx = batch->ctx;
1310 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1311 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1312 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1313 128));
1314 unsigned shared_size = single_size * info->grid[0] * info->grid[1] *
1315 info->grid[2] * 4;
1316 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1317 shared_size,
1318 1);
1319
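/* shared_workgroup_count is a log2: summing the per-dimension ceil(log2)
 * values gives an upper bound on log2 of the total workgroup count, and the
 * per-workgroup allocation size is likewise encoded as a shift. */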
1320 struct mali_shared_memory shared = {
1321 .shared_memory = bo->gpu,
1322 .shared_workgroup_count =
1323 util_logbase2_ceil(info->grid[0]) +
1324 util_logbase2_ceil(info->grid[1]) +
1325 util_logbase2_ceil(info->grid[2]),
1326 .shared_unk1 = 0x2,
1327 .shared_shift = util_logbase2(single_size) - 1
1328 };
1329
1330 vtp->postfix.shared_memory = panfrost_upload_transient(batch, &shared,
1331 sizeof(shared));
1332 }
1333
1334 static mali_ptr
1335 panfrost_get_tex_desc(struct panfrost_batch *batch,
1336 enum pipe_shader_type st,
1337 struct panfrost_sampler_view *view)
1338 {
1339 if (!view)
1340 return (mali_ptr) 0;
1341
1342 struct pipe_sampler_view *pview = &view->base;
1343 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1344
1345 /* Add the BO to the job so it's retained until the job is done. */
1346
1347 panfrost_batch_add_bo(batch, rsrc->bo,
1348 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1349 panfrost_bo_access_for_stage(st));
1350
1351 panfrost_batch_add_bo(batch, view->midgard_bo,
1352 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1353 panfrost_bo_access_for_stage(st));
1354
1355 return view->midgard_bo->gpu;
1356 }
1357
1358 void
1359 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1360 enum pipe_shader_type stage,
1361 struct mali_vertex_tiler_postfix *postfix)
1362 {
1363 struct panfrost_context *ctx = batch->ctx;
1364 struct panfrost_device *device = pan_device(ctx->base.screen);
1365
1366 if (!ctx->sampler_view_count[stage])
1367 return;
1368
1369 if (device->quirks & IS_BIFROST) {
1370 struct bifrost_texture_descriptor *descriptors;
1371
1372 descriptors = malloc(sizeof(struct bifrost_texture_descriptor) *
1373 ctx->sampler_view_count[stage]);
1374
1375 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1376 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1377 struct pipe_sampler_view *pview = &view->base;
1378 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1379
1380 /* Add the BOs to the job so they are retained until the job is done. */
1381
1382 panfrost_batch_add_bo(batch, rsrc->bo,
1383 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1384 panfrost_bo_access_for_stage(stage));
1385
1386 panfrost_batch_add_bo(batch, view->bifrost_bo,
1387 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1388 panfrost_bo_access_for_stage(stage));
1389
1390 memcpy(&descriptors[i], view->bifrost_descriptor, sizeof(*view->bifrost_descriptor));
1391 }
1392
1393 postfix->textures = panfrost_upload_transient(batch,
1394 descriptors,
1395 sizeof(struct bifrost_texture_descriptor) *
1396 ctx->sampler_view_count[stage]);
1397
1398 free(descriptors);
1399 } else {
1400 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1401
1402 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i)
1403 trampolines[i] = panfrost_get_tex_desc(batch, stage,
1404 ctx->sampler_views[stage][i]);
1405
1406 postfix->textures = panfrost_upload_transient(batch,
1407 trampolines,
1408 sizeof(uint64_t) *
1409 ctx->sampler_view_count[stage]);
1410 }
1411 }
1412
1413 void
1414 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1415 enum pipe_shader_type stage,
1416 struct mali_vertex_tiler_postfix *postfix)
1417 {
1418 struct panfrost_context *ctx = batch->ctx;
1419 struct panfrost_device *device = pan_device(ctx->base.screen);
1420
1421 if (!ctx->sampler_count[stage])
1422 return;
1423
1424 if (device->quirks & IS_BIFROST) {
1425 size_t desc_size = sizeof(struct bifrost_sampler_descriptor);
1426 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1427 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1428 transfer_size);
1429 struct bifrost_sampler_descriptor *desc = (struct bifrost_sampler_descriptor *)transfer.cpu;
1430
1431 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1432 desc[i] = ctx->samplers[stage][i]->bifrost_hw;
1433
1434 postfix->sampler_descriptor = transfer.gpu;
1435 } else {
1436 size_t desc_size = sizeof(struct mali_sampler_descriptor);
1437 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1438 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1439 transfer_size);
1440 struct mali_sampler_descriptor *desc = (struct mali_sampler_descriptor *)transfer.cpu;
1441
1442 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1443 desc[i] = ctx->samplers[stage][i]->midgard_hw;
1444
1445 postfix->sampler_descriptor = transfer.gpu;
1446 }
1447 }
1448
1449 void
1450 panfrost_emit_vertex_attr_meta(struct panfrost_batch *batch,
1451 struct mali_vertex_tiler_postfix *vertex_postfix)
1452 {
1453 struct panfrost_context *ctx = batch->ctx;
1454
1455 if (!ctx->vertex)
1456 return;
1457
1458 struct panfrost_vertex_state *so = ctx->vertex;
1459
1460 panfrost_vertex_state_upd_attr_offs(ctx, vertex_postfix);
1461 vertex_postfix->attribute_meta = panfrost_upload_transient(batch, so->hw,
1462 sizeof(*so->hw) *
1463 PAN_MAX_ATTRIBUTE);
1464 }
1465
1466 void
1467 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1468 struct mali_vertex_tiler_postfix *vertex_postfix)
1469 {
1470 struct panfrost_context *ctx = batch->ctx;
1471 struct panfrost_vertex_state *so = ctx->vertex;
1472
1473 /* Staged mali_attr, and index into them. i != k, depending on the
1474 * vertex buffer mask and instancing. Twice as much room is allocated,
1475 * for a worst case of NPOT_DIVIDEs, which take up an extra slot */
1476 union mali_attr attrs[PIPE_MAX_ATTRIBS * 2];
1477 unsigned k = 0;
1478
1479 for (unsigned i = 0; i < so->num_elements; ++i) {
1480 /* We map a mali_attr to be 1:1 with the mali_attr_meta, which
1481 * means duplicating some vertex buffers (who cares? aside from
1482 * maybe some caching implications but I somehow doubt that
1483 * matters) */
1484
1485 struct pipe_vertex_element *elem = &so->pipe[i];
1486 unsigned vbi = elem->vertex_buffer_index;
1487
1488 /* The exception to 1:1 mapping is that we can have multiple
1489 * entries (NPOT divisors), so we fixup anyways */
1490
1491 so->hw[i].index = k;
1492
1493 if (!(ctx->vb_mask & (1 << vbi)))
1494 continue;
1495
1496 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1497 struct panfrost_resource *rsrc;
1498
1499 rsrc = pan_resource(buf->buffer.resource);
1500 if (!rsrc)
1501 continue;
1502
1503 /* Align to 64 bytes by masking off the lower bits. This
1504 * will be adjusted back when we fixup the src_offset in
1505 * mali_attr_meta */
1506
1507 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1508 mali_ptr addr = raw_addr & ~63;
1509 unsigned chopped_addr = raw_addr - addr;
1510
1511 /* Add a dependency of the batch on the vertex buffer */
1512 panfrost_batch_add_bo(batch, rsrc->bo,
1513 PAN_BO_ACCESS_SHARED |
1514 PAN_BO_ACCESS_READ |
1515 PAN_BO_ACCESS_VERTEX_TILER);
1516
1517 /* Set common fields */
1518 attrs[k].elements = addr;
1519 attrs[k].stride = buf->stride;
1520
1521 /* Since we advanced the base pointer, we shrink the buffer
1522 * size */
1523 attrs[k].size = rsrc->base.width0 - buf->buffer_offset;
1524
1525 /* We need to add the extra size we masked off (for
1526 * correctness) so the data doesn't get clamped away */
1527 attrs[k].size += chopped_addr;
1528
1529 /* For non-instancing make sure we initialize */
1530 attrs[k].shift = attrs[k].extra_flags = 0;
1531
1532 /* Instancing uses a dramatically different code path than
1533 * linear, so dispatch for the actual emission now that the
1534 * common code is finished */
1535
1536 unsigned divisor = elem->instance_divisor;
1537
1538 if (divisor && ctx->instance_count == 1) {
1539 /* Silly corner case where there's a divisor(=1) but
1540 * there's no legitimate instancing. So we want *every*
1541 * attribute to be the same. So set stride to zero so
1542 * we don't go anywhere. */
1543
1544 attrs[k].size = attrs[k].stride + chopped_addr;
1545 attrs[k].stride = 0;
1546 attrs[k++].elements |= MALI_ATTR_LINEAR;
1547 } else if (ctx->instance_count <= 1) {
1548 /* Normal, non-instanced attributes */
1549 attrs[k++].elements |= MALI_ATTR_LINEAR;
1550 } else {
1551 unsigned instance_shift = vertex_postfix->instance_shift;
1552 unsigned instance_odd = vertex_postfix->instance_odd;
1553
1554 k += panfrost_vertex_instanced(ctx->padded_count,
1555 instance_shift,
1556 instance_odd,
1557 divisor, &attrs[k]);
1558 }
1559 }
1560
1561 /* Add special gl_VertexID/gl_InstanceID buffers */
1562
1563 panfrost_vertex_id(ctx->padded_count, &attrs[k]);
1564 so->hw[PAN_VERTEX_ID].index = k++;
1565 panfrost_instance_id(ctx->padded_count, &attrs[k]);
1566 so->hw[PAN_INSTANCE_ID].index = k++;
1567
1568 /* Upload whatever we emitted and go */
1569
1570 vertex_postfix->attributes = panfrost_upload_transient(batch, attrs,
1571 k * sizeof(*attrs));
1572 }
1573
1574 static mali_ptr
1575 panfrost_emit_varyings(struct panfrost_batch *batch, union mali_attr *slot,
1576 unsigned stride, unsigned count)
1577 {
1578 /* Fill out the descriptor */
1579 slot->stride = stride;
1580 slot->size = stride * count;
1581 slot->shift = slot->extra_flags = 0;
1582
1583 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1584 slot->size);
1585
1586 slot->elements = transfer.gpu | MALI_ATTR_LINEAR;
1587
1588 return transfer.gpu;
1589 }
1590
1591 static void
1592 panfrost_emit_streamout(struct panfrost_batch *batch, union mali_attr *slot,
1593 unsigned stride, unsigned offset, unsigned count,
1594 struct pipe_stream_output_target *target)
1595 {
1596 /* Fill out the descriptor */
1597 slot->stride = stride * 4;
1598 slot->shift = slot->extra_flags = 0;
1599
1600 unsigned max_size = target->buffer_size;
1601 unsigned expected_size = slot->stride * count;
1602
1603 slot->size = MIN2(max_size, expected_size);
1604
1605 /* Grab the BO and bind it to the batch */
1606 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1607
1608 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1609 * the perspective of the TILER and FRAGMENT.
1610 */
1611 panfrost_batch_add_bo(batch, bo,
1612 PAN_BO_ACCESS_SHARED |
1613 PAN_BO_ACCESS_RW |
1614 PAN_BO_ACCESS_VERTEX_TILER |
1615 PAN_BO_ACCESS_FRAGMENT);
1616
1617 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * slot->stride);
1618 slot->elements = addr;
1619 }
1620
1621 /* Given a shader and buffer indices, link varying metadata together */
1622
1623 static bool
1624 is_special_varying(gl_varying_slot loc)
1625 {
1626 switch (loc) {
1627 case VARYING_SLOT_POS:
1628 case VARYING_SLOT_PSIZ:
1629 case VARYING_SLOT_PNTC:
1630 case VARYING_SLOT_FACE:
1631 return true;
1632 default:
1633 return false;
1634 }
1635 }
1636
1637 static void
1638 panfrost_emit_varying_meta(void *outptr, struct panfrost_shader_state *ss,
1639 signed general, signed gl_Position,
1640 signed gl_PointSize, signed gl_PointCoord,
1641 signed gl_FrontFacing)
1642 {
1643 struct mali_attr_meta *out = (struct mali_attr_meta *) outptr;
1644
1645 for (unsigned i = 0; i < ss->varying_count; ++i) {
1646 gl_varying_slot location = ss->varyings_loc[i];
1647 int index = -1;
1648
1649 switch (location) {
1650 case VARYING_SLOT_POS:
1651 index = gl_Position;
1652 break;
1653 case VARYING_SLOT_PSIZ:
1654 index = gl_PointSize;
1655 break;
1656 case VARYING_SLOT_PNTC:
1657 index = gl_PointCoord;
1658 break;
1659 case VARYING_SLOT_FACE:
1660 index = gl_FrontFacing;
1661 break;
1662 default:
1663 index = general;
1664 break;
1665 }
1666
1667 assert(index >= 0);
1668 out[i].index = index;
1669 }
1670 }
1671
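/* The point sprite mask packs the per-texture-coordinate sprite enables
 * (TEX0..TEX7) in bits 0-7 and gl_PointCoord itself in bit 8. */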
1672 static bool
1673 has_point_coord(unsigned mask, gl_varying_slot loc)
1674 {
1675 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1676 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1677 else if (loc == VARYING_SLOT_PNTC)
1678 return (mask & (1 << 8));
1679 else
1680 return false;
1681 }
1682
1683 /* Helpers for manipulating stream out information so we can pack varyings
1684 * accordingly. Compute the src_offset for a given captured varying */
1685
1686 static struct pipe_stream_output *
1687 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1688 {
1689 for (unsigned i = 0; i < info->num_outputs; ++i) {
1690 if (info->output[i].register_index == loc)
1691 return &info->output[i];
1692 }
1693
1694 unreachable("Varying not captured");
1695 }
1696
1697 void
1698 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1699 unsigned vertex_count,
1700 struct mali_vertex_tiler_postfix *vertex_postfix,
1701 struct mali_vertex_tiler_postfix *tiler_postfix,
1702 union midgard_primitive_size *primitive_size)
1703 {
1704 /* Load the shaders */
1705 struct panfrost_context *ctx = batch->ctx;
1706 struct panfrost_shader_state *vs, *fs;
1707 unsigned int num_gen_varyings = 0;
1708 size_t vs_size, fs_size;
1709
1710 /* Allocate the varying descriptor */
1711
1712 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1713 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1714 vs_size = sizeof(struct mali_attr_meta) * vs->varying_count;
1715 fs_size = sizeof(struct mali_attr_meta) * fs->varying_count;
1716
1717 struct panfrost_transfer trans = panfrost_allocate_transient(batch,
1718 vs_size +
1719 fs_size);
1720
1721 struct pipe_stream_output_info *so = &vs->stream_output;
1722
1723 /* Check if this varying is linked by us. This is the case for
1724 * general-purpose, non-captured varyings. If it is, link it. If it's
1725 * not, use the provided stream out information to determine the
1726 * offset, since it was already linked for us. */
1727
1728 for (unsigned i = 0; i < vs->varying_count; i++) {
1729 gl_varying_slot loc = vs->varyings_loc[i];
1730
1731 bool special = is_special_varying(loc);
1732 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1733
1734 if (captured) {
1735 struct pipe_stream_output *o = pan_get_so(so, loc);
1736
1737 unsigned dst_offset = o->dst_offset * 4; /* dwords */
1738 vs->varyings[i].src_offset = dst_offset;
1739 } else if (!special) {
1740 vs->varyings[i].src_offset = 16 * (num_gen_varyings++);
1741 }
1742 }
1743
1744 /* Conversely, we need to set src_offset for the captured varyings.
1745 * Here, the layout is defined by the stream out info, not us */
1746
1747 /* Link up with fragment varyings */
1748 bool reads_point_coord = fs->reads_point_coord;
1749
1750 for (unsigned i = 0; i < fs->varying_count; i++) {
1751 gl_varying_slot loc = fs->varyings_loc[i];
1752 unsigned src_offset;
1753 signed vs_idx = -1;
1754
1755 /* Link up */
1756 for (unsigned j = 0; j < vs->varying_count; ++j) {
1757 if (vs->varyings_loc[j] == loc) {
1758 vs_idx = j;
1759 break;
1760 }
1761 }
1762
1763 /* Either assign or reuse */
1764 if (vs_idx >= 0)
1765 src_offset = vs->varyings[vs_idx].src_offset;
1766 else
1767 src_offset = 16 * (num_gen_varyings++);
1768
1769 fs->varyings[i].src_offset = src_offset;
1770
1771 if (has_point_coord(fs->point_sprite_mask, loc))
1772 reads_point_coord = true;
1773 }
1774
1775 memcpy(trans.cpu, vs->varyings, vs_size);
1776 memcpy(trans.cpu + vs_size, fs->varyings, fs_size);
1777
1778 union mali_attr varyings[PIPE_MAX_ATTRIBS] = {0};
1779
1780 /* Figure out how many streamout buffers could be bound */
1781 unsigned so_count = ctx->streamout.num_targets;
1782 for (unsigned i = 0; i < vs->varying_count; i++) {
1783 gl_varying_slot loc = vs->varyings_loc[i];
1784
1785 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1786 if (!captured) continue;
1787
1788 struct pipe_stream_output *o = pan_get_so(so, loc);
1789 so_count = MAX2(so_count, o->output_buffer + 1);
1790 }
1791
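        /* Lay out the attribute buffer indices: stream out buffers first, then
         * the general varying buffer, then one slot per special varying in use */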
1792 signed idx = so_count;
1793 signed general = idx++;
1794 signed gl_Position = idx++;
1795 signed gl_PointSize = vs->writes_point_size ? (idx++) : -1;
1796 signed gl_PointCoord = reads_point_coord ? (idx++) : -1;
1797 signed gl_FrontFacing = fs->reads_face ? (idx++) : -1;
1798 signed gl_FragCoord = fs->reads_frag_coord ? (idx++) : -1;
1799
1800 /* Emit the stream out buffers */
1801
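        /* Work out how many vertices stream out will actually write for this
         * primitive type and draw size */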
1802 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1803 ctx->vertex_count);
1804
1805 for (unsigned i = 0; i < so_count; ++i) {
1806 if (i < ctx->streamout.num_targets) {
1807 panfrost_emit_streamout(batch, &varyings[i],
1808 so->stride[i],
1809 ctx->streamout.offsets[i],
1810 out_count,
1811 ctx->streamout.targets[i]);
1812 } else {
1813 /* Emit a dummy buffer */
1814 panfrost_emit_varyings(batch, &varyings[i],
1815 so->stride[i] * 4,
1816 out_count);
1817
1818 /* Clear the attribute type */
1819 varyings[i].elements &= ~0xF;
1820 }
1821 }
1822
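        /* Emit the general varying buffer, 16 bytes per linked varying per vertex */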
1823 panfrost_emit_varyings(batch, &varyings[general],
1824 num_gen_varyings * 16,
1825 vertex_count);
1826
1827 mali_ptr varyings_p;
1828
1829 /* fp32 vec4 gl_Position */
1830 varyings_p = panfrost_emit_varyings(batch, &varyings[gl_Position],
1831 sizeof(float) * 4, vertex_count);
1832 tiler_postfix->position_varying = varyings_p;
1833
1834
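        /* 16-bit gl_PointSize per vertex */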
1835 if (panfrost_writes_point_size(ctx)) {
1836 varyings_p = panfrost_emit_varyings(batch,
1837 &varyings[gl_PointSize],
1838 2, vertex_count);
1839 primitive_size->pointer = varyings_p;
1840 }
1841
1842 if (reads_point_coord)
1843 varyings[gl_PointCoord].elements = MALI_VARYING_POINT_COORD;
1844
1845 if (fs->reads_face)
1846 varyings[gl_FrontFacing].elements = MALI_VARYING_FRONT_FACING;
1847
1848 if (fs->reads_frag_coord)
1849 varyings[gl_FragCoord].elements = MALI_VARYING_FRAG_COORD;
1850
1851 struct panfrost_device *device = pan_device(ctx->base.screen);
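        /* Reading gl_PointCoord is not hooked up on Bifrost yet */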
1852 assert(!(device->quirks & IS_BIFROST) || !(reads_point_coord));
1853
1854 /* Let's go ahead and link varying meta to the buffer in question, now
1855 * that that information is available. VARYING_SLOT_POS is mapped to
1856          * gl_FragCoord for fragment shaders but gl_Position for vertex
1857          * shaders */
1858
1859 panfrost_emit_varying_meta(trans.cpu, vs, general, gl_Position,
1860 gl_PointSize, gl_PointCoord,
1861 gl_FrontFacing);
1862
1863 panfrost_emit_varying_meta(trans.cpu + vs_size, fs, general,
1864 gl_FragCoord, gl_PointSize,
1865 gl_PointCoord, gl_FrontFacing);
1866
1867 /* Replace streamout */
1868
1869 struct mali_attr_meta *ovs = (struct mali_attr_meta *)trans.cpu;
1870 struct mali_attr_meta *ofs = ovs + vs->varying_count;
1871
1872 for (unsigned i = 0; i < vs->varying_count; i++) {
1873 gl_varying_slot loc = vs->varyings_loc[i];
1874
1875 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1876 if (!captured)
1877 continue;
1878
1879 struct pipe_stream_output *o = pan_get_so(so, loc);
1880 ovs[i].index = o->output_buffer;
1881
1882 assert(o->stream == 0);
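                /* Override the channel count of the recorded format to match
                 * the number of captured components */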
1883 ovs[i].format = (vs->varyings[i].format & ~MALI_NR_CHANNELS(4))
1884 | MALI_NR_CHANNELS(o->num_components);
1885
1886 if (device->quirks & HAS_SWIZZLES)
1887 ovs[i].swizzle = panfrost_get_default_swizzle(o->num_components);
1888 else
1889 ovs[i].swizzle = panfrost_bifrost_swizzle(o->num_components);
1890
1891 /* Link to the fragment */
1892 signed fs_idx = -1;
1893
1894 /* Link up */
1895 for (unsigned j = 0; j < fs->varying_count; ++j) {
1896 if (fs->varyings_loc[j] == loc) {
1897 fs_idx = j;
1898 break;
1899 }
1900 }
1901
1902 if (fs_idx >= 0) {
1903 ofs[fs_idx].index = ovs[i].index;
1904 ofs[fs_idx].format = ovs[i].format;
1905 ofs[fs_idx].swizzle = ovs[i].swizzle;
1906 }
1907 }
1908
1909 /* Replace point sprite */
1910 for (unsigned i = 0; i < fs->varying_count; i++) {
1911                 /* If we have a point sprite replacement, handle that here. We
1912                  * have to translate the location first. TODO: flip Y in the
1913                  * shader instead; we already key shaders for this, it's just a time crunch */
1914
1915 if (has_point_coord(fs->point_sprite_mask,
1916 fs->varyings_loc[i])) {
1917 ofs[i].index = gl_PointCoord;
1918
1919 /* Swizzle out the z/w to 0/1 */
1920 ofs[i].format = MALI_RG16F;
1921 ofs[i].swizzle = panfrost_get_default_swizzle(2);
1922 }
1923 }
1924
1925 /* Fix up unaligned addresses */
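        /* The record only keeps a 64-byte aligned address, so fold any
         * misalignment into each affected varying's src_offset instead */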
1926 for (unsigned i = 0; i < so_count; ++i) {
1927 if (varyings[i].elements < MALI_RECORD_SPECIAL)
1928 continue;
1929
1930 unsigned align = (varyings[i].elements & 63);
1931
1932 /* While we're at it, the SO buffers are linear */
1933
1934 if (!align) {
1935 varyings[i].elements |= MALI_ATTR_LINEAR;
1936 continue;
1937 }
1938
1939 /* We need to adjust alignment */
1940 varyings[i].elements &= ~63;
1941 varyings[i].elements |= MALI_ATTR_LINEAR;
1942 varyings[i].size += align;
1943
1944 for (unsigned v = 0; v < vs->varying_count; ++v) {
1945 if (ovs[v].index != i)
1946 continue;
1947
1948 ovs[v].src_offset = vs->varyings[v].src_offset + align;
1949 }
1950
1951 for (unsigned f = 0; f < fs->varying_count; ++f) {
1952 if (ofs[f].index != i)
1953 continue;
1954
1955 ofs[f].src_offset = fs->varyings[f].src_offset + align;
1956 }
1957 }
1958
1959 varyings_p = panfrost_upload_transient(batch, varyings,
1960 idx * sizeof(*varyings));
1961 vertex_postfix->varyings = varyings_p;
1962 tiler_postfix->varyings = varyings_p;
1963
1964 vertex_postfix->varying_meta = trans.gpu;
1965 tiler_postfix->varying_meta = trans.gpu + vs_size;
1966 }
1967
1968 void
1969 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
1970 struct mali_vertex_tiler_prefix *vertex_prefix,
1971 struct mali_vertex_tiler_postfix *vertex_postfix,
1972 struct mali_vertex_tiler_prefix *tiler_prefix,
1973 struct mali_vertex_tiler_postfix *tiler_postfix,
1974 union midgard_primitive_size *primitive_size)
1975 {
1976 struct panfrost_context *ctx = batch->ctx;
1977 struct panfrost_device *device = pan_device(ctx->base.screen);
1978 bool wallpapering = ctx->wallpaper_batch && batch->tiler_dep;
1979 struct bifrost_payload_vertex bifrost_vertex = {0,};
1980 struct bifrost_payload_tiler bifrost_tiler = {0,};
1981 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
1982 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
1983 void *vp, *tp;
1984 size_t vp_size, tp_size;
1985
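        /* Pack the shared prefix/postfix descriptors into the payload layout
         * for this GPU generation */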
1986 if (device->quirks & IS_BIFROST) {
1987 bifrost_vertex.prefix = *vertex_prefix;
1988 bifrost_vertex.postfix = *vertex_postfix;
1989 vp = &bifrost_vertex;
1990 vp_size = sizeof(bifrost_vertex);
1991
1992 bifrost_tiler.prefix = *tiler_prefix;
1993 bifrost_tiler.tiler.primitive_size = *primitive_size;
1994 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
1995 bifrost_tiler.postfix = *tiler_postfix;
1996 tp = &bifrost_tiler;
1997 tp_size = sizeof(bifrost_tiler);
1998 } else {
1999 midgard_vertex.prefix = *vertex_prefix;
2000 midgard_vertex.postfix = *vertex_postfix;
2001 vp = &midgard_vertex;
2002 vp_size = sizeof(midgard_vertex);
2003
2004 midgard_tiler.prefix = *tiler_prefix;
2005 midgard_tiler.postfix = *tiler_postfix;
2006 midgard_tiler.primitive_size = *primitive_size;
2007 tp = &midgard_tiler;
2008 tp_size = sizeof(midgard_tiler);
2009 }
2010
2011 if (wallpapering) {
2012 /* Inject in reverse order, with "predicted" job indices.
2013 * THIS IS A HACK XXX */
2014 panfrost_new_job(batch, JOB_TYPE_TILER, false,
2015 batch->job_index + 2, tp, tp_size, true);
2016 panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
2017 vp, vp_size, true);
2018 return;
2019 }
2020
2021         /* If rasterizer discard is enabled, only submit the vertex job */
2022
2023 bool rasterizer_discard = ctx->rasterizer &&
2024 ctx->rasterizer->base.rasterizer_discard;
2025
2026 unsigned vertex = panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
2027 vp, vp_size, false);
2028
2029 if (rasterizer_discard)
2030 return;
2031
2032 panfrost_new_job(batch, JOB_TYPE_TILER, false, vertex, tp, tp_size,
2033 false);
2034 }
2035
2036 /* TODO: stop hardcoding this */
2037 mali_ptr
2038 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2039 {
2040 uint16_t locations[] = {
2041 128, 128,
2042 0, 256,
2043 0, 256,
2044 0, 256,
2045 0, 256,
2046 0, 256,
2047 0, 256,
2048 0, 256,
2049 0, 256,
2050 0, 256,
2051 0, 256,
2052 0, 256,
2053 0, 256,
2054 0, 256,
2055 0, 256,
2056 0, 256,
2057 0, 256,
2058 0, 256,
2059 0, 256,
2060 0, 256,
2061 0, 256,
2062 0, 256,
2063 0, 256,
2064 0, 256,
2065 0, 256,
2066 0, 256,
2067 0, 256,
2068 0, 256,
2069 0, 256,
2070 0, 256,
2071 0, 256,
2072 0, 256,
2073 128, 128,
2074 0, 0,
2075 0, 0,
2076 0, 0,
2077 0, 0,
2078 0, 0,
2079 0, 0,
2080 0, 0,
2081 0, 0,
2082 0, 0,
2083 0, 0,
2084 0, 0,
2085 0, 0,
2086 0, 0,
2087 0, 0,
2088 0, 0,
2089 };
2090
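        /* 48 hardcoded (x, y) pairs, 96 halfwords total */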
2091 return panfrost_upload_transient(batch, locations, 96 * sizeof(uint16_t));
2092 }