panfrost: Limit blend shader work count
[mesa.git] / src / gallium / drivers / panfrost / pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_allocate.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 unsigned shift = panfrost_get_stack_shift(batch->stack_size);
62 struct mali_shared_memory shared = {
63 .stack_shift = shift,
64 .scratchpad = panfrost_batch_get_scratchpad(batch, shift, dev->thread_tls_alloc, dev->core_count)->gpu,
65 .shared_workgroup_count = ~0,
66 };
67 postfix->shared_memory = panfrost_upload_transient(batch, &shared, sizeof(shared));
68 }
69
70 static void
71 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
72 struct mali_vertex_tiler_postfix *postfix)
73 {
74 struct panfrost_device *dev = pan_device(ctx->base.screen);
75 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
76
77 /* If we haven't, reserve space for the framebuffer */
78
79 if (!batch->framebuffer.gpu) {
80 unsigned size = (dev->quirks & MIDGARD_SFBD) ?
81 sizeof(struct mali_single_framebuffer) :
82 sizeof(struct mali_framebuffer);
83
84 batch->framebuffer = panfrost_allocate_transient(batch, size);
85
86 /* Tag the pointer */
87 if (!(dev->quirks & MIDGARD_SFBD))
88 batch->framebuffer.gpu |= MALI_MFBD;
89 }
90
91 postfix->shared_memory = batch->framebuffer.gpu;
92 }
93
94 static void
95 panfrost_vt_update_rasterizer(struct panfrost_context *ctx,
96 struct mali_vertex_tiler_prefix *prefix,
97 struct mali_vertex_tiler_postfix *postfix)
98 {
99 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
100
101 postfix->gl_enables |= 0x7;
102 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
103 rasterizer && rasterizer->base.front_ccw);
104 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
105 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_FRONT));
106 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
107 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_BACK));
108 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
109 rasterizer && rasterizer->base.flatshade_first);
110 }
111
112 void
113 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
114 struct mali_vertex_tiler_prefix *prefix,
115 union midgard_primitive_size *primitive_size)
116 {
117 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
118
119 if (!panfrost_writes_point_size(ctx)) {
120 bool points = prefix->draw_mode == MALI_POINTS;
121 float val = 0.0f;
122
123 if (rasterizer)
124 val = points ?
125 rasterizer->base.point_size :
126 rasterizer->base.line_width;
127
128 primitive_size->constant = val;
129 }
130 }
131
132 static void
133 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
134 struct mali_vertex_tiler_postfix *postfix)
135 {
136 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
137 if (ctx->occlusion_query)
138 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
139 else
140 postfix->occlusion_counter = 0;
141 }
142
143 void
144 panfrost_vt_init(struct panfrost_context *ctx,
145 enum pipe_shader_type stage,
146 struct mali_vertex_tiler_prefix *prefix,
147 struct mali_vertex_tiler_postfix *postfix)
148 {
149 struct panfrost_device *device = pan_device(ctx->base.screen);
150
151 if (!ctx->shader[stage])
152 return;
153
154 memset(prefix, 0, sizeof(*prefix));
155 memset(postfix, 0, sizeof(*postfix));
156
157 if (device->quirks & IS_BIFROST) {
158 postfix->gl_enables = 0x2;
159 panfrost_vt_emit_shared_memory(ctx, postfix);
160 } else {
161 postfix->gl_enables = 0x6;
162 panfrost_vt_attach_framebuffer(ctx, postfix);
163 }
164
165 if (stage == PIPE_SHADER_FRAGMENT) {
166 panfrost_vt_update_occlusion_query(ctx, postfix);
167 panfrost_vt_update_rasterizer(ctx, prefix, postfix);
168 }
169 }
170
171 static unsigned
172 panfrost_translate_index_size(unsigned size)
173 {
174 switch (size) {
175 case 1:
176 return MALI_DRAW_INDEXED_UINT8;
177
178 case 2:
179 return MALI_DRAW_INDEXED_UINT16;
180
181 case 4:
182 return MALI_DRAW_INDEXED_UINT32;
183
184 default:
185 unreachable("Invalid index size");
186 }
187 }
188
189 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
190 * good for the duration of the draw (transient), could last longer. Also get
191 * the bounds on the index buffer for the range accessed by the draw. We do
192 * these operations together because there are natural optimizations which
193 * require them to be together. */
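/* A note on usage (grounded in panfrost_vt_set_draw_info below): the computed
 * [min, max] range determines the vertex count (max - min + 1) and the
 * offset_bias_correction applied to the draw. */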
194
195 static mali_ptr
196 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
197 const struct pipe_draw_info *info,
198 unsigned *min_index, unsigned *max_index)
199 {
200 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
201 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
202 off_t offset = info->start * info->index_size;
203 bool needs_indices = true;
204 mali_ptr out = 0;
205
206 if (info->max_index != ~0u) {
207 *min_index = info->min_index;
208 *max_index = info->max_index;
209 needs_indices = false;
210 }
211
212 if (!info->has_user_indices) {
213 /* Only resources can be directly mapped */
214 panfrost_batch_add_bo(batch, rsrc->bo,
215 PAN_BO_ACCESS_SHARED |
216 PAN_BO_ACCESS_READ |
217 PAN_BO_ACCESS_VERTEX_TILER);
218 out = rsrc->bo->gpu + offset;
219
220 /* Check the cache */
221 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
222 info->start,
223 info->count,
224 min_index,
225 max_index);
226 } else {
227 /* Otherwise, we need to upload to transient memory */
228 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
229 out = panfrost_upload_transient(batch, ibuf8 + offset,
230 info->count *
231 info->index_size);
232 }
233
234 if (needs_indices) {
235 /* Fallback */
236 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
237
238 if (!info->has_user_indices)
239 panfrost_minmax_cache_add(rsrc->index_cache,
240 info->start, info->count,
241 *min_index, *max_index);
242 }
243
244 return out;
245 }
246
247 void
248 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
249 const struct pipe_draw_info *info,
250 enum mali_draw_mode draw_mode,
251 struct mali_vertex_tiler_postfix *vertex_postfix,
252 struct mali_vertex_tiler_prefix *tiler_prefix,
253 struct mali_vertex_tiler_postfix *tiler_postfix,
254 unsigned *vertex_count,
255 unsigned *padded_count)
256 {
257 tiler_prefix->draw_mode = draw_mode;
258
259 unsigned draw_flags = 0;
260
261 if (panfrost_writes_point_size(ctx))
262 draw_flags |= MALI_DRAW_VARYING_SIZE;
263
264 if (info->primitive_restart)
265 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
266
267 /* These don't make much sense */
268
269 draw_flags |= 0x3000;
270
271 if (info->index_size) {
272 unsigned min_index = 0, max_index = 0;
273
274 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
275 info,
276 &min_index,
277 &max_index);
278
279 /* Use the corresponding values */
280 *vertex_count = max_index - min_index + 1;
281 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
282 tiler_prefix->offset_bias_correction = -min_index;
283 tiler_prefix->index_count = MALI_POSITIVE(info->count);
284 draw_flags |= panfrost_translate_index_size(info->index_size);
285 } else {
286 tiler_prefix->indices = 0;
287 *vertex_count = ctx->vertex_count;
288 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
289 tiler_prefix->offset_bias_correction = 0;
290 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
291 }
292
293 tiler_prefix->unknown_draw = draw_flags;
294
295 /* Encode the padded vertex count */
296
297 if (info->instance_count > 1) {
298 *padded_count = panfrost_padded_vertex_count(*vertex_count);
299
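/* Factor the padded count as (2k + 1) << shift, i.e. an odd number times a
 * power of two; e.g. 12 = 3 << 2 yields shift = 2, k = 1. */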
300 unsigned shift = __builtin_ctz(ctx->padded_count);
301 unsigned k = ctx->padded_count >> (shift + 1);
302
303 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
304 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
305 } else {
306 *padded_count = *vertex_count;
307
308 /* Reset instancing state */
309 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
310 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
311 }
312 }
313
314 static void
315 panfrost_shader_meta_init(struct panfrost_context *ctx,
316 enum pipe_shader_type st,
317 struct mali_shader_meta *meta)
318 {
319 const struct panfrost_device *dev = pan_device(ctx->base.screen);
320 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
321
322 memset(meta, 0, sizeof(*meta));
323 meta->shader = (ss->bo ? ss->bo->gpu : 0) | ss->first_tag;
324 meta->attribute_count = ss->attribute_count;
325 meta->varying_count = ss->varying_count;
326 meta->texture_count = ctx->sampler_view_count[st];
327 meta->sampler_count = ctx->sampler_count[st];
328
329 if (dev->quirks & IS_BIFROST) {
330 if (st == PIPE_SHADER_VERTEX)
331 meta->bifrost1.unk1 = 0x800000;
332 else {
333 /* First clause ATEST |= 0x4000000.
334 * Less than 32 regs |= 0x200 */
335 meta->bifrost1.unk1 = 0x958020;
336 }
337
338 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
339 if (st == PIPE_SHADER_VERTEX)
340 meta->bifrost2.preload_regs = 0xC0;
341 else
342 meta->bifrost2.preload_regs = 0x1;
343 meta->bifrost2.uniform_count = MIN2(ss->uniform_count,
344 ss->uniform_cutoff);
345 } else {
346 meta->midgard1.uniform_count = MIN2(ss->uniform_count,
347 ss->uniform_cutoff);
348 meta->midgard1.work_count = ss->work_reg_count;
349 meta->midgard1.flags_hi = 0x8; /* XXX */
350 meta->midgard1.flags_lo = 0x220;
351 meta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
352 }
353 }
354
355 static unsigned
356 panfrost_translate_compare_func(enum pipe_compare_func in)
357 {
358 switch (in) {
359 case PIPE_FUNC_NEVER:
360 return MALI_FUNC_NEVER;
361
362 case PIPE_FUNC_LESS:
363 return MALI_FUNC_LESS;
364
365 case PIPE_FUNC_EQUAL:
366 return MALI_FUNC_EQUAL;
367
368 case PIPE_FUNC_LEQUAL:
369 return MALI_FUNC_LEQUAL;
370
371 case PIPE_FUNC_GREATER:
372 return MALI_FUNC_GREATER;
373
374 case PIPE_FUNC_NOTEQUAL:
375 return MALI_FUNC_NOTEQUAL;
376
377 case PIPE_FUNC_GEQUAL:
378 return MALI_FUNC_GEQUAL;
379
380 case PIPE_FUNC_ALWAYS:
381 return MALI_FUNC_ALWAYS;
382
383 default:
384 unreachable("Invalid func");
385 }
386 }
387
388 static unsigned
389 panfrost_translate_stencil_op(enum pipe_stencil_op in)
390 {
391 switch (in) {
392 case PIPE_STENCIL_OP_KEEP:
393 return MALI_STENCIL_KEEP;
394
395 case PIPE_STENCIL_OP_ZERO:
396 return MALI_STENCIL_ZERO;
397
398 case PIPE_STENCIL_OP_REPLACE:
399 return MALI_STENCIL_REPLACE;
400
401 case PIPE_STENCIL_OP_INCR:
402 return MALI_STENCIL_INCR;
403
404 case PIPE_STENCIL_OP_DECR:
405 return MALI_STENCIL_DECR;
406
407 case PIPE_STENCIL_OP_INCR_WRAP:
408 return MALI_STENCIL_INCR_WRAP;
409
410 case PIPE_STENCIL_OP_DECR_WRAP:
411 return MALI_STENCIL_DECR_WRAP;
412
413 case PIPE_STENCIL_OP_INVERT:
414 return MALI_STENCIL_INVERT;
415
416 default:
417 unreachable("Invalid stencil op");
418 }
419 }
420
421 static unsigned
422 translate_tex_wrap(enum pipe_tex_wrap w)
423 {
424 switch (w) {
425 case PIPE_TEX_WRAP_REPEAT:
426 return MALI_WRAP_REPEAT;
427
428 case PIPE_TEX_WRAP_CLAMP:
429 return MALI_WRAP_CLAMP;
430
431 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
432 return MALI_WRAP_CLAMP_TO_EDGE;
433
434 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
435 return MALI_WRAP_CLAMP_TO_BORDER;
436
437 case PIPE_TEX_WRAP_MIRROR_REPEAT:
438 return MALI_WRAP_MIRRORED_REPEAT;
439
440 case PIPE_TEX_WRAP_MIRROR_CLAMP:
441 return MALI_WRAP_MIRRORED_CLAMP;
442
443 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
444 return MALI_WRAP_MIRRORED_CLAMP_TO_EDGE;
445
446 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
447 return MALI_WRAP_MIRRORED_CLAMP_TO_BORDER;
448
449 default:
450 unreachable("Invalid wrap");
451 }
452 }
453
454 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
455 struct mali_sampler_descriptor *hw)
456 {
457 unsigned func = panfrost_translate_compare_func(cso->compare_func);
458 bool min_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
459 bool mag_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
460 bool mip_linear = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR;
461 unsigned min_filter = min_nearest ? MALI_SAMP_MIN_NEAREST : 0;
462 unsigned mag_filter = mag_nearest ? MALI_SAMP_MAG_NEAREST : 0;
463 unsigned mip_filter = mip_linear ?
464 (MALI_SAMP_MIP_LINEAR_1 | MALI_SAMP_MIP_LINEAR_2) : 0;
465 unsigned normalized = cso->normalized_coords ? MALI_SAMP_NORM_COORDS : 0;
466
467 *hw = (struct mali_sampler_descriptor) {
468 .filter_mode = min_filter | mag_filter | mip_filter |
469 normalized,
470 .wrap_s = translate_tex_wrap(cso->wrap_s),
471 .wrap_t = translate_tex_wrap(cso->wrap_t),
472 .wrap_r = translate_tex_wrap(cso->wrap_r),
473 .compare_func = panfrost_flip_compare_func(func),
474 .border_color = {
475 cso->border_color.f[0],
476 cso->border_color.f[1],
477 cso->border_color.f[2],
478 cso->border_color.f[3]
479 },
480 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
481 .max_lod = FIXED_16(cso->max_lod, false),
482 .lod_bias = FIXED_16(cso->lod_bias, true), /* can be negative */
483 .seamless_cube_map = cso->seamless_cube_map,
484 };
485
486 /* If necessary, we disable mipmapping in the sampler descriptor by
487 * clamping the LOD as tight as possible (from 0 to epsilon,
488 * essentially -- remember these are fixed point numbers, so
489 * epsilon=1/256) */
490
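/* One fixed-point step (the '+ 1' below) is the 1/256 epsilon described
 * above. */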
491 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
492 hw->max_lod = hw->min_lod + 1;
493 }
494
495 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
496 struct bifrost_sampler_descriptor *hw)
497 {
498 *hw = (struct bifrost_sampler_descriptor) {
499 .unk1 = 0x1,
500 .wrap_s = translate_tex_wrap(cso->wrap_s),
501 .wrap_t = translate_tex_wrap(cso->wrap_t),
502 .wrap_r = translate_tex_wrap(cso->wrap_r),
503 .unk8 = 0x8,
504 .min_filter = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST,
505 .norm_coords = cso->normalized_coords,
506 .mip_filter = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR,
507 .mag_filter = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR,
508 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
509 .max_lod = FIXED_16(cso->max_lod, false),
510 };
511
512 /* If necessary, we disable mipmapping in the sampler descriptor by
513 * clamping the LOD as tight as possible (from 0 to epsilon,
514 * essentially -- remember these are fixed point numbers, so
515 * epsilon=1/256) */
516
517 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
518 hw->max_lod = hw->min_lod + 1;
519 }
520
521 static void
522 panfrost_make_stencil_state(const struct pipe_stencil_state *in,
523 struct mali_stencil_test *out)
524 {
525 out->ref = 0; /* Gallium gets it from elsewhere */
526
527 out->mask = in->valuemask;
528 out->func = panfrost_translate_compare_func(in->func);
529 out->sfail = panfrost_translate_stencil_op(in->fail_op);
530 out->dpfail = panfrost_translate_stencil_op(in->zfail_op);
531 out->dppass = panfrost_translate_stencil_op(in->zpass_op);
532 }
533
534 static void
535 panfrost_frag_meta_rasterizer_update(struct panfrost_context *ctx,
536 struct mali_shader_meta *fragmeta)
537 {
538 if (!ctx->rasterizer) {
539 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, true);
540 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, false);
541 fragmeta->depth_units = 0.0f;
542 fragmeta->depth_factor = 0.0f;
543 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, false);
544 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, false);
545 return;
546 }
547
548 bool msaa = ctx->rasterizer->base.multisample;
549
550 /* TODO: Sample size */
551 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
552 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
553 fragmeta->depth_units = ctx->rasterizer->base.offset_units * 2.0f;
554 fragmeta->depth_factor = ctx->rasterizer->base.offset_scale;
555
556 /* XXX: Which bit is which? Does this maybe allow offsetting non-tris? */
557
558 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A,
559 ctx->rasterizer->base.offset_tri);
560 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B,
561 ctx->rasterizer->base.offset_tri);
562 }
563
564 static void
565 panfrost_frag_meta_zsa_update(struct panfrost_context *ctx,
566 struct mali_shader_meta *fragmeta)
567 {
568 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
569 int zfunc = PIPE_FUNC_ALWAYS;
570
571 if (!zsa) {
572 struct pipe_stencil_state default_stencil = {
573 .enabled = 0,
574 .func = PIPE_FUNC_ALWAYS,
575 .fail_op = MALI_STENCIL_KEEP,
576 .zfail_op = MALI_STENCIL_KEEP,
577 .zpass_op = MALI_STENCIL_KEEP,
578 .writemask = 0xFF,
579 .valuemask = 0xFF
580 };
581
582 panfrost_make_stencil_state(&default_stencil,
583 &fragmeta->stencil_front);
584 fragmeta->stencil_mask_front = default_stencil.writemask;
585 fragmeta->stencil_back = fragmeta->stencil_front;
586 fragmeta->stencil_mask_back = default_stencil.writemask;
587 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST, false);
588 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK, false);
589 } else {
590 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
591 zsa->stencil[0].enabled);
592 panfrost_make_stencil_state(&zsa->stencil[0],
593 &fragmeta->stencil_front);
594 fragmeta->stencil_mask_front = zsa->stencil[0].writemask;
595 fragmeta->stencil_front.ref = ctx->stencil_ref.ref_value[0];
596
597 /* If back-stencil is not enabled, use the front values */
598
599 if (zsa->stencil[1].enabled) {
600 panfrost_make_stencil_state(&zsa->stencil[1],
601 &fragmeta->stencil_back);
602 fragmeta->stencil_mask_back = zsa->stencil[1].writemask;
603 fragmeta->stencil_back.ref = ctx->stencil_ref.ref_value[1];
604 } else {
605 fragmeta->stencil_back = fragmeta->stencil_front;
606 fragmeta->stencil_mask_back = fragmeta->stencil_mask_front;
607 fragmeta->stencil_back.ref = fragmeta->stencil_front.ref;
608 }
609
610 if (zsa->depth.enabled)
611 zfunc = zsa->depth.func;
612
613 /* Depth state (TODO: Refactor) */
614
615 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
616 zsa->depth.writemask);
617 }
618
619 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
620 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(zfunc));
621 }
622
623 static void
624 panfrost_frag_meta_blend_update(struct panfrost_context *ctx,
625 struct mali_shader_meta *fragmeta,
626 void *rts)
627 {
628 const struct panfrost_device *dev = pan_device(ctx->base.screen);
629
630 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
631 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
632 !ctx->blend->base.dither);
633
634 /* Get blending setup */
635 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
636
637 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
638 unsigned shader_offset = 0;
639 struct panfrost_bo *shader_bo = NULL;
640
641 for (unsigned c = 0; c < rt_count; ++c)
642 blend[c] = panfrost_get_blend_for_context(ctx, c, &shader_bo,
643 &shader_offset);
644
645 /* If there is a blend shader, work registers are shared. We impose 8
646 * work registers as a limit for blend shaders. Should be lower XXX */
647
648 if (!(dev->quirks & IS_BIFROST)) {
649 for (unsigned c = 0; c < rt_count; ++c) {
650 if (blend[c].is_shader) {
651 fragmeta->midgard1.work_count =
652 MAX2(fragmeta->midgard1.work_count, 8);
653 }
654 }
655 }
656
657 /* Even on MFBD, the shader descriptor gets blend shaders. It's *also*
658 * copied to the blend_meta appended (by convention), but this is the
659 * field actually read by the hardware. (Or maybe both are read...?).
660 * Specify the last RTi with a blend shader. */
661
662 fragmeta->blend.shader = 0;
663
664 for (signed rt = (rt_count - 1); rt >= 0; --rt) {
665 if (!blend[rt].is_shader)
666 continue;
667
668 fragmeta->blend.shader = blend[rt].shader.gpu |
669 blend[rt].shader.first_tag;
670 break;
671 }
672
673 if (dev->quirks & MIDGARD_SFBD) {
674 /* On single render target (SFBD) platforms, the blend
675 * information is inside the shader meta itself. We additionally
676 * need to signal CAN_DISCARD for nontrivial blend modes (so
677 * we're able to read back the destination buffer) */
678
679 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
680 blend[0].is_shader);
681
682 if (!blend[0].is_shader) {
683 fragmeta->blend.equation = *blend[0].equation.equation;
684 fragmeta->blend.constant = blend[0].equation.constant;
685 }
686
687 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
688 !blend[0].no_blending);
689 return;
690 }
691
692 /* Additional blend descriptor tacked on for jobs using MFBD */
693
694 for (unsigned i = 0; i < rt_count; ++i) {
695 if (dev->quirks & IS_BIFROST) {
696 struct bifrost_blend_rt *brts = rts;
697 struct panfrost_shader_state *fs;
698 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
699
700 brts[i].flags = 0x200;
701 if (blend[i].is_shader) {
702 /* The blend shader's address needs to be at
703 * the same top 32 bit as the fragment shader.
704 * TODO: Ensure that's always the case.
705 */
706 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
707 (fs->bo->gpu & (0xffffffffull << 32)));
708 brts[i].shader = blend[i].shader.gpu;
709 brts[i].unk2 = 0x0;
710 } else {
711 enum pipe_format format = ctx->pipe_framebuffer.cbufs[i]->format;
712 const struct util_format_description *format_desc;
713 format_desc = util_format_description(format);
714
715 brts[i].equation = *blend[i].equation.equation;
716
717 /* TODO: this is a bit more complicated */
718 brts[i].constant = blend[i].equation.constant;
719
720 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
721 brts[i].unk2 = 0x19;
722
723 brts[i].shader_type = fs->blend_types[i];
724 }
725 } else {
726 struct midgard_blend_rt *mrts = rts;
727
728 mrts[i].flags = 0x200;
729
730 bool is_srgb = (ctx->pipe_framebuffer.nr_cbufs > i) &&
731 (ctx->pipe_framebuffer.cbufs[i]) &&
732 util_format_is_srgb(ctx->pipe_framebuffer.cbufs[i]->format);
733
734 SET_BIT(mrts[i].flags, MALI_BLEND_MRT_SHADER, blend[i].is_shader);
735 SET_BIT(mrts[i].flags, MALI_BLEND_LOAD_TIB, !blend[i].no_blending);
736 SET_BIT(mrts[i].flags, MALI_BLEND_SRGB, is_srgb);
737 SET_BIT(mrts[i].flags, MALI_BLEND_NO_DITHER, !ctx->blend->base.dither);
738
739 if (blend[i].is_shader) {
740 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
741 } else {
742 mrts[i].blend.equation = *blend[i].equation.equation;
743 mrts[i].blend.constant = blend[i].equation.constant;
744 }
745 }
746 }
747 }
748
749 static void
750 panfrost_frag_shader_meta_init(struct panfrost_context *ctx,
751 struct mali_shader_meta *fragmeta,
752 void *rts)
753 {
754 const struct panfrost_device *dev = pan_device(ctx->base.screen);
755 struct panfrost_shader_state *fs;
756
757 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
758
759 fragmeta->alpha_coverage = ~MALI_ALPHA_COVERAGE(0.000000);
760 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x3010;
761 fragmeta->unknown2_4 = 0x4e0;
762
763 /* unknown2_4 has 0x10 bit set on T6XX and T720. We don't know why this
764 * is required (independent of 32-bit/64-bit descriptors), or why it's
765 * not used on later GPU revisions. Otherwise, all shader jobs fault on
766 * these earlier chips (perhaps this is a chicken bit of some kind).
767 * More investigation is needed. */
768
769 SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
770
771 if (dev->quirks & IS_BIFROST) {
772 /* TODO */
773 } else {
774 /* Depending on whether it's legal to do so in the given shader, we try to
775 * enable early-z testing (or forward-pixel kill?) */
776
777 SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
778 !fs->can_discard && !fs->writes_depth);
779
780 /* Add the writes Z/S flags if needed. */
781 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
782 SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
783
784 /* Any time texturing is used, derivatives are implicitly calculated,
785 * so we need to enable helper invocations */
786
787 SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
788 fs->helper_invocations);
789
790 /* CAN_DISCARD should be set if the fragment shader possibly contains a
791 * 'discard' instruction. It is likely this is related to optimizations
792 * related to forward-pixel kill, as per "Mali Performance 3: Is
793 * EGL_BUFFER_PRESERVED a good thing?" by Peter Harris */
794
795 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
796
797 bool depth_enabled = fs->writes_depth ||
798 (zsa && zsa->depth.enabled && zsa->depth.func != PIPE_FUNC_ALWAYS);
799
800 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD, fs->can_discard);
801 SET_BIT(fragmeta->midgard1.flags_lo, 0x400, !depth_enabled && fs->can_discard);
802 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_ZS, depth_enabled && fs->can_discard);
803 }
804
805 panfrost_frag_meta_rasterizer_update(ctx, fragmeta);
806 panfrost_frag_meta_zsa_update(ctx, fragmeta);
807 panfrost_frag_meta_blend_update(ctx, fragmeta, rts);
808 }
809
810 void
811 panfrost_emit_shader_meta(struct panfrost_batch *batch,
812 enum pipe_shader_type st,
813 struct mali_vertex_tiler_postfix *postfix)
814 {
815 struct panfrost_context *ctx = batch->ctx;
816 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
817
818 if (!ss) {
819 postfix->shader = 0;
820 return;
821 }
822
823 struct mali_shader_meta meta;
824
825 panfrost_shader_meta_init(ctx, st, &meta);
826
827 /* Add the shader BO to the batch. */
828 panfrost_batch_add_bo(batch, ss->bo,
829 PAN_BO_ACCESS_PRIVATE |
830 PAN_BO_ACCESS_READ |
831 panfrost_bo_access_for_stage(st));
832
833 mali_ptr shader_ptr;
834
835 if (st == PIPE_SHADER_FRAGMENT) {
836 struct panfrost_device *dev = pan_device(ctx->base.screen);
837 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
838 size_t desc_size = sizeof(meta);
839 void *rts = NULL;
840 struct panfrost_transfer xfer;
841 unsigned rt_size;
842
843 if (dev->quirks & MIDGARD_SFBD)
844 rt_size = 0;
845 else if (dev->quirks & IS_BIFROST)
846 rt_size = sizeof(struct bifrost_blend_rt);
847 else
848 rt_size = sizeof(struct midgard_blend_rt);
849
850 desc_size += rt_size * rt_count;
851
852 if (rt_size)
853 rts = rzalloc_size(ctx, rt_size * rt_count);
854
855 panfrost_frag_shader_meta_init(ctx, &meta, rts);
856
857 xfer = panfrost_allocate_transient(batch, desc_size);
858
859 memcpy(xfer.cpu, &meta, sizeof(meta));
860 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
861
862 if (rt_size)
863 ralloc_free(rts);
864
865 shader_ptr = xfer.gpu;
866 } else {
867 shader_ptr = panfrost_upload_transient(batch, &meta,
868 sizeof(meta));
869 }
870
871 postfix->shader = shader_ptr;
872 }
873
874 static void
875 panfrost_mali_viewport_init(struct panfrost_context *ctx,
876 struct mali_viewport *mvp)
877 {
878 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
879
880 /* Clip bounds are encoded as floats. The viewport itself is encoded as
881 * (somewhat) asymmetric ints. */
882
883 const struct pipe_scissor_state *ss = &ctx->scissor;
884
885 memset(mvp, 0, sizeof(*mvp));
886
887 /* By default, do no viewport clipping, i.e. clip to (-inf, inf) in
888 * each direction. Clipping to the viewport in theory should work, but
889 * in practice causes issues when we're not explicitly trying to
890 * scissor */
891
892 *mvp = (struct mali_viewport) {
893 .clip_minx = -INFINITY,
894 .clip_miny = -INFINITY,
895 .clip_maxx = INFINITY,
896 .clip_maxy = INFINITY,
897 };
898
899 /* Always scissor to the viewport by default. */
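/* Gallium viewports transform NDC via x * scale + translate, so the
 * screen-space extent is translate +/- |scale| on each axis. */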
900 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
901 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
902
903 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
904 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
905
906 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
907 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
908
909 /* Apply the scissor test */
910
911 unsigned minx, miny, maxx, maxy;
912
913 if (ss && ctx->rasterizer && ctx->rasterizer->base.scissor) {
914 minx = MAX2(ss->minx, vp_minx);
915 miny = MAX2(ss->miny, vp_miny);
916 maxx = MIN2(ss->maxx, vp_maxx);
917 maxy = MIN2(ss->maxy, vp_maxy);
918 } else {
919 minx = vp_minx;
920 miny = vp_miny;
921 maxx = vp_maxx;
922 maxy = vp_maxy;
923 }
924
925 /* Hardware needs the min/max to be strictly ordered, so flip if we
926 * need to. The viewport transformation in the vertex shader will
927 * handle the negatives if we don't */
928
929 if (miny > maxy) {
930 unsigned temp = miny;
931 miny = maxy;
932 maxy = temp;
933 }
934
935 if (minx > maxx) {
936 unsigned temp = minx;
937 minx = maxx;
938 maxx = temp;
939 }
940
941 if (minz > maxz) {
942 float temp = minz;
943 minz = maxz;
944 maxz = temp;
945 }
946
947 /* Clamp to the framebuffer size as a last check */
948
949 minx = MIN2(ctx->pipe_framebuffer.width, minx);
950 maxx = MIN2(ctx->pipe_framebuffer.width, maxx);
951
952 miny = MIN2(ctx->pipe_framebuffer.height, miny);
953 maxy = MIN2(ctx->pipe_framebuffer.height, maxy);
954
955 /* Upload */
956
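/* viewport1 holds the inclusive maximum, encoded off-by-one via
 * MALI_POSITIVE; panfrost_emit_viewport adds the 1 back when growing the
 * batch scissor. */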
957 mvp->viewport0[0] = minx;
958 mvp->viewport1[0] = MALI_POSITIVE(maxx);
959
960 mvp->viewport0[1] = miny;
961 mvp->viewport1[1] = MALI_POSITIVE(maxy);
962
963 mvp->clip_minz = minz;
964 mvp->clip_maxz = maxz;
965 }
966
967 void
968 panfrost_emit_viewport(struct panfrost_batch *batch,
969 struct mali_vertex_tiler_postfix *tiler_postfix)
970 {
971 struct panfrost_context *ctx = batch->ctx;
972 struct mali_viewport mvp;
973
974 panfrost_mali_viewport_init(batch->ctx, &mvp);
975
976 /* Update the job, unless we're doing wallpapering (whose lack of
977 * scissor we can ignore, since if we "miss" a tile of wallpaper, it'll
978 * just... be faster :) */
979
980 if (!ctx->wallpaper_batch)
981 panfrost_batch_union_scissor(batch, mvp.viewport0[0],
982 mvp.viewport0[1],
983 mvp.viewport1[0] + 1,
984 mvp.viewport1[1] + 1);
985
986 tiler_postfix->viewport = panfrost_upload_transient(batch, &mvp,
987 sizeof(mvp));
988 }
989
990 static mali_ptr
991 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
992 enum pipe_shader_type st,
993 struct panfrost_constant_buffer *buf,
994 unsigned index)
995 {
996 struct pipe_constant_buffer *cb = &buf->cb[index];
997 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
998
999 if (rsrc) {
1000 panfrost_batch_add_bo(batch, rsrc->bo,
1001 PAN_BO_ACCESS_SHARED |
1002 PAN_BO_ACCESS_READ |
1003 panfrost_bo_access_for_stage(st));
1004
1005 /* Alignment guaranteed by
1006 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
1007 return rsrc->bo->gpu + cb->buffer_offset;
1008 } else if (cb->user_buffer) {
1009 return panfrost_upload_transient(batch,
1010 cb->user_buffer +
1011 cb->buffer_offset,
1012 cb->buffer_size);
1013 } else {
1014 unreachable("No constant buffer");
1015 }
1016 }
1017
1018 struct sysval_uniform {
1019 union {
1020 float f[4];
1021 int32_t i[4];
1022 uint32_t u[4];
1023 uint64_t du[2];
1024 };
1025 };
1026
1027 static void
1028 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
1029 struct sysval_uniform *uniform)
1030 {
1031 struct panfrost_context *ctx = batch->ctx;
1032 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1033
1034 uniform->f[0] = vp->scale[0];
1035 uniform->f[1] = vp->scale[1];
1036 uniform->f[2] = vp->scale[2];
1037 }
1038
1039 static void
1040 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
1041 struct sysval_uniform *uniform)
1042 {
1043 struct panfrost_context *ctx = batch->ctx;
1044 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1045
1046 uniform->f[0] = vp->translate[0];
1047 uniform->f[1] = vp->translate[1];
1048 uniform->f[2] = vp->translate[2];
1049 }
1050
1051 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
1052 enum pipe_shader_type st,
1053 unsigned int sysvalid,
1054 struct sysval_uniform *uniform)
1055 {
1056 struct panfrost_context *ctx = batch->ctx;
1057 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
1058 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
1059 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
1060 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
1061
1062 assert(dim);
1063 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
1064
1065 if (dim > 1)
1066 uniform->i[1] = u_minify(tex->texture->height0,
1067 tex->u.tex.first_level);
1068
1069 if (dim > 2)
1070 uniform->i[2] = u_minify(tex->texture->depth0,
1071 tex->u.tex.first_level);
1072
1073 if (is_array)
1074 uniform->i[dim] = tex->texture->array_size;
1075 }
1076
1077 static void
1078 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
1079 enum pipe_shader_type st,
1080 unsigned ssbo_id,
1081 struct sysval_uniform *uniform)
1082 {
1083 struct panfrost_context *ctx = batch->ctx;
1084
1085 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
1086 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
1087
1088 /* Compute address */
1089 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
1090
1091 panfrost_batch_add_bo(batch, bo,
1092 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
1093 panfrost_bo_access_for_stage(st));
1094
1095 /* Upload address and size as sysval */
1096 uniform->du[0] = bo->gpu + sb.buffer_offset;
1097 uniform->u[2] = sb.buffer_size;
1098 }
1099
1100 static void
1101 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1102 enum pipe_shader_type st,
1103 unsigned samp_idx,
1104 struct sysval_uniform *uniform)
1105 {
1106 struct panfrost_context *ctx = batch->ctx;
1107 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1108
1109 uniform->f[0] = sampl->min_lod;
1110 uniform->f[1] = sampl->max_lod;
1111 uniform->f[2] = sampl->lod_bias;
1112
1113 /* Even without any errata, Midgard represents "no mipmapping" as
1114 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1115 * panfrost_create_sampler_state which also explains our choice of
1116 * epsilon value (again to keep behaviour consistent) */
1117
1118 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1119 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1120 }
1121
1122 static void
1123 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1124 struct sysval_uniform *uniform)
1125 {
1126 struct panfrost_context *ctx = batch->ctx;
1127
1128 uniform->u[0] = ctx->compute_grid->grid[0];
1129 uniform->u[1] = ctx->compute_grid->grid[1];
1130 uniform->u[2] = ctx->compute_grid->grid[2];
1131 }
1132
1133 static void
1134 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1135 struct panfrost_shader_state *ss,
1136 enum pipe_shader_type st)
1137 {
1138 struct sysval_uniform *uniforms = (void *)buf;
1139
1140 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1141 int sysval = ss->sysval[i];
1142
1143 switch (PAN_SYSVAL_TYPE(sysval)) {
1144 case PAN_SYSVAL_VIEWPORT_SCALE:
1145 panfrost_upload_viewport_scale_sysval(batch,
1146 &uniforms[i]);
1147 break;
1148 case PAN_SYSVAL_VIEWPORT_OFFSET:
1149 panfrost_upload_viewport_offset_sysval(batch,
1150 &uniforms[i]);
1151 break;
1152 case PAN_SYSVAL_TEXTURE_SIZE:
1153 panfrost_upload_txs_sysval(batch, st,
1154 PAN_SYSVAL_ID(sysval),
1155 &uniforms[i]);
1156 break;
1157 case PAN_SYSVAL_SSBO:
1158 panfrost_upload_ssbo_sysval(batch, st,
1159 PAN_SYSVAL_ID(sysval),
1160 &uniforms[i]);
1161 break;
1162 case PAN_SYSVAL_NUM_WORK_GROUPS:
1163 panfrost_upload_num_work_groups_sysval(batch,
1164 &uniforms[i]);
1165 break;
1166 case PAN_SYSVAL_SAMPLER:
1167 panfrost_upload_sampler_sysval(batch, st,
1168 PAN_SYSVAL_ID(sysval),
1169 &uniforms[i]);
1170 break;
1171 default:
1172 assert(0);
1173 }
1174 }
1175 }
1176
1177 static const void *
1178 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1179 unsigned index)
1180 {
1181 struct pipe_constant_buffer *cb = &buf->cb[index];
1182 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1183
1184 if (rsrc)
1185 return rsrc->bo->cpu;
1186 else if (cb->user_buffer)
1187 return cb->user_buffer;
1188 else
1189 unreachable("No constant buffer");
1190 }
1191
1192 void
1193 panfrost_emit_const_buf(struct panfrost_batch *batch,
1194 enum pipe_shader_type stage,
1195 struct mali_vertex_tiler_postfix *postfix)
1196 {
1197 struct panfrost_context *ctx = batch->ctx;
1198 struct panfrost_shader_variants *all = ctx->shader[stage];
1199
1200 if (!all)
1201 return;
1202
1203 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1204
1205 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1206
1207 /* Uniforms are implicitly UBO #0 */
1208 bool has_uniforms = buf->enabled_mask & (1 << 0);
1209
1210 /* Allocate room for the sysval and the uniforms */
1211 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1212 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1213 size_t size = sys_size + uniform_size;
1214 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1215 size);
1216
1217 /* Upload sysvals requested by the shader */
1218 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1219
1220 /* Upload uniforms */
1221 if (has_uniforms && uniform_size) {
1222 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1223 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1224 }
1225
1226 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1227 * uploaded */
1228
1229 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1230 assert(ubo_count >= 1);
1231
1232 size_t sz = sizeof(uint64_t) * ubo_count;
1233 uint64_t ubos[PAN_MAX_CONST_BUFFERS];
1234 int uniform_count = ss->uniform_count;
1235
1236 /* Upload uniforms as a UBO */
1237 ubos[0] = MALI_MAKE_UBO(2 + uniform_count, transfer.gpu);
1238
1239 /* The rest are honest-to-goodness UBOs */
1240
1241 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1242 size_t usz = buf->cb[ubo].buffer_size;
1243 bool enabled = buf->enabled_mask & (1 << ubo);
1244 bool empty = usz == 0;
1245
1246 if (!enabled || empty) {
1247 /* Stub out disabled UBOs to catch accesses */
1248 ubos[ubo] = MALI_MAKE_UBO(0, 0xDEAD0000);
1249 continue;
1250 }
1251
1252 mali_ptr gpu = panfrost_map_constant_buffer_gpu(batch, stage,
1253 buf, ubo);
1254
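/* UBO sizes are encoded as a count of 16-byte fields, rounded up; e.g. a
 * 100-byte UBO is advertised as 7 fields (112 bytes). */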
1255 unsigned bytes_per_field = 16;
1256 unsigned aligned = ALIGN_POT(usz, bytes_per_field);
1257 ubos[ubo] = MALI_MAKE_UBO(aligned / bytes_per_field, gpu);
1258 }
1259
1260 mali_ptr ubufs = panfrost_upload_transient(batch, ubos, sz);
1261 postfix->uniforms = transfer.gpu;
1262 postfix->uniform_buffers = ubufs;
1263
1264 buf->dirty_mask = 0;
1265 }
1266
1267 void
1268 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1269 const struct pipe_grid_info *info,
1270 struct midgard_payload_vertex_tiler *vtp)
1271 {
1272 struct panfrost_context *ctx = batch->ctx;
1273 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1274 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1275 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1276 128));
1277 unsigned shared_size = single_size * info->grid[0] * info->grid[1] *
1278 info->grid[2] * 4;
1279 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1280 shared_size,
1281 1);
1282
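/* shared_workgroup_count below is the sum of ceil(log2) of each grid
 * dimension, i.e. log2 of the workgroup count with every dimension padded to
 * a power of two; e.g. a (4, 3, 1) grid gives 2 + 2 + 0 = 4. */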
1283 struct mali_shared_memory shared = {
1284 .shared_memory = bo->gpu,
1285 .shared_workgroup_count =
1286 util_logbase2_ceil(info->grid[0]) +
1287 util_logbase2_ceil(info->grid[1]) +
1288 util_logbase2_ceil(info->grid[2]),
1289 .shared_unk1 = 0x2,
1290 .shared_shift = util_logbase2(single_size) - 1
1291 };
1292
1293 vtp->postfix.shared_memory = panfrost_upload_transient(batch, &shared,
1294 sizeof(shared));
1295 }
1296
1297 static mali_ptr
1298 panfrost_get_tex_desc(struct panfrost_batch *batch,
1299 enum pipe_shader_type st,
1300 struct panfrost_sampler_view *view)
1301 {
1302 if (!view)
1303 return (mali_ptr) 0;
1304
1305 struct pipe_sampler_view *pview = &view->base;
1306 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1307
1308 /* Add the BO to the job so it's retained until the job is done. */
1309
1310 panfrost_batch_add_bo(batch, rsrc->bo,
1311 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1312 panfrost_bo_access_for_stage(st));
1313
1314 panfrost_batch_add_bo(batch, view->midgard_bo,
1315 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1316 panfrost_bo_access_for_stage(st));
1317
1318 return view->midgard_bo->gpu;
1319 }
1320
1321 void
1322 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1323 enum pipe_shader_type stage,
1324 struct mali_vertex_tiler_postfix *postfix)
1325 {
1326 struct panfrost_context *ctx = batch->ctx;
1327 struct panfrost_device *device = pan_device(ctx->base.screen);
1328
1329 if (!ctx->sampler_view_count[stage])
1330 return;
1331
1332 if (device->quirks & IS_BIFROST) {
1333 struct bifrost_texture_descriptor *descriptors;
1334
1335 descriptors = malloc(sizeof(struct bifrost_texture_descriptor) *
1336 ctx->sampler_view_count[stage]);
1337
1338 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1339 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1340 struct pipe_sampler_view *pview = &view->base;
1341 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1342
1343 /* Add the BOs to the job so they are retained until the job is done. */
1344
1345 panfrost_batch_add_bo(batch, rsrc->bo,
1346 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1347 panfrost_bo_access_for_stage(stage));
1348
1349 panfrost_batch_add_bo(batch, view->bifrost_bo,
1350 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1351 panfrost_bo_access_for_stage(stage));
1352
1353 memcpy(&descriptors[i], view->bifrost_descriptor, sizeof(*view->bifrost_descriptor));
1354 }
1355
1356 postfix->textures = panfrost_upload_transient(batch,
1357 descriptors,
1358 sizeof(struct bifrost_texture_descriptor) *
1359 ctx->sampler_view_count[stage]);
1360
1361 free(descriptors);
1362 } else {
1363 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1364
1365 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i)
1366 trampolines[i] = panfrost_get_tex_desc(batch, stage,
1367 ctx->sampler_views[stage][i]);
1368
1369 postfix->textures = panfrost_upload_transient(batch,
1370 trampolines,
1371 sizeof(uint64_t) *
1372 ctx->sampler_view_count[stage]);
1373 }
1374 }
1375
1376 void
1377 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1378 enum pipe_shader_type stage,
1379 struct mali_vertex_tiler_postfix *postfix)
1380 {
1381 struct panfrost_context *ctx = batch->ctx;
1382 struct panfrost_device *device = pan_device(ctx->base.screen);
1383
1384 if (!ctx->sampler_count[stage])
1385 return;
1386
1387 if (device->quirks & IS_BIFROST) {
1388 size_t desc_size = sizeof(struct bifrost_sampler_descriptor);
1389 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1390 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1391 transfer_size);
1392 struct bifrost_sampler_descriptor *desc = (struct bifrost_sampler_descriptor *)transfer.cpu;
1393
1394 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1395 desc[i] = ctx->samplers[stage][i]->bifrost_hw;
1396
1397 postfix->sampler_descriptor = transfer.gpu;
1398 } else {
1399 size_t desc_size = sizeof(struct mali_sampler_descriptor);
1400 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1401 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1402 transfer_size);
1403 struct mali_sampler_descriptor *desc = (struct mali_sampler_descriptor *)transfer.cpu;
1404
1405 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1406 desc[i] = ctx->samplers[stage][i]->midgard_hw;
1407
1408 postfix->sampler_descriptor = transfer.gpu;
1409 }
1410 }
1411
1412 void
1413 panfrost_emit_vertex_attr_meta(struct panfrost_batch *batch,
1414 struct mali_vertex_tiler_postfix *vertex_postfix)
1415 {
1416 struct panfrost_context *ctx = batch->ctx;
1417
1418 if (!ctx->vertex)
1419 return;
1420
1421 struct panfrost_vertex_state *so = ctx->vertex;
1422
1423 panfrost_vertex_state_upd_attr_offs(ctx, vertex_postfix);
1424 vertex_postfix->attribute_meta = panfrost_upload_transient(batch, so->hw,
1425 sizeof(*so->hw) *
1426 PAN_MAX_ATTRIBUTE);
1427 }
1428
1429 void
1430 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1431 struct mali_vertex_tiler_postfix *vertex_postfix)
1432 {
1433 struct panfrost_context *ctx = batch->ctx;
1434 struct panfrost_vertex_state *so = ctx->vertex;
1435
1436 /* Staged mali_attr, and index into them. i =/= k, depending on the
1437 * vertex buffer mask and instancing. Twice as much room is allocated,
1438 * for a worst case of NPOT_DIVIDEs which take up an extra slot */
1439 union mali_attr attrs[PIPE_MAX_ATTRIBS * 2];
1440 unsigned k = 0;
1441
1442 for (unsigned i = 0; i < so->num_elements; ++i) {
1443 /* We map a mali_attr to be 1:1 with the mali_attr_meta, which
1444 * means duplicating some vertex buffers (who cares? aside from
1445 * maybe some caching implications but I somehow doubt that
1446 * matters) */
1447
1448 struct pipe_vertex_element *elem = &so->pipe[i];
1449 unsigned vbi = elem->vertex_buffer_index;
1450
1451 /* The exception to 1:1 mapping is that we can have multiple
1452 * entries (NPOT divisors), so we fixup anyways */
1453
1454 so->hw[i].index = k;
1455
1456 if (!(ctx->vb_mask & (1 << vbi)))
1457 continue;
1458
1459 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1460 struct panfrost_resource *rsrc;
1461
1462 rsrc = pan_resource(buf->buffer.resource);
1463 if (!rsrc)
1464 continue;
1465
1466 /* Align to 64 bytes by masking off the lower bits. This
1467 * will be adjusted back when we fixup the src_offset in
1468 * mali_attr_meta */
1469
1470 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1471 mali_ptr addr = raw_addr & ~63;
1472 unsigned chopped_addr = raw_addr - addr;
1473
1474 /* Add a dependency of the batch on the vertex buffer */
1475 panfrost_batch_add_bo(batch, rsrc->bo,
1476 PAN_BO_ACCESS_SHARED |
1477 PAN_BO_ACCESS_READ |
1478 PAN_BO_ACCESS_VERTEX_TILER);
1479
1480 /* Set common fields */
1481 attrs[k].elements = addr;
1482 attrs[k].stride = buf->stride;
1483
1484 /* Since we advanced the base pointer, we shrink the buffer
1485 * size */
1486 attrs[k].size = rsrc->base.width0 - buf->buffer_offset;
1487
1488 /* We need to add the extra size we masked off (for
1489 * correctness) so the data doesn't get clamped away */
1490 attrs[k].size += chopped_addr;
1491
1492 /* For non-instancing make sure we initialize */
1493 attrs[k].shift = attrs[k].extra_flags = 0;
1494
1495 /* Instancing uses a dramatically different code path than
1496 * linear, so dispatch for the actual emission now that the
1497 * common code is finished */
1498
1499 unsigned divisor = elem->instance_divisor;
1500
1501 if (divisor && ctx->instance_count == 1) {
1502 /* Silly corner case where there's a divisor(=1) but
1503 * there's no legitimate instancing. So we want *every*
1504 * attribute to be the same. So set stride to zero so
1505 * we don't go anywhere. */
1506
1507 attrs[k].size = attrs[k].stride + chopped_addr;
1508 attrs[k].stride = 0;
1509 attrs[k++].elements |= MALI_ATTR_LINEAR;
1510 } else if (ctx->instance_count <= 1) {
1511 /* Normal, non-instanced attributes */
1512 attrs[k++].elements |= MALI_ATTR_LINEAR;
1513 } else {
1514 unsigned instance_shift = vertex_postfix->instance_shift;
1515 unsigned instance_odd = vertex_postfix->instance_odd;
1516
1517 k += panfrost_vertex_instanced(ctx->padded_count,
1518 instance_shift,
1519 instance_odd,
1520 divisor, &attrs[k]);
1521 }
1522 }
1523
1524 /* Add special gl_VertexID/gl_InstanceID buffers */
1525
1526 panfrost_vertex_id(ctx->padded_count, &attrs[k]);
1527 so->hw[PAN_VERTEX_ID].index = k++;
1528 panfrost_instance_id(ctx->padded_count, &attrs[k]);
1529 so->hw[PAN_INSTANCE_ID].index = k++;
1530
1531 /* Upload whatever we emitted and go */
1532
1533 vertex_postfix->attributes = panfrost_upload_transient(batch, attrs,
1534 k * sizeof(*attrs));
1535 }
1536
1537 static mali_ptr
1538 panfrost_emit_varyings(struct panfrost_batch *batch, union mali_attr *slot,
1539 unsigned stride, unsigned count)
1540 {
1541 /* Fill out the descriptor */
1542 slot->stride = stride;
1543 slot->size = stride * count;
1544 slot->shift = slot->extra_flags = 0;
1545
1546 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1547 slot->size);
1548
1549 slot->elements = transfer.gpu | MALI_ATTR_LINEAR;
1550
1551 return transfer.gpu;
1552 }
1553
1554 static void
1555 panfrost_emit_streamout(struct panfrost_batch *batch, union mali_attr *slot,
1556 unsigned stride, unsigned offset, unsigned count,
1557 struct pipe_stream_output_target *target)
1558 {
1559 /* Fill out the descriptor */
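/* Note: gallium stream-output strides are expressed in dwords, hence the
 * conversion to bytes here. */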
1560 slot->stride = stride * 4;
1561 slot->shift = slot->extra_flags = 0;
1562
1563 unsigned max_size = target->buffer_size;
1564 unsigned expected_size = slot->stride * count;
1565
1566 slot->size = MIN2(max_size, expected_size);
1567
1568 /* Grab the BO and bind it to the batch */
1569 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1570
1571 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1572 * the perspective of the TILER and FRAGMENT.
1573 */
1574 panfrost_batch_add_bo(batch, bo,
1575 PAN_BO_ACCESS_SHARED |
1576 PAN_BO_ACCESS_RW |
1577 PAN_BO_ACCESS_VERTEX_TILER |
1578 PAN_BO_ACCESS_FRAGMENT);
1579
1580 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * slot->stride);
1581 slot->elements = addr;
1582 }
1583
1584 /* Given a shader and buffer indices, link varying metadata together */
1585
1586 static bool
1587 is_special_varying(gl_varying_slot loc)
1588 {
1589 switch (loc) {
1590 case VARYING_SLOT_POS:
1591 case VARYING_SLOT_PSIZ:
1592 case VARYING_SLOT_PNTC:
1593 case VARYING_SLOT_FACE:
1594 return true;
1595 default:
1596 return false;
1597 }
1598 }
1599
1600 static void
1601 panfrost_emit_varying_meta(void *outptr, struct panfrost_shader_state *ss,
1602 signed general, signed gl_Position,
1603 signed gl_PointSize, signed gl_PointCoord,
1604 signed gl_FrontFacing)
1605 {
1606 struct mali_attr_meta *out = (struct mali_attr_meta *) outptr;
1607
1608 for (unsigned i = 0; i < ss->varying_count; ++i) {
1609 gl_varying_slot location = ss->varyings_loc[i];
1610 int index = -1;
1611
1612 switch (location) {
1613 case VARYING_SLOT_POS:
1614 index = gl_Position;
1615 break;
1616 case VARYING_SLOT_PSIZ:
1617 index = gl_PointSize;
1618 break;
1619 case VARYING_SLOT_PNTC:
1620 index = gl_PointCoord;
1621 break;
1622 case VARYING_SLOT_FACE:
1623 index = gl_FrontFacing;
1624 break;
1625 default:
1626 index = general;
1627 break;
1628 }
1629
1630 assert(index >= 0);
1631 out[i].index = index;
1632 }
1633 }
1634
1635 static bool
1636 has_point_coord(unsigned mask, gl_varying_slot loc)
1637 {
1638 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1639 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1640 else if (loc == VARYING_SLOT_PNTC)
1641 return (mask & (1 << 8));
1642 else
1643 return false;
1644 }
1645
1646 /* Helpers for manipulating stream out information so we can pack varyings
1647 * accordingly. Compute the src_offset for a given captured varying */
1648
1649 static struct pipe_stream_output *
1650 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1651 {
1652 for (unsigned i = 0; i < info->num_outputs; ++i) {
1653 if (info->output[i].register_index == loc)
1654 return &info->output[i];
1655 }
1656
1657 unreachable("Varying not captured");
1658 }
1659
1660 /* TODO: Integers */
1661 static enum mali_format
1662 pan_xfb_format(unsigned nr_components)
1663 {
1664 switch (nr_components) {
1665 case 1: return MALI_R32F;
1666 case 2: return MALI_RG32F;
1667 case 3: return MALI_RGB32F;
1668 case 4: return MALI_RGBA32F;
1669 default: unreachable("Invalid format");
1670 }
1671 }
1672
1673 void
1674 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1675 unsigned vertex_count,
1676 struct mali_vertex_tiler_postfix *vertex_postfix,
1677 struct mali_vertex_tiler_postfix *tiler_postfix,
1678 union midgard_primitive_size *primitive_size)
1679 {
1680 /* Load the shaders */
1681 struct panfrost_context *ctx = batch->ctx;
1682 struct panfrost_shader_state *vs, *fs;
1683 unsigned int num_gen_varyings = 0;
1684 size_t vs_size, fs_size;
1685
1686 /* Allocate the varying descriptor */
1687
1688 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1689 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1690 vs_size = sizeof(struct mali_attr_meta) * vs->varying_count;
1691 fs_size = sizeof(struct mali_attr_meta) * fs->varying_count;
1692
1693 struct panfrost_transfer trans = panfrost_allocate_transient(batch,
1694 vs_size +
1695 fs_size);
1696
1697 struct pipe_stream_output_info *so = &vs->stream_output;
1698
1699 /* Check if this varying is linked by us. This is the case for
1700 * general-purpose, non-captured varyings. If it is, link it. If it's
1701 * not, use the provided stream out information to determine the
1702 * offset, since it was already linked for us. */
1703
1704 for (unsigned i = 0; i < vs->varying_count; i++) {
1705 gl_varying_slot loc = vs->varyings_loc[i];
1706
1707 bool special = is_special_varying(loc);
1708 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1709
1710 if (captured) {
1711 struct pipe_stream_output *o = pan_get_so(so, loc);
1712
1713 unsigned dst_offset = o->dst_offset * 4; /* dwords */
1714 vs->varyings[i].src_offset = dst_offset;
1715 } else if (!special) {
1716 vs->varyings[i].src_offset = 16 * (num_gen_varyings++);
1717 }
1718 }
1719
1720 /* Conversely, we need to set src_offset for the captured varyings.
1721 * Here, the layout is defined by the stream out info, not us */
1722
1723 /* Link up with fragment varyings */
1724 bool reads_point_coord = fs->reads_point_coord;
1725
1726 for (unsigned i = 0; i < fs->varying_count; i++) {
1727 gl_varying_slot loc = fs->varyings_loc[i];
1728 unsigned src_offset;
1729 signed vs_idx = -1;
1730
1731 /* Link up */
1732 for (unsigned j = 0; j < vs->varying_count; ++j) {
1733 if (vs->varyings_loc[j] == loc) {
1734 vs_idx = j;
1735 break;
1736 }
1737 }
1738
1739 /* Either assign or reuse */
1740 if (vs_idx >= 0)
1741 src_offset = vs->varyings[vs_idx].src_offset;
1742 else
1743 src_offset = 16 * (num_gen_varyings++);
1744
1745 fs->varyings[i].src_offset = src_offset;
1746
1747 if (has_point_coord(fs->point_sprite_mask, loc))
1748 reads_point_coord = true;
1749 }
1750
1751 memcpy(trans.cpu, vs->varyings, vs_size);
1752 memcpy(trans.cpu + vs_size, fs->varyings, fs_size);
1753
1754 union mali_attr varyings[PIPE_MAX_ATTRIBS] = {0};
1755
1756 /* Figure out how many streamout buffers could be bound */
1757 unsigned so_count = ctx->streamout.num_targets;
1758 for (unsigned i = 0; i < vs->varying_count; i++) {
1759 gl_varying_slot loc = vs->varyings_loc[i];
1760
1761 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1762 if (!captured) continue;
1763
1764 struct pipe_stream_output *o = pan_get_so(so, loc);
1765 so_count = MAX2(so_count, o->output_buffer + 1);
1766 }
1767
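        /* Lay out the attribute buffer records: streamout buffers occupy
         * indices [0, so_count), followed by the general varying buffer and
         * then one record per special varying that is actually used. idx ends
         * up as the total record count uploaded at the end. */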
1768 signed idx = so_count;
1769 signed general = idx++;
1770 signed gl_Position = idx++;
1771 signed gl_PointSize = vs->writes_point_size ? (idx++) : -1;
1772 signed gl_PointCoord = reads_point_coord ? (idx++) : -1;
1773 signed gl_FrontFacing = fs->reads_face ? (idx++) : -1;
1774 signed gl_FragCoord = fs->reads_frag_coord ? (idx++) : -1;
1775
1776 /* Emit the stream out buffers */
1777
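        /* out_count is the number of vertices that will actually land in the
         * streamout buffers for this draw, which can exceed ctx->vertex_count
         * once strips/fans are decomposed into individual primitives. */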
1778 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1779 ctx->vertex_count);
1780
1781 for (unsigned i = 0; i < so_count; ++i) {
1782 if (i < ctx->streamout.num_targets) {
1783 panfrost_emit_streamout(batch, &varyings[i],
1784 so->stride[i],
1785 ctx->streamout.offsets[i],
1786 out_count,
1787 ctx->streamout.targets[i]);
1788 } else {
1789 /* Emit a dummy buffer */
1790 panfrost_emit_varyings(batch, &varyings[i],
1791 so->stride[i] * 4,
1792 out_count);
1793
1794 /* Clear the attribute type */
1795 varyings[i].elements &= ~0xF;
1796 }
1797 }
1798
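        /* The general buffer packs every linked varying contiguously, 16 bytes
         * per varying, for each vertex processed. */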
1799 panfrost_emit_varyings(batch, &varyings[general],
1800 num_gen_varyings * 16,
1801 vertex_count);
1802
1803 mali_ptr varyings_p;
1804
1805 /* fp32 vec4 gl_Position */
1806 varyings_p = panfrost_emit_varyings(batch, &varyings[gl_Position],
1807 sizeof(float) * 4, vertex_count);
1808 tiler_postfix->position_varying = varyings_p;
1809
1810
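        /* Point size is emitted at 2 bytes per vertex, i.e. a single 16-bit
         * value per vertex. */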
1811 if (panfrost_writes_point_size(ctx)) {
1812 varyings_p = panfrost_emit_varyings(batch,
1813 &varyings[gl_PointSize],
1814 2, vertex_count);
1815 primitive_size->pointer = varyings_p;
1816 }
1817
1818 if (reads_point_coord)
1819 varyings[gl_PointCoord].elements = MALI_VARYING_POINT_COORD;
1820
1821 if (fs->reads_face)
1822 varyings[gl_FrontFacing].elements = MALI_VARYING_FRONT_FACING;
1823
1824 if (fs->reads_frag_coord)
1825 varyings[gl_FragCoord].elements = MALI_VARYING_FRAG_COORD;
1826
1827 struct panfrost_device *device = pan_device(ctx->base.screen);
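        /* Point coordinate reads are not expected on Bifrost in this path, so
         * assert that a Bifrost device never asks for them here. */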
1828 assert(!(device->quirks & IS_BIFROST) || !(reads_point_coord));
1829
1830         /* Now that this information is available, link the varying metadata
1831          * to the buffer in question. Note that VARYING_SLOT_POS is mapped to
1832          * gl_FragCoord for fragment shaders but to gl_Position for vertex
1833          * shaders. */
1834
1835 panfrost_emit_varying_meta(trans.cpu, vs, general, gl_Position,
1836 gl_PointSize, gl_PointCoord,
1837 gl_FrontFacing);
1838
1839 panfrost_emit_varying_meta(trans.cpu + vs_size, fs, general,
1840 gl_FragCoord, gl_PointSize,
1841 gl_PointCoord, gl_FrontFacing);
1842
1843 /* Replace streamout */
1844         /* Patch captured (streamout) varyings to reference their streamout buffers */
1845 struct mali_attr_meta *ovs = (struct mali_attr_meta *)trans.cpu;
1846 struct mali_attr_meta *ofs = ovs + vs->varying_count;
1847
1848 for (unsigned i = 0; i < vs->varying_count; i++) {
1849 gl_varying_slot loc = vs->varyings_loc[i];
1850
1851 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1852 if (!captured)
1853 continue;
1854
1855 struct pipe_stream_output *o = pan_get_so(so, loc);
1856 ovs[i].index = o->output_buffer;
1857
1858 /* Set the type appropriately. TODO: Integer varyings XXX */
1859 assert(o->stream == 0);
1860 ovs[i].format = pan_xfb_format(o->num_components);
1861
1862 if (device->quirks & HAS_SWIZZLES)
1863 ovs[i].swizzle = panfrost_get_default_swizzle(o->num_components);
1864 else
1865 ovs[i].swizzle = panfrost_bifrost_swizzle(o->num_components);
1866
1867 /* Link to the fragment */
1868 signed fs_idx = -1;
1869
1870 /* Link up */
1871 for (unsigned j = 0; j < fs->varying_count; ++j) {
1872 if (fs->varyings_loc[j] == loc) {
1873 fs_idx = j;
1874 break;
1875 }
1876 }
1877
1878 if (fs_idx >= 0) {
1879 ofs[fs_idx].index = ovs[i].index;
1880 ofs[fs_idx].format = ovs[i].format;
1881 ofs[fs_idx].swizzle = ovs[i].swizzle;
1882 }
1883 }
1884
1885 /* Replace point sprite */
1886 for (unsigned i = 0; i < fs->varying_count; i++) {
1887                 /* If we have a point sprite replacement, handle that here; the
1888                  * location has to be translated first. TODO: flip Y in the shader.
1889                  * We already key the shader for it; it is just not done yet. */
1890
1891 if (has_point_coord(fs->point_sprite_mask,
1892 fs->varyings_loc[i])) {
1893 ofs[i].index = gl_PointCoord;
1894
1895 /* Swizzle out the z/w to 0/1 */
1896 ofs[i].format = MALI_RG16F;
1897 ofs[i].swizzle = panfrost_get_default_swizzle(2);
1898 }
1899 }
1900
1901 /* Fix up unaligned addresses */
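        /* Streamout buffer addresses may include user-supplied byte offsets,
         * so they are not necessarily 64-byte aligned. Strip the low bits from
         * the record, mark it linear, grow the recorded size to compensate and
         * fold the slack into each referencing varying's source offset. */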
1902 for (unsigned i = 0; i < so_count; ++i) {
1903 if (varyings[i].elements < MALI_RECORD_SPECIAL)
1904 continue;
1905
1906 unsigned align = (varyings[i].elements & 63);
1907
1908 /* While we're at it, the SO buffers are linear */
1909
1910 if (!align) {
1911 varyings[i].elements |= MALI_ATTR_LINEAR;
1912 continue;
1913 }
1914
1915 /* We need to adjust alignment */
1916 varyings[i].elements &= ~63;
1917 varyings[i].elements |= MALI_ATTR_LINEAR;
1918 varyings[i].size += align;
1919
1920 for (unsigned v = 0; v < vs->varying_count; ++v) {
1921 if (ovs[v].index != i)
1922 continue;
1923
1924 ovs[v].src_offset = vs->varyings[v].src_offset + align;
1925 }
1926
1927 for (unsigned f = 0; f < fs->varying_count; ++f) {
1928 if (ofs[f].index != i)
1929 continue;
1930
1931 ofs[f].src_offset = fs->varyings[f].src_offset + align;
1932 }
1933 }
1934
1935 varyings_p = panfrost_upload_transient(batch, varyings,
1936 idx * sizeof(*varyings));
1937 vertex_postfix->varyings = varyings_p;
1938 tiler_postfix->varyings = varyings_p;
1939
1940 vertex_postfix->varying_meta = trans.gpu;
1941 tiler_postfix->varying_meta = trans.gpu + vs_size;
1942 }
1943
1944 void
1945 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
1946 struct mali_vertex_tiler_prefix *vertex_prefix,
1947 struct mali_vertex_tiler_postfix *vertex_postfix,
1948 struct mali_vertex_tiler_prefix *tiler_prefix,
1949 struct mali_vertex_tiler_postfix *tiler_postfix,
1950 union midgard_primitive_size *primitive_size)
1951 {
1952 struct panfrost_context *ctx = batch->ctx;
1953 struct panfrost_device *device = pan_device(ctx->base.screen);
1954 bool wallpapering = ctx->wallpaper_batch && batch->tiler_dep;
1955 struct bifrost_payload_vertex bifrost_vertex = {0,};
1956 struct bifrost_payload_tiler bifrost_tiler = {0,};
1957 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
1958 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
1959 void *vp, *tp;
1960 size_t vp_size, tp_size;
1961
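        /* Bifrost and Midgard use different payload layouts; fill in whichever
         * applies and submit it through the common job interface below. */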
1962 if (device->quirks & IS_BIFROST) {
1963 bifrost_vertex.prefix = *vertex_prefix;
1964 bifrost_vertex.postfix = *vertex_postfix;
1965 vp = &bifrost_vertex;
1966 vp_size = sizeof(bifrost_vertex);
1967
1968 bifrost_tiler.prefix = *tiler_prefix;
1969 bifrost_tiler.tiler.primitive_size = *primitive_size;
1970 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
1971 bifrost_tiler.postfix = *tiler_postfix;
1972 tp = &bifrost_tiler;
1973 tp_size = sizeof(bifrost_tiler);
1974 } else {
1975 midgard_vertex.prefix = *vertex_prefix;
1976 midgard_vertex.postfix = *vertex_postfix;
1977 vp = &midgard_vertex;
1978 vp_size = sizeof(midgard_vertex);
1979
1980 midgard_tiler.prefix = *tiler_prefix;
1981 midgard_tiler.postfix = *tiler_postfix;
1982 midgard_tiler.primitive_size = *primitive_size;
1983 tp = &midgard_tiler;
1984 tp_size = sizeof(midgard_tiler);
1985 }
1986
1987 if (wallpapering) {
1988 /* Inject in reverse order, with "predicted" job indices.
1989 * THIS IS A HACK XXX */
1990 panfrost_new_job(batch, JOB_TYPE_TILER, false,
1991 batch->job_index + 2, tp, tp_size, true);
1992 panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
1993 vp, vp_size, true);
1994 return;
1995 }
1996
1997         /* If rasterizer discard is enabled, only submit the vertex job */
1998
1999 bool rasterizer_discard = ctx->rasterizer &&
2000 ctx->rasterizer->base.rasterizer_discard;
2001
2002 unsigned vertex = panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
2003 vp, vp_size, false);
2004
2005 if (rasterizer_discard)
2006 return;
2007
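        /* Otherwise chain the tiler job after the vertex job, passing the
         * vertex job's index as a dependency so the vertex outputs are ready
         * before tiling consumes them. */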
2008 panfrost_new_job(batch, JOB_TYPE_TILER, false, vertex, tp, tp_size,
2009 false);
2010 }
2011
2012 /* TODO: stop hardcoding this */
2013 mali_ptr
2014 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2015 {
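        /* 96 half-words, i.e. 48 (x, y) sample position pairs, currently
         * hardcoded (see the TODO above); the exact layout expected by the
         * hardware is not spelled out here. */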
2016 uint16_t locations[] = {
2017 128, 128,
2018 0, 256,
2019 0, 256,
2020 0, 256,
2021 0, 256,
2022 0, 256,
2023 0, 256,
2024 0, 256,
2025 0, 256,
2026 0, 256,
2027 0, 256,
2028 0, 256,
2029 0, 256,
2030 0, 256,
2031 0, 256,
2032 0, 256,
2033 0, 256,
2034 0, 256,
2035 0, 256,
2036 0, 256,
2037 0, 256,
2038 0, 256,
2039 0, 256,
2040 0, 256,
2041 0, 256,
2042 0, 256,
2043 0, 256,
2044 0, 256,
2045 0, 256,
2046 0, 256,
2047 0, 256,
2048 0, 256,
2049 128, 128,
2050 0, 0,
2051 0, 0,
2052 0, 0,
2053 0, 0,
2054 0, 0,
2055 0, 0,
2056 0, 0,
2057 0, 0,
2058 0, 0,
2059 0, 0,
2060 0, 0,
2061 0, 0,
2062 0, 0,
2063 0, 0,
2064 0, 0,
2065 };
2066
2067 return panfrost_upload_transient(batch, locations, 96 * sizeof(uint16_t));
2068 }