src/gallium/drivers/panfrost/pan_cmdstream.c

   1 /*
   2  * Copyright (C) 2018 Alyssa Rosenzweig
   3  * Copyright (C) 2020 Collabora Ltd.
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   9  * and/or sell copies of the Software, and to permit persons to whom the
  10  * Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice (including the next
  13  * paragraph) shall be included in all copies or substantial portions of the
  14  * Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22  * SOFTWARE.
  23  */
  24
  25 #include "util/macros.h"
  26 #include "util/u_prim.h"
  27 #include "util/u_vbuf.h"
  28
  29 #include "panfrost-quirks.h"
  30
  31 #include "pan_allocate.h"
  32 #include "pan_bo.h"
  33 #include "pan_cmdstream.h"
  34 #include "pan_context.h"
  35 #include "pan_job.h"
  36
  37 /* If a BO is accessed for a particular shader stage, will it be in the primary
  38  * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
  39  * fragment will be primary, e.g. compute jobs will be considered
  40  * "vertex/tiler" by analogy */
  41
  42 static inline uint32_t
  43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
  44 {
  45         assert(stage == PIPE_SHADER_FRAGMENT ||
  46                stage == PIPE_SHADER_VERTEX ||
  47                stage == PIPE_SHADER_COMPUTE);
  48
  49         return stage == PIPE_SHADER_FRAGMENT ?
  50                PAN_BO_ACCESS_FRAGMENT :
  51                PAN_BO_ACCESS_VERTEX_TILER;
  52 }
  53
  54 static void
  55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
  56                                struct mali_vertex_tiler_postfix *postfix)
  57 {
  58         struct panfrost_device *dev = pan_device(ctx->base.screen);
  59         struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
  60
  61         unsigned shift = panfrost_get_stack_shift(batch->stack_size);
  62         struct mali_shared_memory shared = {
  63                 .stack_shift = shift,
  64                 .scratchpad = panfrost_batch_get_scratchpad(batch, shift, dev->thread_tls_alloc, dev->core_count)->gpu,
  65                 .shared_workgroup_count = ~0,
  66         };
  67         postfix->shared_memory = panfrost_upload_transient(batch, &shared, sizeof(shared));
  68 }
  69
  70 static void
  71 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
  72                                struct mali_vertex_tiler_postfix *postfix)
  73 {
  74         struct panfrost_device *dev = pan_device(ctx->base.screen);
  75         struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
  76
  77         /* If we haven't, reserve space for the framebuffer */
  78
  79         if (!batch->framebuffer.gpu) {
  80                 unsigned size = (dev->quirks & MIDGARD_SFBD) ?
  81                         sizeof(struct mali_single_framebuffer) :
  82                         sizeof(struct mali_framebuffer);
  83
  84                 batch->framebuffer = panfrost_allocate_transient(batch, size);
  85
  86                 /* Tag the pointer */
  87                 if (!(dev->quirks & MIDGARD_SFBD))
  88                         batch->framebuffer.gpu |= MALI_MFBD;
  89         }
  90
  91         postfix->shared_memory = batch->framebuffer.gpu;
  92 }
  93
  94 static void
  95 panfrost_vt_update_rasterizer(struct panfrost_context *ctx,
  96                               struct mali_vertex_tiler_prefix *prefix,
  97                               struct mali_vertex_tiler_postfix *postfix)
  98 {
  99         struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
 100
 101         postfix->gl_enables |= 0x7;
 102         SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
 103                 rasterizer && rasterizer->base.front_ccw);
 104         SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
 105                 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_FRONT));
 106         SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
 107                 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_BACK));
 108         SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
 109                 rasterizer && rasterizer->base.flatshade_first);
 110 }
 111
 112 void
 113 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
 114                                   struct mali_vertex_tiler_prefix *prefix,
 115                                   union midgard_primitive_size *primitive_size)
 116 {
 117         struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
 118
 119         if (!panfrost_writes_point_size(ctx)) {
 120                 bool points = prefix->draw_mode == MALI_POINTS;
 121                 float val = 0.0f;
 122
 123                 if (rasterizer)
 124                         val = points ?
 125                               rasterizer->base.point_size :
 126                               rasterizer->base.line_width;
 127
 128                 primitive_size->constant = val;
 129         }
 130 }
 131
 132 static void
 133 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
 134                                    struct mali_vertex_tiler_postfix *postfix)
 135 {
 136         SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
 137         if (ctx->occlusion_query)
 138                 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
 139         else
 140                 postfix->occlusion_counter = 0;
 141 }
 142
 143 void
 144 panfrost_vt_init(struct panfrost_context *ctx,
 145                  enum pipe_shader_type stage,
 146                  struct mali_vertex_tiler_prefix *prefix,
 147                  struct mali_vertex_tiler_postfix *postfix)
 148 {
 149         struct panfrost_device *device = pan_device(ctx->base.screen);
 150
 151         if (!ctx->shader[stage])
 152                 return;
 153
 154         memset(prefix, 0, sizeof(*prefix));
 155         memset(postfix, 0, sizeof(*postfix));
 156
 157         if (device->quirks & IS_BIFROST) {
 158                 postfix->gl_enables = 0x2;
 159                 panfrost_vt_emit_shared_memory(ctx, postfix);
 160         } else {
 161                 postfix->gl_enables = 0x6;
 162                 panfrost_vt_attach_framebuffer(ctx, postfix);
 163         }
 164
 165         if (stage == PIPE_SHADER_FRAGMENT) {
 166                 panfrost_vt_update_occlusion_query(ctx, postfix);
 167                 panfrost_vt_update_rasterizer(ctx, prefix, postfix);
 168         }
 169 }
 170
 171 static unsigned
 172 panfrost_translate_index_size(unsigned size)
 173 {
 174         switch (size) {
 175         case 1:
 176                 return MALI_DRAW_INDEXED_UINT8;
 177
 178         case 2:
 179                 return MALI_DRAW_INDEXED_UINT16;
 180
 181         case 4:
 182                 return MALI_DRAW_INDEXED_UINT32;
 183
 184         default:
 185                 unreachable("Invalid index size");
 186         }
 187 }
 188
 189 /* Gets a GPU address for the associated index buffer. Only gauranteed to be
 190  * good for the duration of the draw (transient), could last longer. Also get
 191  * the bounds on the index buffer for the range accessed by the draw. We do
 192  * these operations together because there are natural optimizations which
 193  * require them to be together. */
 194
 195 static mali_ptr
 196 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
 197                                   const struct pipe_draw_info *info,
 198                                   unsigned *min_index, unsigned *max_index)
 199 {
 200         struct panfrost_resource *rsrc = pan_resource(info->index.resource);
 201         struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
 202         off_t offset = info->start * info->index_size;
 203         bool needs_indices = true;
 204         mali_ptr out = 0;
 205
 206         if (info->max_index != ~0u) {
 207                 *min_index = info->min_index;
 208                 *max_index = info->max_index;
 209                 needs_indices = false;
 210         }
 211
 212         if (!info->has_user_indices) {
 213                 /* Only resources can be directly mapped */
 214                 panfrost_batch_add_bo(batch, rsrc->bo,
 215                                       PAN_BO_ACCESS_SHARED |
 216                                       PAN_BO_ACCESS_READ |
 217                                       PAN_BO_ACCESS_VERTEX_TILER);
 218                 out = rsrc->bo->gpu + offset;
 219
 220                 /* Check the cache */
 221                 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
 222                                                            info->start,
 223                                                            info->count,
 224                                                            min_index,
 225                                                            max_index);
 226         } else {
 227                 /* Otherwise, we need to upload to transient memory */
 228                 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
 229                 out = panfrost_upload_transient(batch, ibuf8 + offset,
 230                                                 info->count *
 231                                                 info->index_size);
 232         }
 233
 234         if (needs_indices) {
 235                 /* Fallback */
 236                 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
 237
 238                 if (!info->has_user_indices)
 239                         panfrost_minmax_cache_add(rsrc->index_cache,
 240                                                   info->start, info->count,
 241                                                   *min_index, *max_index);
 242         }
 243
 244         return out;
 245 }
 246
 247 void
 248 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
 249                           const struct pipe_draw_info *info,
 250                           enum mali_draw_mode draw_mode,
 251                           struct mali_vertex_tiler_postfix *vertex_postfix,
 252                           struct mali_vertex_tiler_prefix *tiler_prefix,
 253                           struct mali_vertex_tiler_postfix *tiler_postfix,
 254                           unsigned *vertex_count,
 255                           unsigned *padded_count)
 256 {
 257         tiler_prefix->draw_mode = draw_mode;
 258
 259         unsigned draw_flags = 0;
 260
 261         if (panfrost_writes_point_size(ctx))
 262                 draw_flags |= MALI_DRAW_VARYING_SIZE;
 263
 264         if (info->primitive_restart)
 265                 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
 266
 267         /* These doesn't make much sense */
 268
 269         draw_flags |= 0x3000;
 270
 271         if (info->index_size) {
 272                 unsigned min_index = 0, max_index = 0;
 273
 274                 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
 275                                                                        info,
 276                                                                        &min_index,
 277                                                                        &max_index);
 278
 279                 /* Use the corresponding values */
 280                 *vertex_count = max_index - min_index + 1;
 281                 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
 282                 tiler_prefix->offset_bias_correction = -min_index;
 283                 tiler_prefix->index_count = MALI_POSITIVE(info->count);
 284                 draw_flags |= panfrost_translate_index_size(info->index_size);
 285         } else {
 286                 tiler_prefix->indices = 0;
 287                 *vertex_count = ctx->vertex_count;
 288                 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
 289                 tiler_prefix->offset_bias_correction = 0;
 290                 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
 291         }
 292
 293         tiler_prefix->unknown_draw = draw_flags;
 294
 295         /* Encode the padded vertex count */
 296
 297         if (info->instance_count > 1) {
 298                 *padded_count = panfrost_padded_vertex_count(*vertex_count);
 299
 300                 unsigned shift = __builtin_ctz(ctx->padded_count);
 301                 unsigned k = ctx->padded_count >> (shift + 1);
 302
 303                 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
 304                 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
 305         } else {
 306                 *padded_count = *vertex_count;
 307
 308                 /* Reset instancing state */
 309                 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
 310                 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
 311         }
 312 }
 313
 314 static void
 315 panfrost_shader_meta_init(struct panfrost_context *ctx,
 316                           enum pipe_shader_type st,
 317                           struct mali_shader_meta *meta)
 318 {
 319         const struct panfrost_device *dev = pan_device(ctx->base.screen);
 320         struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
 321
 322         memset(meta, 0, sizeof(*meta));
 323         meta->shader = (ss->bo ? ss->bo->gpu : 0) | ss->first_tag;
 324         meta->attribute_count = ss->attribute_count;
 325         meta->varying_count = ss->varying_count;
 326         meta->texture_count = ctx->sampler_view_count[st];
 327         meta->sampler_count = ctx->sampler_count[st];
 328
 329         if (dev->quirks & IS_BIFROST) {
 330                 if (st == PIPE_SHADER_VERTEX)
 331                         meta->bifrost1.unk1 = 0x800000;
 332                 else {
 333                         /* First clause ATEST |= 0x4000000.
 334                          * Less than 32 regs |= 0x200 */
 335                         meta->bifrost1.unk1 = 0x958020;
 336                 }
 337
 338                 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
 339                 if (st == PIPE_SHADER_VERTEX)
 340                         meta->bifrost2.preload_regs = 0xC0;
 341                 else
 342                         meta->bifrost2.preload_regs = 0x1;
 343                 meta->bifrost2.uniform_count = MIN2(ss->uniform_count,
 344                                                     ss->uniform_cutoff);
 345         } else {
 346                 meta->midgard1.uniform_count = MIN2(ss->uniform_count,
 347                                                     ss->uniform_cutoff);
 348                 meta->midgard1.work_count = ss->work_reg_count;
 349                 meta->midgard1.flags_hi = 0x8; /* XXX */
 350                 meta->midgard1.flags_lo = 0x220;
 351                 meta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
 352         }
 353 }
 354
 355 static unsigned
 356 panfrost_translate_compare_func(enum pipe_compare_func in)
 357 {
 358         switch (in) {
 359         case PIPE_FUNC_NEVER:
 360                 return MALI_FUNC_NEVER;
 361
 362         case PIPE_FUNC_LESS:
 363                 return MALI_FUNC_LESS;
 364
 365         case PIPE_FUNC_EQUAL:
 366                 return MALI_FUNC_EQUAL;
 367
 368         case PIPE_FUNC_LEQUAL:
 369                 return MALI_FUNC_LEQUAL;
 370
 371         case PIPE_FUNC_GREATER:
 372                 return MALI_FUNC_GREATER;
 373
 374         case PIPE_FUNC_NOTEQUAL:
 375                 return MALI_FUNC_NOTEQUAL;
 376
 377         case PIPE_FUNC_GEQUAL:
 378                 return MALI_FUNC_GEQUAL;
 379
 380         case PIPE_FUNC_ALWAYS:
 381                 return MALI_FUNC_ALWAYS;
 382
 383         default:
 384                 unreachable("Invalid func");
 385         }
 386 }
 387
 388 static unsigned
 389 panfrost_translate_stencil_op(enum pipe_stencil_op in)
 390 {
 391         switch (in) {
 392         case PIPE_STENCIL_OP_KEEP:
 393                 return MALI_STENCIL_KEEP;
 394
 395         case PIPE_STENCIL_OP_ZERO:
 396                 return MALI_STENCIL_ZERO;
 397
 398         case PIPE_STENCIL_OP_REPLACE:
 399                return MALI_STENCIL_REPLACE;
 400
 401         case PIPE_STENCIL_OP_INCR:
 402                 return MALI_STENCIL_INCR;
 403
 404         case PIPE_STENCIL_OP_DECR:
 405                 return MALI_STENCIL_DECR;
 406
 407         case PIPE_STENCIL_OP_INCR_WRAP:
 408                 return MALI_STENCIL_INCR_WRAP;
 409
 410         case PIPE_STENCIL_OP_DECR_WRAP:
 411                 return MALI_STENCIL_DECR_WRAP;
 412
 413         case PIPE_STENCIL_OP_INVERT:
 414                 return MALI_STENCIL_INVERT;
 415
 416         default:
 417                 unreachable("Invalid stencil op");
 418         }
 419 }
 420
 421 static unsigned
 422 translate_tex_wrap(enum pipe_tex_wrap w)
 423 {
 424         switch (w) {
 425         case PIPE_TEX_WRAP_REPEAT:
 426                 return MALI_WRAP_REPEAT;
 427
 428         case PIPE_TEX_WRAP_CLAMP:
 429                 return MALI_WRAP_CLAMP;
 430
 431         case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
 432                 return MALI_WRAP_CLAMP_TO_EDGE;
 433
 434         case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
 435                 return MALI_WRAP_CLAMP_TO_BORDER;
 436
 437         case PIPE_TEX_WRAP_MIRROR_REPEAT:
 438                 return MALI_WRAP_MIRRORED_REPEAT;
 439
 440         case PIPE_TEX_WRAP_MIRROR_CLAMP:
 441                 return MALI_WRAP_MIRRORED_CLAMP;
 442
 443         case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
 444                 return MALI_WRAP_MIRRORED_CLAMP_TO_EDGE;
 445
 446         case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
 447                 return MALI_WRAP_MIRRORED_CLAMP_TO_BORDER;
 448
 449         default:
 450                 unreachable("Invalid wrap");
 451         }
 452 }
 453
 454 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
 455                                 struct mali_sampler_descriptor *hw)
 456 {
 457         unsigned func = panfrost_translate_compare_func(cso->compare_func);
 458         bool min_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
 459         bool mag_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
 460         bool mip_linear  = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR;
 461         unsigned min_filter = min_nearest ? MALI_SAMP_MIN_NEAREST : 0;
 462         unsigned mag_filter = mag_nearest ? MALI_SAMP_MAG_NEAREST : 0;
 463         unsigned mip_filter = mip_linear  ?
 464                               (MALI_SAMP_MIP_LINEAR_1 | MALI_SAMP_MIP_LINEAR_2) : 0;
 465         unsigned normalized = cso->normalized_coords ? MALI_SAMP_NORM_COORDS : 0;
 466
 467         *hw = (struct mali_sampler_descriptor) {
 468                 .filter_mode = min_filter | mag_filter | mip_filter |
 469                                normalized,
 470                 .wrap_s = translate_tex_wrap(cso->wrap_s),
 471                 .wrap_t = translate_tex_wrap(cso->wrap_t),
 472                 .wrap_r = translate_tex_wrap(cso->wrap_r),
 473                 .compare_func = panfrost_flip_compare_func(func),
 474                 .border_color = {
 475                         cso->border_color.f[0],
 476                         cso->border_color.f[1],
 477                         cso->border_color.f[2],
 478                         cso->border_color.f[3]
 479                 },
 480                 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
 481                 .max_lod = FIXED_16(cso->max_lod, false),
 482                 .lod_bias = FIXED_16(cso->lod_bias, true), /* can be negative */
 483                 .seamless_cube_map = cso->seamless_cube_map,
 484         };
 485
 486         /* If necessary, we disable mipmapping in the sampler descriptor by
 487          * clamping the LOD as tight as possible (from 0 to epsilon,
 488          * essentially -- remember these are fixed point numbers, so
 489          * epsilon=1/256) */
 490
 491         if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
 492                 hw->max_lod = hw->min_lod + 1;
 493 }
 494
 495 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
 496                                         struct bifrost_sampler_descriptor *hw)
 497 {
 498         *hw = (struct bifrost_sampler_descriptor) {
 499                 .unk1 = 0x1,
 500                 .wrap_s = translate_tex_wrap(cso->wrap_s),
 501                 .wrap_t = translate_tex_wrap(cso->wrap_t),
 502                 .wrap_r = translate_tex_wrap(cso->wrap_r),
 503                 .unk8 = 0x8,
 504                 .min_filter = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST,
 505                 .norm_coords = cso->normalized_coords,
 506                 .mip_filter = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR,
 507                 .mag_filter = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR,
 508                 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
 509                 .max_lod = FIXED_16(cso->max_lod, false),
 510         };
 511
 512         /* If necessary, we disable mipmapping in the sampler descriptor by
 513          * clamping the LOD as tight as possible (from 0 to epsilon,
 514          * essentially -- remember these are fixed point numbers, so
 515          * epsilon=1/256) */
 516
 517         if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
 518                 hw->max_lod = hw->min_lod + 1;
 519 }
 520
 521 static void
 522 panfrost_make_stencil_state(const struct pipe_stencil_state *in,
 523                             struct mali_stencil_test *out)
 524 {
 525         out->ref = 0; /* Gallium gets it from elsewhere */
 526
 527         out->mask = in->valuemask;
 528         out->func = panfrost_translate_compare_func(in->func);
 529         out->sfail = panfrost_translate_stencil_op(in->fail_op);
 530         out->dpfail = panfrost_translate_stencil_op(in->zfail_op);
 531         out->dppass = panfrost_translate_stencil_op(in->zpass_op);
 532 }
 533
 534 static void
 535 panfrost_frag_meta_rasterizer_update(struct panfrost_context *ctx,
 536                                      struct mali_shader_meta *fragmeta)
 537 {
 538         if (!ctx->rasterizer) {
 539                 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, true);
 540                 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, false);
 541                 fragmeta->depth_units = 0.0f;
 542                 fragmeta->depth_factor = 0.0f;
 543                 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, false);
 544                 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, false);
 545                 return;
 546         }
 547
 548         bool msaa = ctx->rasterizer->base.multisample;
 549
 550         /* TODO: Sample size */
 551         SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
 552         SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
 553         fragmeta->depth_units = ctx->rasterizer->base.offset_units * 2.0f;
 554         fragmeta->depth_factor = ctx->rasterizer->base.offset_scale;
 555
 556         /* XXX: Which bit is which? Does this maybe allow offseting not-tri? */
 557
 558         SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A,
 559                 ctx->rasterizer->base.offset_tri);
 560         SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B,
 561                 ctx->rasterizer->base.offset_tri);
 562 }
 563
 564 static void
 565 panfrost_frag_meta_zsa_update(struct panfrost_context *ctx,
 566                               struct mali_shader_meta *fragmeta)
 567 {
 568         const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
 569         int zfunc = PIPE_FUNC_ALWAYS;
 570
 571         if (!zsa) {
 572                 struct pipe_stencil_state default_stencil = {
 573                         .enabled = 0,
 574                         .func = PIPE_FUNC_ALWAYS,
 575                         .fail_op = MALI_STENCIL_KEEP,
 576                         .zfail_op = MALI_STENCIL_KEEP,
 577                         .zpass_op = MALI_STENCIL_KEEP,
 578                         .writemask = 0xFF,
 579                         .valuemask = 0xFF
 580                 };
 581
 582                 panfrost_make_stencil_state(&default_stencil,
 583                                             &fragmeta->stencil_front);
 584                 fragmeta->stencil_mask_front = default_stencil.writemask;
 585                 fragmeta->stencil_back = fragmeta->stencil_front;
 586                 fragmeta->stencil_mask_back = default_stencil.writemask;
 587                 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST, false);
 588                 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK, false);
 589         } else {
 590                 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
 591                         zsa->stencil[0].enabled);
 592                 panfrost_make_stencil_state(&zsa->stencil[0],
 593                                             &fragmeta->stencil_front);
 594                 fragmeta->stencil_mask_front = zsa->stencil[0].writemask;
 595                 fragmeta->stencil_front.ref = ctx->stencil_ref.ref_value[0];
 596
 597                 /* If back-stencil is not enabled, use the front values */
 598
 599                 if (zsa->stencil[1].enabled) {
 600                         panfrost_make_stencil_state(&zsa->stencil[1],
 601                                                     &fragmeta->stencil_back);
 602                         fragmeta->stencil_mask_back = zsa->stencil[1].writemask;
 603                         fragmeta->stencil_back.ref = ctx->stencil_ref.ref_value[1];
 604                 } else {
 605                         fragmeta->stencil_back = fragmeta->stencil_front;
 606                         fragmeta->stencil_mask_back = fragmeta->stencil_mask_front;
 607                         fragmeta->stencil_back.ref = fragmeta->stencil_front.ref;
 608                 }
 609
 610                 if (zsa->depth.enabled)
 611                         zfunc = zsa->depth.func;
 612
 613                 /* Depth state (TODO: Refactor) */
 614
 615                 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
 616                         zsa->depth.writemask);
 617         }
 618
 619         fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
 620         fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(zfunc));
 621 }
 622
 623 static void
 624 panfrost_frag_meta_blend_update(struct panfrost_context *ctx,
 625                                 struct mali_shader_meta *fragmeta,
 626                                 void *rts)
 627 {
 628         const struct panfrost_device *dev = pan_device(ctx->base.screen);
 629
 630         SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
 631                 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
 632                 !ctx->blend->base.dither);
 633
 634         /* Get blending setup */
 635         unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
 636
 637         struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
 638         unsigned shader_offset = 0;
 639         struct panfrost_bo *shader_bo = NULL;
 640
 641         for (unsigned c = 0; c < rt_count; ++c)
 642                 blend[c] = panfrost_get_blend_for_context(ctx, c, &shader_bo,
 643                                                           &shader_offset);
 644
 645          /* If there is a blend shader, work registers are shared. XXX: opt */
 646
 647         for (unsigned c = 0; c < rt_count; ++c) {
 648                 if (blend[c].is_shader)
 649                         fragmeta->midgard1.work_count = 16;
 650         }
 651
 652         /* Even on MFBD, the shader descriptor gets blend shaders. It's *also*
 653          * copied to the blend_meta appended (by convention), but this is the
 654          * field actually read by the hardware. (Or maybe both are read...?).
 655          * Specify the last RTi with a blend shader. */
 656
 657         fragmeta->blend.shader = 0;
 658
 659         for (signed rt = (rt_count - 1); rt >= 0; --rt) {
 660                 if (!blend[rt].is_shader)
 661                         continue;
 662
 663                 fragmeta->blend.shader = blend[rt].shader.gpu |
 664                                          blend[rt].shader.first_tag;
 665                 break;
 666         }
 667
 668         if (dev->quirks & MIDGARD_SFBD) {
 669                 /* When only a single render target platform is used, the blend
 670                  * information is inside the shader meta itself. We additionally
 671                  * need to signal CAN_DISCARD for nontrivial blend modes (so
 672                  * we're able to read back the destination buffer) */
 673
 674                 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
 675                         blend[0].is_shader);
 676
 677                 if (!blend[0].is_shader) {
 678                         fragmeta->blend.equation = *blend[0].equation.equation;
 679                         fragmeta->blend.constant = blend[0].equation.constant;
 680                 }
 681
 682                 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
 683                         !blend[0].no_blending);
 684                 return;
 685         }
 686
 687         /* Additional blend descriptor tacked on for jobs using MFBD */
 688
 689         for (unsigned i = 0; i < rt_count; ++i) {
 690                 if (dev->quirks & IS_BIFROST) {
 691                         struct bifrost_blend_rt *brts = rts;
 692                         struct panfrost_shader_state *fs;
 693                         fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
 694
 695                         brts[i].flags = 0x200;
 696                         if (blend[i].is_shader) {
 697                                 /* The blend shader's address needs to be at
 698                                  * the same top 32 bit as the fragment shader.
 699                                  * TODO: Ensure that's always the case.
 700                                  */
 701                                 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
 702                                        (fs->bo->gpu & (0xffffffffull << 32)));
 703                                 brts[i].shader = blend[i].shader.gpu;
 704                                 brts[i].unk2 = 0x0;
 705                         } else {
 706                                 enum pipe_format format = ctx->pipe_framebuffer.cbufs[i]->format;
 707                                 const struct util_format_description *format_desc;
 708                                 format_desc = util_format_description(format);
 709
 710                                 brts[i].equation = *blend[i].equation.equation;
 711
 712                                 /* TODO: this is a bit more complicated */
 713                                 brts[i].constant = blend[i].equation.constant;
 714
 715                                 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
 716                                 brts[i].unk2 = 0x19;
 717
 718                                 brts[i].shader_type = fs->blend_types[i];
 719                         }
 720                 } else {
 721                         struct midgard_blend_rt *mrts = rts;
 722
 723                         mrts[i].flags = 0x200;
 724
 725                         bool is_srgb = (ctx->pipe_framebuffer.nr_cbufs > i) &&
 726                                        (ctx->pipe_framebuffer.cbufs[i]) &&
 727                                        util_format_is_srgb(ctx->pipe_framebuffer.cbufs[i]->format);
 728
 729                         SET_BIT(mrts[i].flags, MALI_BLEND_MRT_SHADER, blend[i].is_shader);
 730                         SET_BIT(mrts[i].flags, MALI_BLEND_LOAD_TIB, !blend[i].no_blending);
 731                         SET_BIT(mrts[i].flags, MALI_BLEND_SRGB, is_srgb);
 732                         SET_BIT(mrts[i].flags, MALI_BLEND_NO_DITHER, !ctx->blend->base.dither);
 733
 734                         if (blend[i].is_shader) {
 735                                 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
 736                         } else {
 737                                 mrts[i].blend.equation = *blend[i].equation.equation;
 738                                 mrts[i].blend.constant = blend[i].equation.constant;
 739                         }
 740                 }
 741         }
 742 }
 743
 744 static void
 745 panfrost_frag_shader_meta_init(struct panfrost_context *ctx,
 746                                struct mali_shader_meta *fragmeta,
 747                                void *rts)
 748 {
 749         const struct panfrost_device *dev = pan_device(ctx->base.screen);
 750         struct panfrost_shader_state *fs;
 751
 752         fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
 753
 754         fragmeta->alpha_coverage = ~MALI_ALPHA_COVERAGE(0.000000);
 755         fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x3010;
 756         fragmeta->unknown2_4 = 0x4e0;
 757
 758         /* unknown2_4 has 0x10 bit set on T6XX and T720. We don't know why this
 759          * is required (independent of 32-bit/64-bit descriptors), or why it's
 760          * not used on later GPU revisions. Otherwise, all shader jobs fault on
 761          * these earlier chips (perhaps this is a chicken bit of some kind).
 762          * More investigation is needed. */
 763
 764         SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
 765
 766         /* Depending on whether it's legal to in the given shader, we try to
 767          * enable early-z testing (or forward-pixel kill?) */
 768
 769         SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
 770                 !fs->can_discard && !fs->writes_depth);
 771
 772         /* Add the writes Z/S flags if needed. */
 773         SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
 774         SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
 775
 776         /* Any time texturing is used, derivatives are implicitly calculated,
 777          * so we need to enable helper invocations */
 778
 779         SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
 780                 fs->helper_invocations);
 781
 782         /* CAN_DISCARD should be set if the fragment shader possibly contains a
 783          * 'discard' instruction. It is likely this is related to optimizations
 784          * related to forward-pixel kill, as per "Mali Performance 3: Is
 785          * EGL_BUFFER_PRESERVED a good thing?" by Peter Harris */
 786
 787         SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD, fs->can_discard);
 788         SET_BIT(fragmeta->midgard1.flags_lo, 0x400, fs->can_discard);
 789
 790         panfrost_frag_meta_rasterizer_update(ctx, fragmeta);
 791         panfrost_frag_meta_zsa_update(ctx, fragmeta);
 792         panfrost_frag_meta_blend_update(ctx, fragmeta, rts);
 793 }
 794
 795 void
 796 panfrost_emit_shader_meta(struct panfrost_batch *batch,
 797                           enum pipe_shader_type st,
 798                           struct mali_vertex_tiler_postfix *postfix)
 799 {
 800         struct panfrost_context *ctx = batch->ctx;
 801         struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
 802
 803         if (!ss) {
 804                 postfix->shader = 0;
 805                 return;
 806         }
 807
 808         struct mali_shader_meta meta;
 809
 810         panfrost_shader_meta_init(ctx, st, &meta);
 811
 812         /* Add the shader BO to the batch. */
 813         panfrost_batch_add_bo(batch, ss->bo,
 814                               PAN_BO_ACCESS_PRIVATE |
 815                               PAN_BO_ACCESS_READ |
 816                               panfrost_bo_access_for_stage(st));
 817
 818         mali_ptr shader_ptr;
 819
 820         if (st == PIPE_SHADER_FRAGMENT) {
 821                 struct panfrost_device *dev = pan_device(ctx->base.screen);
 822                 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
 823                 size_t desc_size = sizeof(meta);
 824                 void *rts = NULL;
 825                 struct panfrost_transfer xfer;
 826                 unsigned rt_size;
 827
 828                 if (dev->quirks & MIDGARD_SFBD)
 829                         rt_size = 0;
 830                 else if (dev->quirks & IS_BIFROST)
 831                         rt_size = sizeof(struct bifrost_blend_rt);
 832                 else
 833                         rt_size = sizeof(struct midgard_blend_rt);
 834
 835                 desc_size += rt_size * rt_count;
 836
 837                 if (rt_size)
 838                         rts = rzalloc_size(ctx, rt_size * rt_count);
 839
 840                 panfrost_frag_shader_meta_init(ctx, &meta, rts);
 841
 842                 xfer = panfrost_allocate_transient(batch, desc_size);
 843
 844                 memcpy(xfer.cpu, &meta, sizeof(meta));
 845                 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
 846
 847                 if (rt_size)
 848                         ralloc_free(rts);
 849
 850                 shader_ptr = xfer.gpu;
 851         } else {
 852                 shader_ptr = panfrost_upload_transient(batch, &meta,
 853                                                        sizeof(meta));
 854         }
 855
 856         postfix->shader = shader_ptr;
 857 }
 858
 859 static void
 860 panfrost_mali_viewport_init(struct panfrost_context *ctx,
 861                             struct mali_viewport *mvp)
 862 {
 863         const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
 864
 865         /* Clip bounds are encoded as floats. The viewport itself is encoded as
 866          * (somewhat) asymmetric ints. */
 867
 868         const struct pipe_scissor_state *ss = &ctx->scissor;
 869
 870         memset(mvp, 0, sizeof(*mvp));
 871
 872         /* By default, do no viewport clipping, i.e. clip to (-inf, inf) in
 873          * each direction. Clipping to the viewport in theory should work, but
 874          * in practice causes issues when we're not explicitly trying to
 875          * scissor */
 876
 877         *mvp = (struct mali_viewport) {
 878                 .clip_minx = -INFINITY,
 879                 .clip_miny = -INFINITY,
 880                 .clip_maxx = INFINITY,
 881                 .clip_maxy = INFINITY,
 882         };
 883
 884         /* Always scissor to the viewport by default. */
 885         float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
 886         float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
 887
 888         float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
 889         float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
 890
 891         float minz = (vp->translate[2] - fabsf(vp->scale[2]));
 892         float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
 893
 894         /* Apply the scissor test */
 895
 896         unsigned minx, miny, maxx, maxy;
 897
 898         if (ss && ctx->rasterizer && ctx->rasterizer->base.scissor) {
 899                 minx = MAX2(ss->minx, vp_minx);
 900                 miny = MAX2(ss->miny, vp_miny);
 901                 maxx = MIN2(ss->maxx, vp_maxx);
 902                 maxy = MIN2(ss->maxy, vp_maxy);
 903         } else {
 904                 minx = vp_minx;
 905                 miny = vp_miny;
 906                 maxx = vp_maxx;
 907                 maxy = vp_maxy;
 908         }
 909
 910         /* Hardware needs the min/max to be strictly ordered, so flip if we
 911          * need to. The viewport transformation in the vertex shader will
 912          * handle the negatives if we don't */
 913
 914         if (miny > maxy) {
 915                 unsigned temp = miny;
 916                 miny = maxy;
 917                 maxy = temp;
 918         }
 919
 920         if (minx > maxx) {
 921                 unsigned temp = minx;
 922                 minx = maxx;
 923                 maxx = temp;
 924         }
 925
 926         if (minz > maxz) {
 927                 float temp = minz;
 928                 minz = maxz;
 929                 maxz = temp;
 930         }
 931
 932         /* Clamp to the framebuffer size as a last check */
 933
 934         minx = MIN2(ctx->pipe_framebuffer.width, minx);
 935         maxx = MIN2(ctx->pipe_framebuffer.width, maxx);
 936
 937         miny = MIN2(ctx->pipe_framebuffer.height, miny);
 938         maxy = MIN2(ctx->pipe_framebuffer.height, maxy);
 939
 940         /* Upload */
 941
 942         mvp->viewport0[0] = minx;
 943         mvp->viewport1[0] = MALI_POSITIVE(maxx);
 944
 945         mvp->viewport0[1] = miny;
 946         mvp->viewport1[1] = MALI_POSITIVE(maxy);
 947
 948         mvp->clip_minz = minz;
 949         mvp->clip_maxz = maxz;
 950 }
 951
 952 void
 953 panfrost_emit_viewport(struct panfrost_batch *batch,
 954                        struct mali_vertex_tiler_postfix *tiler_postfix)
 955 {
 956         struct panfrost_context *ctx = batch->ctx;
 957         struct mali_viewport mvp;
 958
 959         panfrost_mali_viewport_init(batch->ctx,  &mvp);
 960
 961         /* Update the job, unless we're doing wallpapering (whose lack of
 962          * scissor we can ignore, since if we "miss" a tile of wallpaper, it'll
 963          * just... be faster :) */
 964
 965         if (!ctx->wallpaper_batch)
 966                 panfrost_batch_union_scissor(batch, mvp.viewport0[0],
 967                                              mvp.viewport0[1],
 968                                              mvp.viewport1[0] + 1,
 969                                              mvp.viewport1[1] + 1);
 970
 971         tiler_postfix->viewport = panfrost_upload_transient(batch, &mvp,
 972                                                             sizeof(mvp));
 973 }
 974
 975 static mali_ptr
 976 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
 977                                  enum pipe_shader_type st,
 978                                  struct panfrost_constant_buffer *buf,
 979                                  unsigned index)
 980 {
 981         struct pipe_constant_buffer *cb = &buf->cb[index];
 982         struct panfrost_resource *rsrc = pan_resource(cb->buffer);
 983
 984         if (rsrc) {
 985                 panfrost_batch_add_bo(batch, rsrc->bo,
 986                                       PAN_BO_ACCESS_SHARED |
 987                                       PAN_BO_ACCESS_READ |
 988                                       panfrost_bo_access_for_stage(st));
 989
 990                 /* Alignment gauranteed by
 991                  * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
 992                 return rsrc->bo->gpu + cb->buffer_offset;
 993         } else if (cb->user_buffer) {
 994                 return panfrost_upload_transient(batch,
 995                                                  cb->user_buffer +
 996                                                  cb->buffer_offset,
 997                                                  cb->buffer_size);
 998         } else {
 999                 unreachable("No constant buffer");
1000         }
1001 }
1002
/* One slot of the sysval area uploaded in front of the uniforms. All members
 * alias the same storage, so a sysval writer can pick whichever view matches
 * the data it produces. */
struct sysval_uniform {
        union {
                /* Four floats */
                float f[4];

                /* Four signed 32-bit integers */
                int32_t i[4];

                /* Four unsigned 32-bit integers */
                uint32_t u[4];

                /* Two 64-bit values, e.g. a GPU address in du[0] */
                uint64_t du[2];
        };
};
1011
1012 static void
1013 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
1014                                       struct sysval_uniform *uniform)
1015 {
1016         struct panfrost_context *ctx = batch->ctx;
1017         const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1018
1019         uniform->f[0] = vp->scale[0];
1020         uniform->f[1] = vp->scale[1];
1021         uniform->f[2] = vp->scale[2];
1022 }
1023
1024 static void
1025 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
1026                                        struct sysval_uniform *uniform)
1027 {
1028         struct panfrost_context *ctx = batch->ctx;
1029         const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1030
1031         uniform->f[0] = vp->translate[0];
1032         uniform->f[1] = vp->translate[1];
1033         uniform->f[2] = vp->translate[2];
1034 }
1035
1036 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
1037                                        enum pipe_shader_type st,
1038                                        unsigned int sysvalid,
1039                                        struct sysval_uniform *uniform)
1040 {
1041         struct panfrost_context *ctx = batch->ctx;
1042         unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
1043         unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
1044         bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
1045         struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
1046
1047         assert(dim);
1048         uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
1049
1050         if (dim > 1)
1051                 uniform->i[1] = u_minify(tex->texture->height0,
1052                                          tex->u.tex.first_level);
1053
1054         if (dim > 2)
1055                 uniform->i[2] = u_minify(tex->texture->depth0,
1056                                          tex->u.tex.first_level);
1057
1058         if (is_array)
1059                 uniform->i[dim] = tex->texture->array_size;
1060 }
1061
1062 static void
1063 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
1064                             enum pipe_shader_type st,
1065                             unsigned ssbo_id,
1066                             struct sysval_uniform *uniform)
1067 {
1068         struct panfrost_context *ctx = batch->ctx;
1069
1070         assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
1071         struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
1072
1073         /* Compute address */
1074         struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
1075
1076         panfrost_batch_add_bo(batch, bo,
1077                               PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
1078                               panfrost_bo_access_for_stage(st));
1079
1080         /* Upload address and size as sysval */
1081         uniform->du[0] = bo->gpu + sb.buffer_offset;
1082         uniform->u[2] = sb.buffer_size;
1083 }
1084
1085 static void
1086 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1087                                enum pipe_shader_type st,
1088                                unsigned samp_idx,
1089                                struct sysval_uniform *uniform)
1090 {
1091         struct panfrost_context *ctx = batch->ctx;
1092         struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1093
1094         uniform->f[0] = sampl->min_lod;
1095         uniform->f[1] = sampl->max_lod;
1096         uniform->f[2] = sampl->lod_bias;
1097
1098         /* Even without any errata, Midgard represents "no mipmapping" as
1099          * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1100          * panfrost_create_sampler_state which also explains our choice of
1101          * epsilon value (again to keep behaviour consistent) */
1102
1103         if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1104                 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1105 }
1106
1107 static void
1108 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1109                                        struct sysval_uniform *uniform)
1110 {
1111         struct panfrost_context *ctx = batch->ctx;
1112
1113         uniform->u[0] = ctx->compute_grid->grid[0];
1114         uniform->u[1] = ctx->compute_grid->grid[1];
1115         uniform->u[2] = ctx->compute_grid->grid[2];
1116 }
1117
1118 static void
1119 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1120                         struct panfrost_shader_state *ss,
1121                         enum pipe_shader_type st)
1122 {
1123         struct sysval_uniform *uniforms = (void *)buf;
1124
1125         for (unsigned i = 0; i < ss->sysval_count; ++i) {
1126                 int sysval = ss->sysval[i];
1127
1128                 switch (PAN_SYSVAL_TYPE(sysval)) {
1129                 case PAN_SYSVAL_VIEWPORT_SCALE:
1130                         panfrost_upload_viewport_scale_sysval(batch,
1131                                                               &uniforms[i]);
1132                         break;
1133                 case PAN_SYSVAL_VIEWPORT_OFFSET:
1134                         panfrost_upload_viewport_offset_sysval(batch,
1135                                                                &uniforms[i]);
1136                         break;
1137                 case PAN_SYSVAL_TEXTURE_SIZE:
1138                         panfrost_upload_txs_sysval(batch, st,
1139                                                    PAN_SYSVAL_ID(sysval),
1140                                                    &uniforms[i]);
1141                         break;
1142                 case PAN_SYSVAL_SSBO:
1143                         panfrost_upload_ssbo_sysval(batch, st,
1144                                                     PAN_SYSVAL_ID(sysval),
1145                                                     &uniforms[i]);
1146                         break;
1147                 case PAN_SYSVAL_NUM_WORK_GROUPS:
1148                         panfrost_upload_num_work_groups_sysval(batch,
1149                                                                &uniforms[i]);
1150                         break;
1151                 case PAN_SYSVAL_SAMPLER:
1152                         panfrost_upload_sampler_sysval(batch, st,
1153                                                        PAN_SYSVAL_ID(sysval),
1154                                                        &uniforms[i]);
1155                         break;
1156                 default:
1157                         assert(0);
1158                 }
1159         }
1160 }
1161
1162 static const void *
1163 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1164                                  unsigned index)
1165 {
1166         struct pipe_constant_buffer *cb = &buf->cb[index];
1167         struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1168
1169         if (rsrc)
1170                 return rsrc->bo->cpu;
1171         else if (cb->user_buffer)
1172                 return cb->user_buffer;
1173         else
1174                 unreachable("No constant buffer");
1175 }
1176
1177 void
1178 panfrost_emit_const_buf(struct panfrost_batch *batch,
1179                         enum pipe_shader_type stage,
1180                         struct mali_vertex_tiler_postfix *postfix)
1181 {
1182         struct panfrost_context *ctx = batch->ctx;
1183         struct panfrost_shader_variants *all = ctx->shader[stage];
1184
1185         if (!all)
1186                 return;
1187
1188         struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1189
1190         struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1191
1192         /* Uniforms are implicitly UBO #0 */
1193         bool has_uniforms = buf->enabled_mask & (1 << 0);
1194
1195         /* Allocate room for the sysval and the uniforms */
1196         size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1197         size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1198         size_t size = sys_size + uniform_size;
1199         struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1200                                                                         size);
1201
1202         /* Upload sysvals requested by the shader */
1203         panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1204
1205         /* Upload uniforms */
1206         if (has_uniforms && uniform_size) {
1207                 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1208                 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1209         }
1210
1211         /* Next up, attach UBOs. UBO #0 is the uniforms we just
1212          * uploaded */
1213
1214         unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1215         assert(ubo_count >= 1);
1216
1217         size_t sz = sizeof(uint64_t) * ubo_count;
1218         uint64_t ubos[PAN_MAX_CONST_BUFFERS];
1219         int uniform_count = ss->uniform_count;
1220
1221         /* Upload uniforms as a UBO */
1222         ubos[0] = MALI_MAKE_UBO(2 + uniform_count, transfer.gpu);
1223
1224         /* The rest are honest-to-goodness UBOs */
1225
1226         for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1227                 size_t usz = buf->cb[ubo].buffer_size;
1228                 bool enabled = buf->enabled_mask & (1 << ubo);
1229                 bool empty = usz == 0;
1230
1231                 if (!enabled || empty) {
1232                         /* Stub out disabled UBOs to catch accesses */
1233                         ubos[ubo] = MALI_MAKE_UBO(0, 0xDEAD0000);
1234                         continue;
1235                 }
1236
1237                 mali_ptr gpu = panfrost_map_constant_buffer_gpu(batch, stage,
1238                                                                 buf, ubo);
1239
1240                 unsigned bytes_per_field = 16;
1241                 unsigned aligned = ALIGN_POT(usz, bytes_per_field);
1242                 ubos[ubo] = MALI_MAKE_UBO(aligned / bytes_per_field, gpu);
1243         }
1244
1245         mali_ptr ubufs = panfrost_upload_transient(batch, ubos, sz);
1246         postfix->uniforms = transfer.gpu;
1247         postfix->uniform_buffers = ubufs;
1248
1249         buf->dirty_mask = 0;
1250 }
1251
1252 void
1253 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1254                             const struct pipe_grid_info *info,
1255                             struct midgard_payload_vertex_tiler *vtp)
1256 {
1257         struct panfrost_context *ctx = batch->ctx;
1258         struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1259         struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1260         unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1261                                                            128));
1262         unsigned shared_size = single_size * info->grid[0] * info->grid[1] *
1263                                info->grid[2] * 4;
1264         struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1265                                                                   shared_size,
1266                                                                   1);
1267
1268         struct mali_shared_memory shared = {
1269                 .shared_memory = bo->gpu,
1270                 .shared_workgroup_count =
1271                         util_logbase2_ceil(info->grid[0]) +
1272                         util_logbase2_ceil(info->grid[1]) +
1273                         util_logbase2_ceil(info->grid[2]),
1274                 .shared_unk1 = 0x2,
1275                 .shared_shift = util_logbase2(single_size) - 1
1276         };
1277
1278         vtp->postfix.shared_memory = panfrost_upload_transient(batch, &shared,
1279                                                                sizeof(shared));
1280 }
1281
1282 static mali_ptr
1283 panfrost_get_tex_desc(struct panfrost_batch *batch,
1284                       enum pipe_shader_type st,
1285                       struct panfrost_sampler_view *view)
1286 {
1287         if (!view)
1288                 return (mali_ptr) 0;
1289
1290         struct pipe_sampler_view *pview = &view->base;
1291         struct panfrost_resource *rsrc = pan_resource(pview->texture);
1292
1293         /* Add the BO to the job so it's retained until the job is done. */
1294
1295         panfrost_batch_add_bo(batch, rsrc->bo,
1296                               PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1297                               panfrost_bo_access_for_stage(st));
1298
1299         panfrost_batch_add_bo(batch, view->midgard_bo,
1300                               PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1301                               panfrost_bo_access_for_stage(st));
1302
1303         return view->midgard_bo->gpu;
1304 }
1305
1306 void
1307 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1308                                   enum pipe_shader_type stage,
1309                                   struct mali_vertex_tiler_postfix *postfix)
1310 {
1311         struct panfrost_context *ctx = batch->ctx;
1312         struct panfrost_device *device = pan_device(ctx->base.screen);
1313
1314         if (!ctx->sampler_view_count[stage])
1315                 return;
1316
1317         if (device->quirks & IS_BIFROST) {
1318                 struct bifrost_texture_descriptor *descriptors;
1319
1320                 descriptors = malloc(sizeof(struct bifrost_texture_descriptor) *
1321                                      ctx->sampler_view_count[stage]);
1322
1323                 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1324                         struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1325                         struct pipe_sampler_view *pview = &view->base;
1326                         struct panfrost_resource *rsrc = pan_resource(pview->texture);
1327
1328                         panfrost_batch_add_bo(batch, rsrc->bo,
1329                                               PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1330                                               panfrost_bo_access_for_stage(stage));
1331
1332                         memcpy(&descriptors[i], view->bifrost_descriptor, sizeof(*view->bifrost_descriptor));
1333                 }
1334
1335                 postfix->textures = panfrost_upload_transient(batch,
1336                                                               descriptors,
1337                                                               sizeof(struct bifrost_texture_descriptor) *
1338                                                                       ctx->sampler_view_count[stage]);
1339
1340                 free(descriptors);
1341         } else {
1342                 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1343
1344                 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i)
1345                         trampolines[i] = panfrost_get_tex_desc(batch, stage,
1346                                                                ctx->sampler_views[stage][i]);
1347
1348                 postfix->textures = panfrost_upload_transient(batch,
1349                                                               trampolines,
1350                                                               sizeof(uint64_t) *
1351                                                               ctx->sampler_view_count[stage]);
1352         }
1353 }
1354
1355 void
1356 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1357                                   enum pipe_shader_type stage,
1358                                   struct mali_vertex_tiler_postfix *postfix)
1359 {
1360         struct panfrost_context *ctx = batch->ctx;
1361         struct panfrost_device *device = pan_device(ctx->base.screen);
1362
1363         if (!ctx->sampler_count[stage])
1364                 return;
1365
1366         if (device->quirks & IS_BIFROST) {
1367                 size_t desc_size = sizeof(struct bifrost_sampler_descriptor);
1368                 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1369                 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1370                                                                                 transfer_size);
1371                 struct bifrost_sampler_descriptor *desc = (struct bifrost_sampler_descriptor *)transfer.cpu;
1372
1373                 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1374                         desc[i] = ctx->samplers[stage][i]->bifrost_hw;
1375
1376                 postfix->sampler_descriptor = transfer.gpu;
1377         } else {
1378                 size_t desc_size = sizeof(struct mali_sampler_descriptor);
1379                 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1380                 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1381                                                                                 transfer_size);
1382                 struct mali_sampler_descriptor *desc = (struct mali_sampler_descriptor *)transfer.cpu;
1383
1384                 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1385                         desc[i] = ctx->samplers[stage][i]->midgard_hw;
1386
1387                 postfix->sampler_descriptor = transfer.gpu;
1388         }
1389 }
1390
1391 void
1392 panfrost_emit_vertex_attr_meta(struct panfrost_batch *batch,
1393                                struct mali_vertex_tiler_postfix *vertex_postfix)
1394 {
1395         struct panfrost_context *ctx = batch->ctx;
1396
1397         if (!ctx->vertex)
1398                 return;
1399
1400         struct panfrost_vertex_state *so = ctx->vertex;
1401
1402         panfrost_vertex_state_upd_attr_offs(ctx, vertex_postfix);
1403         vertex_postfix->attribute_meta = panfrost_upload_transient(batch, so->hw,
1404                                                                sizeof(*so->hw) *
1405                                                                PAN_MAX_ATTRIBUTE);
1406 }
1407
1408 void
1409 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1410                           struct mali_vertex_tiler_postfix *vertex_postfix)
1411 {
1412         struct panfrost_context *ctx = batch->ctx;
1413         struct panfrost_vertex_state *so = ctx->vertex;
1414
1415         /* Staged mali_attr, and index into them. i =/= k, depending on the
1416          * vertex buffer mask and instancing. Twice as much room is allocated,
1417          * for a worst case of NPOT_DIVIDEs which take up extra slot */
1418         union mali_attr attrs[PIPE_MAX_ATTRIBS * 2];
1419         unsigned k = 0;
1420
1421         for (unsigned i = 0; i < so->num_elements; ++i) {
1422                 /* We map a mali_attr to be 1:1 with the mali_attr_meta, which
1423                  * means duplicating some vertex buffers (who cares? aside from
1424                  * maybe some caching implications but I somehow doubt that
1425                  * matters) */
1426
1427                 struct pipe_vertex_element *elem = &so->pipe[i];
1428                 unsigned vbi = elem->vertex_buffer_index;
1429
1430                 /* The exception to 1:1 mapping is that we can have multiple
1431                  * entries (NPOT divisors), so we fixup anyways */
1432
1433                 so->hw[i].index = k;
1434
1435                 if (!(ctx->vb_mask & (1 << vbi)))
1436                         continue;
1437
1438                 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1439                 struct panfrost_resource *rsrc;
1440
1441                 rsrc = pan_resource(buf->buffer.resource);
1442                 if (!rsrc)
1443                         continue;
1444
1445                 /* Align to 64 bytes by masking off the lower bits. This
1446                  * will be adjusted back when we fixup the src_offset in
1447                  * mali_attr_meta */
1448
1449                 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1450                 mali_ptr addr = raw_addr & ~63;
1451                 unsigned chopped_addr = raw_addr - addr;
1452
1453                 /* Add a dependency of the batch on the vertex buffer */
1454                 panfrost_batch_add_bo(batch, rsrc->bo,
1455                                       PAN_BO_ACCESS_SHARED |
1456                                       PAN_BO_ACCESS_READ |
1457                                       PAN_BO_ACCESS_VERTEX_TILER);
1458
1459                 /* Set common fields */
1460                 attrs[k].elements = addr;
1461                 attrs[k].stride = buf->stride;
1462
1463                 /* Since we advanced the base pointer, we shrink the buffer
1464                  * size */
1465                 attrs[k].size = rsrc->base.width0 - buf->buffer_offset;
1466
1467                 /* We need to add the extra size we masked off (for
1468                  * correctness) so the data doesn't get clamped away */
1469                 attrs[k].size += chopped_addr;
1470
1471                 /* For non-instancing make sure we initialize */
1472                 attrs[k].shift = attrs[k].extra_flags = 0;
1473
1474                 /* Instancing uses a dramatically different code path than
1475                  * linear, so dispatch for the actual emission now that the
1476                  * common code is finished */
1477
1478                 unsigned divisor = elem->instance_divisor;
1479
1480                 if (divisor && ctx->instance_count == 1) {
1481                         /* Silly corner case where there's a divisor(=1) but
1482                          * there's no legitimate instancing. So we want *every*
1483                          * attribute to be the same. So set stride to zero so
1484                          * we don't go anywhere. */
1485
1486                         attrs[k].size = attrs[k].stride + chopped_addr;
1487                         attrs[k].stride = 0;
1488                         attrs[k++].elements |= MALI_ATTR_LINEAR;
1489                 } else if (ctx->instance_count <= 1) {
1490                         /* Normal, non-instanced attributes */
1491                         attrs[k++].elements |= MALI_ATTR_LINEAR;
1492                 } else {
1493                         unsigned instance_shift = vertex_postfix->instance_shift;
1494                         unsigned instance_odd = vertex_postfix->instance_odd;
1495
1496                         k += panfrost_vertex_instanced(ctx->padded_count,
1497                                                        instance_shift,
1498                                                        instance_odd,
1499                                                        divisor, &attrs[k]);
1500                 }
1501         }
1502
1503         /* Add special gl_VertexID/gl_InstanceID buffers */
1504
1505         panfrost_vertex_id(ctx->padded_count, &attrs[k]);
1506         so->hw[PAN_VERTEX_ID].index = k++;
1507         panfrost_instance_id(ctx->padded_count, &attrs[k]);
1508         so->hw[PAN_INSTANCE_ID].index = k++;
1509
1510         /* Upload whatever we emitted and go */
1511
1512         vertex_postfix->attributes = panfrost_upload_transient(batch, attrs,
1513                                                            k * sizeof(*attrs));
1514 }
1515
1516 static mali_ptr
1517 panfrost_emit_varyings(struct panfrost_batch *batch, union mali_attr *slot,
1518                        unsigned stride, unsigned count)
1519 {
1520         /* Fill out the descriptor */
1521         slot->stride = stride;
1522         slot->size = stride * count;
1523         slot->shift = slot->extra_flags = 0;
1524
1525         struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1526                                                                         slot->size);
1527
1528         slot->elements = transfer.gpu | MALI_ATTR_LINEAR;
1529
1530         return transfer.gpu;
1531 }
1532
1533 static void
1534 panfrost_emit_streamout(struct panfrost_batch *batch, union mali_attr *slot,
1535                         unsigned stride, unsigned offset, unsigned count,
1536                         struct pipe_stream_output_target *target)
1537 {
1538         /* Fill out the descriptor */
1539         slot->stride = stride * 4;
1540         slot->shift = slot->extra_flags = 0;
1541
1542         unsigned max_size = target->buffer_size;
1543         unsigned expected_size = slot->stride * count;
1544
1545         slot->size = MIN2(max_size, expected_size);
1546
1547         /* Grab the BO and bind it to the batch */
1548         struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1549
1550         /* Varyings are WRITE from the perspective of the VERTEX but READ from
1551          * the perspective of the TILER and FRAGMENT.
1552          */
1553         panfrost_batch_add_bo(batch, bo,
1554                               PAN_BO_ACCESS_SHARED |
1555                               PAN_BO_ACCESS_RW |
1556                               PAN_BO_ACCESS_VERTEX_TILER |
1557                               PAN_BO_ACCESS_FRAGMENT);
1558
1559         mali_ptr addr = bo->gpu + target->buffer_offset + (offset * slot->stride);
1560         slot->elements = addr;
1561 }
1562
1563 /* Given a shader and buffer indices, link varying metadata together */
1564
1565 static bool
1566 is_special_varying(gl_varying_slot loc)
1567 {
1568         switch (loc) {
1569         case VARYING_SLOT_POS:
1570         case VARYING_SLOT_PSIZ:
1571         case VARYING_SLOT_PNTC:
1572         case VARYING_SLOT_FACE:
1573                 return true;
1574         default:
1575                 return false;
1576         }
1577 }
1578
1579 static void
1580 panfrost_emit_varying_meta(void *outptr, struct panfrost_shader_state *ss,
1581                            signed general, signed gl_Position,
1582                            signed gl_PointSize, signed gl_PointCoord,
1583                            signed gl_FrontFacing)
1584 {
1585         struct mali_attr_meta *out = (struct mali_attr_meta *) outptr;
1586
1587         for (unsigned i = 0; i < ss->varying_count; ++i) {
1588                 gl_varying_slot location = ss->varyings_loc[i];
1589                 int index = -1;
1590
1591                 switch (location) {
1592                 case VARYING_SLOT_POS:
1593                         index = gl_Position;
1594                         break;
1595                 case VARYING_SLOT_PSIZ:
1596                         index = gl_PointSize;
1597                         break;
1598                 case VARYING_SLOT_PNTC:
1599                         index = gl_PointCoord;
1600                         break;
1601                 case VARYING_SLOT_FACE:
1602                         index = gl_FrontFacing;
1603                         break;
1604                 default:
1605                         index = general;
1606                         break;
1607                 }
1608
1609                 assert(index >= 0);
1610                 out[i].index = index;
1611         }
1612 }
1613
1614 static bool
1615 has_point_coord(unsigned mask, gl_varying_slot loc)
1616 {
1617         if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1618                 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1619         else if (loc == VARYING_SLOT_PNTC)
1620                 return (mask & (1 << 8));
1621         else
1622                 return false;
1623 }
1624
1625 /* Helpers for manipulating stream out information so we can pack varyings
1626  * accordingly. Compute the src_offset for a given captured varying */
1627
1628 static struct pipe_stream_output *
1629 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1630 {
1631         for (unsigned i = 0; i < info->num_outputs; ++i) {
1632                 if (info->output[i].register_index == loc)
1633                         return &info->output[i];
1634         }
1635
1636         unreachable("Varying not captured");
1637 }
1638
1639 /* TODO: Integers */
1640 static enum mali_format
1641 pan_xfb_format(unsigned nr_components)
1642 {
1643         switch (nr_components) {
1644                 case 1: return MALI_R32F;
1645                 case 2: return MALI_RG32F;
1646                 case 3: return MALI_RGB32F;
1647                 case 4: return MALI_RGBA32F;
1648                 default: unreachable("Invalid format");
1649         }
1650 }
1651
1652 void
1653 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1654                                  unsigned vertex_count,
1655                                  struct mali_vertex_tiler_postfix *vertex_postfix,
1656                                  struct mali_vertex_tiler_postfix *tiler_postfix,
1657                                  union midgard_primitive_size *primitive_size)
1658 {
1659         /* Load the shaders */
1660         struct panfrost_context *ctx = batch->ctx;
1661         struct panfrost_shader_state *vs, *fs;
1662         unsigned int num_gen_varyings = 0;
1663         size_t vs_size, fs_size;
1664
1665         /* Allocate the varying descriptor */
1666
1667         vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1668         fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1669         vs_size = sizeof(struct mali_attr_meta) * vs->varying_count;
1670         fs_size = sizeof(struct mali_attr_meta) * fs->varying_count;
1671
1672         struct panfrost_transfer trans = panfrost_allocate_transient(batch,
1673                                                                      vs_size +
1674                                                                      fs_size);
1675
1676         struct pipe_stream_output_info *so = &vs->stream_output;
1677
1678         /* Check if this varying is linked by us. This is the case for
1679          * general-purpose, non-captured varyings. If it is, link it. If it's
1680          * not, use the provided stream out information to determine the
1681          * offset, since it was already linked for us. */
1682
1683         for (unsigned i = 0; i < vs->varying_count; i++) {
1684                 gl_varying_slot loc = vs->varyings_loc[i];
1685
1686                 bool special = is_special_varying(loc);
1687                 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1688
1689                 if (captured) {
1690                         struct pipe_stream_output *o = pan_get_so(so, loc);
1691
1692                         unsigned dst_offset = o->dst_offset * 4; /* dwords */
1693                         vs->varyings[i].src_offset = dst_offset;
1694                 } else if (!special) {
1695                         vs->varyings[i].src_offset = 16 * (num_gen_varyings++);
1696                 }
1697         }
1698
1699         /* Conversely, we need to set src_offset for the captured varyings.
1700          * Here, the layout is defined by the stream out info, not us */
1701
1702         /* Link up with fragment varyings */
1703         bool reads_point_coord = fs->reads_point_coord;
1704
1705         for (unsigned i = 0; i < fs->varying_count; i++) {
1706                 gl_varying_slot loc = fs->varyings_loc[i];
1707                 unsigned src_offset;
1708                 signed vs_idx = -1;
1709
1710                 /* Link up */
1711                 for (unsigned j = 0; j < vs->varying_count; ++j) {
1712                         if (vs->varyings_loc[j] == loc) {
1713                                 vs_idx = j;
1714                                 break;
1715                         }
1716                 }
1717
1718                 /* Either assign or reuse */
1719                 if (vs_idx >= 0)
1720                         src_offset = vs->varyings[vs_idx].src_offset;
1721                 else
1722                         src_offset = 16 * (num_gen_varyings++);
1723
1724                 fs->varyings[i].src_offset = src_offset;
1725
1726                 if (has_point_coord(fs->point_sprite_mask, loc))
1727                         reads_point_coord = true;
1728         }
1729
1730         memcpy(trans.cpu, vs->varyings, vs_size);
1731         memcpy(trans.cpu + vs_size, fs->varyings, fs_size);
1732
1733         union mali_attr varyings[PIPE_MAX_ATTRIBS] = {0};
1734
1735         /* Figure out how many streamout buffers could be bound */
1736         unsigned so_count = ctx->streamout.num_targets;
1737         for (unsigned i = 0; i < vs->varying_count; i++) {
1738                 gl_varying_slot loc = vs->varyings_loc[i];
1739
1740                 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1741                 if (!captured) continue;
1742
1743                 struct pipe_stream_output *o = pan_get_so(so, loc);
1744                 so_count = MAX2(so_count, o->output_buffer + 1);
1745         }
1746
1747         signed idx = so_count;
1748         signed general = idx++;
1749         signed gl_Position = idx++;
1750         signed gl_PointSize = vs->writes_point_size ? (idx++) : -1;
1751         signed gl_PointCoord = reads_point_coord ? (idx++) : -1;
1752         signed gl_FrontFacing = fs->reads_face ? (idx++) : -1;
1753         signed gl_FragCoord = fs->reads_frag_coord ? (idx++) : -1;
1754
1755         /* Emit the stream out buffers */
1756
1757         unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1758                                                            ctx->vertex_count);
1759
1760         for (unsigned i = 0; i < so_count; ++i) {
1761                 if (i < ctx->streamout.num_targets) {
1762                         panfrost_emit_streamout(batch, &varyings[i],
1763                                                 so->stride[i],
1764                                                 ctx->streamout.offsets[i],
1765                                                 out_count,
1766                                                 ctx->streamout.targets[i]);
1767                 } else {
1768                         /* Emit a dummy buffer */
1769                         panfrost_emit_varyings(batch, &varyings[i],
1770                                                so->stride[i] * 4,
1771                                                out_count);
1772
1773                         /* Clear the attribute type */
1774                         varyings[i].elements &= ~0xF;
1775                 }
1776         }
1777
1778         panfrost_emit_varyings(batch, &varyings[general],
1779                                num_gen_varyings * 16,
1780                                vertex_count);
1781
1782         mali_ptr varyings_p;
1783
1784         /* fp32 vec4 gl_Position */
1785         varyings_p = panfrost_emit_varyings(batch, &varyings[gl_Position],
1786                                             sizeof(float) * 4, vertex_count);
1787         tiler_postfix->position_varying = varyings_p;
1788
1789
1790         if (panfrost_writes_point_size(ctx)) {
1791                 varyings_p = panfrost_emit_varyings(batch,
1792                                                     &varyings[gl_PointSize],
1793                                                     2, vertex_count);
1794                 primitive_size->pointer = varyings_p;
1795         }
1796
1797         if (reads_point_coord)
1798                 varyings[gl_PointCoord].elements = MALI_VARYING_POINT_COORD;
1799
1800         if (fs->reads_face)
1801                 varyings[gl_FrontFacing].elements = MALI_VARYING_FRONT_FACING;
1802
1803         if (fs->reads_frag_coord)
1804                 varyings[gl_FragCoord].elements = MALI_VARYING_FRAG_COORD;
1805
1806         struct panfrost_device *device = pan_device(ctx->base.screen);
1807         assert(!(device->quirks & IS_BIFROST) || !(reads_point_coord || fs->reads_face || fs->reads_frag_coord));
1808
1809         /* Let's go ahead and link varying meta to the buffer in question, now
1810          * that that information is available. VARYING_SLOT_POS is mapped to
1811          * gl_FragCoord for fragment shaders but gl_Positionf or vertex shaders
1812          * */
1813
1814         panfrost_emit_varying_meta(trans.cpu, vs, general, gl_Position,
1815                                    gl_PointSize, gl_PointCoord,
1816                                    gl_FrontFacing);
1817
1818         panfrost_emit_varying_meta(trans.cpu + vs_size, fs, general,
1819                                    gl_FragCoord, gl_PointSize,
1820                                    gl_PointCoord, gl_FrontFacing);
1821
1822         /* Replace streamout */
1823
1824         struct mali_attr_meta *ovs = (struct mali_attr_meta *)trans.cpu;
1825         struct mali_attr_meta *ofs = ovs + vs->varying_count;
1826
1827         for (unsigned i = 0; i < vs->varying_count; i++) {
1828                 gl_varying_slot loc = vs->varyings_loc[i];
1829
1830                 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1831                 if (!captured)
1832                         continue;
1833
1834                 struct pipe_stream_output *o = pan_get_so(so, loc);
1835                 ovs[i].index = o->output_buffer;
1836
1837                 /* Set the type appropriately. TODO: Integer varyings XXX */
1838                 assert(o->stream == 0);
1839                 ovs[i].format = pan_xfb_format(o->num_components);
1840
1841                 if (device->quirks & HAS_SWIZZLES)
1842                         ovs[i].swizzle = panfrost_get_default_swizzle(o->num_components);
1843                 else
1844                         ovs[i].swizzle = panfrost_bifrost_swizzle(o->num_components);
1845
1846                 /* Link to the fragment */
1847                 signed fs_idx = -1;
1848
1849                 /* Link up */
1850                 for (unsigned j = 0; j < fs->varying_count; ++j) {
1851                         if (fs->varyings_loc[j] == loc) {
1852                                 fs_idx = j;
1853                                 break;
1854                         }
1855                 }
1856
1857                 if (fs_idx >= 0) {
1858                         ofs[fs_idx].index = ovs[i].index;
1859                         ofs[fs_idx].format = ovs[i].format;
1860                         ofs[fs_idx].swizzle = ovs[i].swizzle;
1861                 }
1862         }
1863
1864         /* Replace point sprite */
1865         for (unsigned i = 0; i < fs->varying_count; i++) {
1866                 /* If we have a point sprite replacement, handle that here. We
1867                  * have to translate location first.  TODO: Flip y in shader.
1868                  * We're already keying ... just time crunch .. */
1869
1870                 if (has_point_coord(fs->point_sprite_mask,
1871                                     fs->varyings_loc[i])) {
1872                         ofs[i].index = gl_PointCoord;
1873
1874                         /* Swizzle out the z/w to 0/1 */
1875                         ofs[i].format = MALI_RG16F;
1876                         ofs[i].swizzle = panfrost_get_default_swizzle(2);
1877                 }
1878         }
1879
1880         /* Fix up unaligned addresses */
1881         for (unsigned i = 0; i < so_count; ++i) {
1882                 if (varyings[i].elements < MALI_RECORD_SPECIAL)
1883                         continue;
1884
1885                 unsigned align = (varyings[i].elements & 63);
1886
1887                 /* While we're at it, the SO buffers are linear */
1888
1889                 if (!align) {
1890                         varyings[i].elements |= MALI_ATTR_LINEAR;
1891                         continue;
1892                 }
1893
1894                 /* We need to adjust alignment */
1895                 varyings[i].elements &= ~63;
1896                 varyings[i].elements |= MALI_ATTR_LINEAR;
1897                 varyings[i].size += align;
1898
1899                 for (unsigned v = 0; v < vs->varying_count; ++v) {
1900                         if (ovs[v].index != i)
1901                                 continue;
1902
1903                         ovs[v].src_offset = vs->varyings[v].src_offset + align;
1904                 }
1905
1906                 for (unsigned f = 0; f < fs->varying_count; ++f) {
1907                         if (ofs[f].index != i)
1908                                 continue;
1909
1910                         ofs[f].src_offset = fs->varyings[f].src_offset + align;
1911                 }
1912         }
1913
1914         varyings_p = panfrost_upload_transient(batch, varyings,
1915                                                idx * sizeof(*varyings));
1916         vertex_postfix->varyings = varyings_p;
1917         tiler_postfix->varyings = varyings_p;
1918
1919         vertex_postfix->varying_meta = trans.gpu;
1920         tiler_postfix->varying_meta = trans.gpu + vs_size;
1921 }
1922
1923 void
1924 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
1925                                 struct mali_vertex_tiler_prefix *vertex_prefix,
1926                                 struct mali_vertex_tiler_postfix *vertex_postfix,
1927                                 struct mali_vertex_tiler_prefix *tiler_prefix,
1928                                 struct mali_vertex_tiler_postfix *tiler_postfix,
1929                                 union midgard_primitive_size *primitive_size)
1930 {
1931         struct panfrost_context *ctx = batch->ctx;
1932         struct panfrost_device *device = pan_device(ctx->base.screen);
1933         bool wallpapering = ctx->wallpaper_batch && batch->tiler_dep;
1934         struct bifrost_payload_vertex bifrost_vertex = {0,};
1935         struct bifrost_payload_tiler bifrost_tiler = {0,};
1936         struct midgard_payload_vertex_tiler midgard_vertex = {0,};
1937         struct midgard_payload_vertex_tiler midgard_tiler = {0,};
1938         void *vp, *tp;
1939         size_t vp_size, tp_size;
1940
1941         if (device->quirks & IS_BIFROST) {
1942                 bifrost_vertex.prefix = *vertex_prefix;
1943                 bifrost_vertex.postfix = *vertex_postfix;
1944                 vp = &bifrost_vertex;
1945                 vp_size = sizeof(bifrost_vertex);
1946
1947                 bifrost_tiler.prefix = *tiler_prefix;
1948                 bifrost_tiler.tiler.primitive_size = *primitive_size;
1949                 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
1950                 bifrost_tiler.postfix = *tiler_postfix;
1951                 tp = &bifrost_tiler;
1952                 tp_size = sizeof(bifrost_tiler);
1953         } else {
1954                 midgard_vertex.prefix = *vertex_prefix;
1955                 midgard_vertex.postfix = *vertex_postfix;
1956                 vp = &midgard_vertex;
1957                 vp_size = sizeof(midgard_vertex);
1958
1959                 midgard_tiler.prefix = *tiler_prefix;
1960                 midgard_tiler.postfix = *tiler_postfix;
1961                 midgard_tiler.primitive_size = *primitive_size;
1962                 tp = &midgard_tiler;
1963                 tp_size = sizeof(midgard_tiler);
1964         }
1965
1966         if (wallpapering) {
1967                 /* Inject in reverse order, with "predicted" job indices.
1968                  * THIS IS A HACK XXX */
1969                 panfrost_new_job(batch, JOB_TYPE_TILER, false,
1970                                  batch->job_index + 2, tp, tp_size, true);
1971                 panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
1972                                  vp, vp_size, true);
1973                 return;
1974         }
1975
1976         /* If rasterizer discard is enable, only submit the vertex */
1977
1978         bool rasterizer_discard = ctx->rasterizer &&
1979                                   ctx->rasterizer->base.rasterizer_discard;
1980
1981         unsigned vertex = panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
1982                                            vp, vp_size, false);
1983
1984         if (rasterizer_discard)
1985                 return;
1986
1987         panfrost_new_job(batch, JOB_TYPE_TILER, false, vertex, tp, tp_size,
1988                          false);
1989 }
1990
1991 /* TODO: stop hardcoding this */
1992 mali_ptr
1993 panfrost_emit_sample_locations(struct panfrost_batch *batch)
1994 {
1995         uint16_t locations[] = {
1996             128, 128,
1997             0, 256,
1998             0, 256,
1999             0, 256,
2000             0, 256,
2001             0, 256,
2002             0, 256,
2003             0, 256,
2004             0, 256,
2005             0, 256,
2006             0, 256,
2007             0, 256,
2008             0, 256,
2009             0, 256,
2010             0, 256,
2011             0, 256,
2012             0, 256,
2013             0, 256,
2014             0, 256,
2015             0, 256,
2016             0, 256,
2017             0, 256,
2018             0, 256,
2019             0, 256,
2020             0, 256,
2021             0, 256,
2022             0, 256,
2023             0, 256,
2024             0, 256,
2025             0, 256,
2026             0, 256,
2027             0, 256,
2028             128, 128,
2029             0, 0,
2030             0, 0,
2031             0, 0,
2032             0, 0,
2033             0, 0,
2034             0, 0,
2035             0, 0,
2036             0, 0,
2037             0, 0,
2038             0, 0,
2039             0, 0,
2040             0, 0,
2041             0, 0,
2042             0, 0,
2043             0, 0,
2044         };
2045
2046         return panfrost_upload_transient(batch, locations, 96 * sizeof(uint16_t));
2047 }