panfrost: Fix norm coords on bifrost sampler
[mesa.git] / src / gallium / drivers / panfrost / pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_allocate.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 unsigned shift = panfrost_get_stack_shift(batch->stack_size);
62 struct mali_shared_memory shared = {
63 .stack_shift = shift,
64 .scratchpad = panfrost_batch_get_scratchpad(batch, shift, dev->thread_tls_alloc, dev->core_count)->gpu,
65 .shared_workgroup_count = ~0,
66 };
67 postfix->shared_memory = panfrost_upload_transient(batch, &shared, sizeof(shared));
68 }
69
70 static void
71 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
72 struct mali_vertex_tiler_postfix *postfix)
73 {
74 struct panfrost_device *dev = pan_device(ctx->base.screen);
75 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
76
77 /* If we haven't, reserve space for the framebuffer */
78
79 if (!batch->framebuffer.gpu) {
80 unsigned size = (dev->quirks & MIDGARD_SFBD) ?
81 sizeof(struct mali_single_framebuffer) :
82 sizeof(struct mali_framebuffer);
83
84 batch->framebuffer = panfrost_allocate_transient(batch, size);
85
86 /* Tag the pointer */
87 if (!(dev->quirks & MIDGARD_SFBD))
88 batch->framebuffer.gpu |= MALI_MFBD;
89 }
90
91 postfix->shared_memory = batch->framebuffer.gpu;
92 }
93
94 static void
95 panfrost_vt_update_rasterizer(struct panfrost_context *ctx,
96 struct mali_vertex_tiler_prefix *prefix,
97 struct mali_vertex_tiler_postfix *postfix)
98 {
99 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
100
101 postfix->gl_enables |= 0x7;
102 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
103 rasterizer && rasterizer->base.front_ccw);
104 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
105 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_FRONT));
106 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
107 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_BACK));
108 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
109 rasterizer && rasterizer->base.flatshade_first);
110 }
111
112 void
113 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
114 struct mali_vertex_tiler_prefix *prefix,
115 union midgard_primitive_size *primitive_size)
116 {
117 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
118
119 if (!panfrost_writes_point_size(ctx)) {
120 bool points = prefix->draw_mode == MALI_POINTS;
121 float val = 0.0f;
122
123 if (rasterizer)
124 val = points ?
125 rasterizer->base.point_size :
126 rasterizer->base.line_width;
127
128 primitive_size->constant = val;
129 }
130 }
131
132 static void
133 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
134 struct mali_vertex_tiler_postfix *postfix)
135 {
136 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
137 if (ctx->occlusion_query)
138 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
139 else
140 postfix->occlusion_counter = 0;
141 }
142
143 void
144 panfrost_vt_init(struct panfrost_context *ctx,
145 enum pipe_shader_type stage,
146 struct mali_vertex_tiler_prefix *prefix,
147 struct mali_vertex_tiler_postfix *postfix)
148 {
149 struct panfrost_device *device = pan_device(ctx->base.screen);
150
151 if (!ctx->shader[stage])
152 return;
153
154 memset(prefix, 0, sizeof(*prefix));
155 memset(postfix, 0, sizeof(*postfix));
156
157 if (device->quirks & IS_BIFROST) {
158 postfix->gl_enables = 0x2;
159 panfrost_vt_emit_shared_memory(ctx, postfix);
160 } else {
161 postfix->gl_enables = 0x6;
162 panfrost_vt_attach_framebuffer(ctx, postfix);
163 }
164
165 if (stage == PIPE_SHADER_FRAGMENT) {
166 panfrost_vt_update_occlusion_query(ctx, postfix);
167 panfrost_vt_update_rasterizer(ctx, prefix, postfix);
168 }
169 }
170
171 static unsigned
172 panfrost_translate_index_size(unsigned size)
173 {
174 switch (size) {
175 case 1:
176 return MALI_DRAW_INDEXED_UINT8;
177
178 case 2:
179 return MALI_DRAW_INDEXED_UINT16;
180
181 case 4:
182 return MALI_DRAW_INDEXED_UINT32;
183
184 default:
185 unreachable("Invalid index size");
186 }
187 }
188
189 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
190 * good for the duration of the draw (transient), could last longer. Also get
191 * the bounds on the index buffer for the range accessed by the draw. We do
192 * these operations together because there are natural optimizations which
193 * require them to be together. */
194
195 static mali_ptr
196 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
197 const struct pipe_draw_info *info,
198 unsigned *min_index, unsigned *max_index)
199 {
200 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
201 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
202 off_t offset = info->start * info->index_size;
203 bool needs_indices = true;
204 mali_ptr out = 0;
205
206 if (info->max_index != ~0u) {
207 *min_index = info->min_index;
208 *max_index = info->max_index;
209 needs_indices = false;
210 }
211
212 if (!info->has_user_indices) {
213 /* Only resources can be directly mapped */
214 panfrost_batch_add_bo(batch, rsrc->bo,
215 PAN_BO_ACCESS_SHARED |
216 PAN_BO_ACCESS_READ |
217 PAN_BO_ACCESS_VERTEX_TILER);
218 out = rsrc->bo->gpu + offset;
219
220 /* Check the cache */
221 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
222 info->start,
223 info->count,
224 min_index,
225 max_index);
226 } else {
227 /* Otherwise, we need to upload to transient memory */
228 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
229 out = panfrost_upload_transient(batch, ibuf8 + offset,
230 info->count *
231 info->index_size);
232 }
233
234 if (needs_indices) {
235 /* Fallback */
236 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
237
238 if (!info->has_user_indices)
239 panfrost_minmax_cache_add(rsrc->index_cache,
240 info->start, info->count,
241 *min_index, *max_index);
242 }
243
244 return out;
245 }
246
247 void
248 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
249 const struct pipe_draw_info *info,
250 enum mali_draw_mode draw_mode,
251 struct mali_vertex_tiler_postfix *vertex_postfix,
252 struct mali_vertex_tiler_prefix *tiler_prefix,
253 struct mali_vertex_tiler_postfix *tiler_postfix,
254 unsigned *vertex_count,
255 unsigned *padded_count)
256 {
257 tiler_prefix->draw_mode = draw_mode;
258
259 unsigned draw_flags = 0;
260
261 if (panfrost_writes_point_size(ctx))
262 draw_flags |= MALI_DRAW_VARYING_SIZE;
263
264 if (info->primitive_restart)
265 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
266
267 /* These don't make much sense */
268
269 draw_flags |= 0x3000;
270
271 if (info->index_size) {
272 unsigned min_index = 0, max_index = 0;
273
274 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
275 info,
276 &min_index,
277 &max_index);
278
279 /* Use the corresponding values */
280 *vertex_count = max_index - min_index + 1;
281 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
282 tiler_prefix->offset_bias_correction = -min_index;
283 tiler_prefix->index_count = MALI_POSITIVE(info->count);
284 draw_flags |= panfrost_translate_index_size(info->index_size);
285 } else {
286 tiler_prefix->indices = 0;
287 *vertex_count = ctx->vertex_count;
288 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
289 tiler_prefix->offset_bias_correction = 0;
290 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
291 }
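/* Rough illustration of the indexed path above: indices {5, 6, 9} give
 * min_index = 5 and max_index = 9, so vertex_count = 9 - 5 + 1 = 5,
 * offset_start = 5 + index_bias, and offset_bias_correction = -5. */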
292
293 tiler_prefix->unknown_draw = draw_flags;
294
295 /* Encode the padded vertex count */
296
297 if (info->instance_count > 1) {
298 *padded_count = panfrost_padded_vertex_count(*vertex_count);
299
300 unsigned shift = __builtin_ctz(ctx->padded_count);
301 unsigned k = ctx->padded_count >> (shift + 1);
302
303 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
304 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
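/* Illustration: a padded count of 96 = 3 << 5 yields shift = 5 and
 * k = 1; the hardware presumably recombines these as (2k + 1) << shift. */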
305 } else {
306 *padded_count = *vertex_count;
307
308 /* Reset instancing state */
309 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
310 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
311 }
312 }
313
314 static void
315 panfrost_shader_meta_init(struct panfrost_context *ctx,
316 enum pipe_shader_type st,
317 struct mali_shader_meta *meta)
318 {
319 const struct panfrost_device *dev = pan_device(ctx->base.screen);
320 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
321
322 memset(meta, 0, sizeof(*meta));
323 meta->shader = (ss->bo ? ss->bo->gpu : 0) | ss->first_tag;
324 meta->attribute_count = ss->attribute_count;
325 meta->varying_count = ss->varying_count;
326 meta->texture_count = ctx->sampler_view_count[st];
327 meta->sampler_count = ctx->sampler_count[st];
328
329 if (dev->quirks & IS_BIFROST) {
330 meta->bifrost1.unk1 = 0x800200;
331 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
332 meta->bifrost2.preload_regs = 0xC0;
333 meta->bifrost2.uniform_count = MIN2(ss->uniform_count,
334 ss->uniform_cutoff);
335 } else {
336 meta->midgard1.uniform_count = MIN2(ss->uniform_count,
337 ss->uniform_cutoff);
338 meta->midgard1.work_count = ss->work_reg_count;
339 meta->midgard1.flags_hi = 0x8; /* XXX */
340 meta->midgard1.flags_lo = 0x220;
341 meta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
342 }
343
344 }
345
346 static unsigned
347 panfrost_translate_compare_func(enum pipe_compare_func in)
348 {
349 switch (in) {
350 case PIPE_FUNC_NEVER:
351 return MALI_FUNC_NEVER;
352
353 case PIPE_FUNC_LESS:
354 return MALI_FUNC_LESS;
355
356 case PIPE_FUNC_EQUAL:
357 return MALI_FUNC_EQUAL;
358
359 case PIPE_FUNC_LEQUAL:
360 return MALI_FUNC_LEQUAL;
361
362 case PIPE_FUNC_GREATER:
363 return MALI_FUNC_GREATER;
364
365 case PIPE_FUNC_NOTEQUAL:
366 return MALI_FUNC_NOTEQUAL;
367
368 case PIPE_FUNC_GEQUAL:
369 return MALI_FUNC_GEQUAL;
370
371 case PIPE_FUNC_ALWAYS:
372 return MALI_FUNC_ALWAYS;
373
374 default:
375 unreachable("Invalid func");
376 }
377 }
378
379 static unsigned
380 panfrost_translate_stencil_op(enum pipe_stencil_op in)
381 {
382 switch (in) {
383 case PIPE_STENCIL_OP_KEEP:
384 return MALI_STENCIL_KEEP;
385
386 case PIPE_STENCIL_OP_ZERO:
387 return MALI_STENCIL_ZERO;
388
389 case PIPE_STENCIL_OP_REPLACE:
390 return MALI_STENCIL_REPLACE;
391
392 case PIPE_STENCIL_OP_INCR:
393 return MALI_STENCIL_INCR;
394
395 case PIPE_STENCIL_OP_DECR:
396 return MALI_STENCIL_DECR;
397
398 case PIPE_STENCIL_OP_INCR_WRAP:
399 return MALI_STENCIL_INCR_WRAP;
400
401 case PIPE_STENCIL_OP_DECR_WRAP:
402 return MALI_STENCIL_DECR_WRAP;
403
404 case PIPE_STENCIL_OP_INVERT:
405 return MALI_STENCIL_INVERT;
406
407 default:
408 unreachable("Invalid stencil op");
409 }
410 }
411
412 static unsigned
413 translate_tex_wrap(enum pipe_tex_wrap w)
414 {
415 switch (w) {
416 case PIPE_TEX_WRAP_REPEAT:
417 return MALI_WRAP_REPEAT;
418
419 case PIPE_TEX_WRAP_CLAMP:
420 return MALI_WRAP_CLAMP;
421
422 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
423 return MALI_WRAP_CLAMP_TO_EDGE;
424
425 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
426 return MALI_WRAP_CLAMP_TO_BORDER;
427
428 case PIPE_TEX_WRAP_MIRROR_REPEAT:
429 return MALI_WRAP_MIRRORED_REPEAT;
430
431 case PIPE_TEX_WRAP_MIRROR_CLAMP:
432 return MALI_WRAP_MIRRORED_CLAMP;
433
434 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
435 return MALI_WRAP_MIRRORED_CLAMP_TO_EDGE;
436
437 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
438 return MALI_WRAP_MIRRORED_CLAMP_TO_BORDER;
439
440 default:
441 unreachable("Invalid wrap");
442 }
443 }
444
445 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
446 struct mali_sampler_descriptor *hw)
447 {
448 unsigned func = panfrost_translate_compare_func(cso->compare_func);
449 bool min_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
450 bool mag_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
451 bool mip_linear = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR;
452 unsigned min_filter = min_nearest ? MALI_SAMP_MIN_NEAREST : 0;
453 unsigned mag_filter = mag_nearest ? MALI_SAMP_MAG_NEAREST : 0;
454 unsigned mip_filter = mip_linear ?
455 (MALI_SAMP_MIP_LINEAR_1 | MALI_SAMP_MIP_LINEAR_2) : 0;
456 unsigned normalized = cso->normalized_coords ? MALI_SAMP_NORM_COORDS : 0;
457
458 *hw = (struct mali_sampler_descriptor) {
459 .filter_mode = min_filter | mag_filter | mip_filter |
460 normalized,
461 .wrap_s = translate_tex_wrap(cso->wrap_s),
462 .wrap_t = translate_tex_wrap(cso->wrap_t),
463 .wrap_r = translate_tex_wrap(cso->wrap_r),
464 .compare_func = panfrost_flip_compare_func(func),
465 .border_color = {
466 cso->border_color.f[0],
467 cso->border_color.f[1],
468 cso->border_color.f[2],
469 cso->border_color.f[3]
470 },
471 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
472 .max_lod = FIXED_16(cso->max_lod, false),
473 .lod_bias = FIXED_16(cso->lod_bias, true), /* can be negative */
474 .seamless_cube_map = cso->seamless_cube_map,
475 };
476
477 /* If necessary, we disable mipmapping in the sampler descriptor by
478 * clamping the LOD as tight as possible (from 0 to epsilon,
479 * essentially -- remember these are fixed point numbers, so
480 * epsilon=1/256) */
481
482 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
483 hw->max_lod = hw->min_lod + 1;
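/* A minimal illustration: with min_lod = 0, max_lod becomes 1 in this
 * fixed-point encoding, i.e. 1/256 of a mip level, so effectively only
 * the base level is ever sampled. */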
484 }
485
486 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
487 struct bifrost_sampler_descriptor *hw)
488 {
489 *hw = (struct bifrost_sampler_descriptor) {
490 .unk1 = 0x1,
491 .wrap_s = translate_tex_wrap(cso->wrap_s),
492 .wrap_t = translate_tex_wrap(cso->wrap_t),
493 .wrap_r = translate_tex_wrap(cso->wrap_r),
494 .unk8 = 0x8,
495 .min_filter = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST,
496 .norm_coords = cso->normalized_coords,
497 .mip_filter = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR,
498 .mag_filter = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR,
499 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
500 .max_lod = FIXED_16(cso->max_lod, false),
501 };
502
503 /* If necessary, we disable mipmapping in the sampler descriptor by
504 * clamping the LOD as tight as possible (from 0 to epsilon,
505 * essentially -- remember these are fixed point numbers, so
506 * epsilon=1/256) */
507
508 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
509 hw->max_lod = hw->min_lod + 1;
510 }
511
512 static void
513 panfrost_make_stencil_state(const struct pipe_stencil_state *in,
514 struct mali_stencil_test *out)
515 {
516 out->ref = 0; /* Gallium gets it from elsewhere */
517
518 out->mask = in->valuemask;
519 out->func = panfrost_translate_compare_func(in->func);
520 out->sfail = panfrost_translate_stencil_op(in->fail_op);
521 out->dpfail = panfrost_translate_stencil_op(in->zfail_op);
522 out->dppass = panfrost_translate_stencil_op(in->zpass_op);
523 }
524
525 static void
526 panfrost_frag_meta_rasterizer_update(struct panfrost_context *ctx,
527 struct mali_shader_meta *fragmeta)
528 {
529 if (!ctx->rasterizer) {
530 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, true);
531 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, false);
532 fragmeta->depth_units = 0.0f;
533 fragmeta->depth_factor = 0.0f;
534 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, false);
535 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, false);
536 return;
537 }
538
539 bool msaa = ctx->rasterizer->base.multisample;
540
541 /* TODO: Sample size */
542 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
543 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
544 fragmeta->depth_units = ctx->rasterizer->base.offset_units * 2.0f;
545 fragmeta->depth_factor = ctx->rasterizer->base.offset_scale;
546
547 /* XXX: Which bit is which? Does this maybe allow offsetting not-tri? */
548
549 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A,
550 ctx->rasterizer->base.offset_tri);
551 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B,
552 ctx->rasterizer->base.offset_tri);
553 }
554
555 static void
556 panfrost_frag_meta_zsa_update(struct panfrost_context *ctx,
557 struct mali_shader_meta *fragmeta)
558 {
559 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
560 int zfunc = PIPE_FUNC_ALWAYS;
561
562 if (!zsa) {
563 struct pipe_stencil_state default_stencil = {
564 .enabled = 0,
565 .func = PIPE_FUNC_ALWAYS,
566 .fail_op = MALI_STENCIL_KEEP,
567 .zfail_op = MALI_STENCIL_KEEP,
568 .zpass_op = MALI_STENCIL_KEEP,
569 .writemask = 0xFF,
570 .valuemask = 0xFF
571 };
572
573 panfrost_make_stencil_state(&default_stencil,
574 &fragmeta->stencil_front);
575 fragmeta->stencil_mask_front = default_stencil.writemask;
576 fragmeta->stencil_back = fragmeta->stencil_front;
577 fragmeta->stencil_mask_back = default_stencil.writemask;
578 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST, false);
579 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK, false);
580 } else {
581 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
582 zsa->stencil[0].enabled);
583 panfrost_make_stencil_state(&zsa->stencil[0],
584 &fragmeta->stencil_front);
585 fragmeta->stencil_mask_front = zsa->stencil[0].writemask;
586 fragmeta->stencil_front.ref = ctx->stencil_ref.ref_value[0];
587
588 /* If back-stencil is not enabled, use the front values */
589
590 if (zsa->stencil[1].enabled) {
591 panfrost_make_stencil_state(&zsa->stencil[1],
592 &fragmeta->stencil_back);
593 fragmeta->stencil_mask_back = zsa->stencil[1].writemask;
594 fragmeta->stencil_back.ref = ctx->stencil_ref.ref_value[1];
595 } else {
596 fragmeta->stencil_back = fragmeta->stencil_front;
597 fragmeta->stencil_mask_back = fragmeta->stencil_mask_front;
598 fragmeta->stencil_back.ref = fragmeta->stencil_front.ref;
599 }
600
601 if (zsa->depth.enabled)
602 zfunc = zsa->depth.func;
603
604 /* Depth state (TODO: Refactor) */
605
606 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
607 zsa->depth.writemask);
608 }
609
610 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
611 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(zfunc));
612 }
613
614 static void
615 panfrost_frag_meta_blend_update(struct panfrost_context *ctx,
616 struct mali_shader_meta *fragmeta,
617 void *rts)
618 {
619 const struct panfrost_device *dev = pan_device(ctx->base.screen);
620
621 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
622 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
623 !ctx->blend->base.dither);
624
625 /* Get blending setup */
626 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
627
628 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
629 unsigned shader_offset = 0;
630 struct panfrost_bo *shader_bo = NULL;
631
632 for (unsigned c = 0; c < rt_count; ++c)
633 blend[c] = panfrost_get_blend_for_context(ctx, c, &shader_bo,
634 &shader_offset);
635
636 /* If there is a blend shader, work registers are shared. XXX: opt */
637
638 for (unsigned c = 0; c < rt_count; ++c) {
639 if (blend[c].is_shader)
640 fragmeta->midgard1.work_count = 16;
641 }
642
643 /* Even on MFBD, the shader descriptor gets blend shaders. It's *also*
644 * copied to the blend_meta appended (by convention), but this is the
645 * field actually read by the hardware. (Or maybe both are read...?).
646 * Specify the last RTi with a blend shader. */
647
648 fragmeta->blend.shader = 0;
649
650 for (signed rt = (rt_count - 1); rt >= 0; --rt) {
651 if (!blend[rt].is_shader)
652 continue;
653
654 fragmeta->blend.shader = blend[rt].shader.gpu |
655 blend[rt].shader.first_tag;
656 break;
657 }
658
659 if (dev->quirks & MIDGARD_SFBD) {
660 /* On platforms with only a single render target (SFBD), the blend
661 * information is inside the shader meta itself. We additionally
662 * need to signal CAN_DISCARD for nontrivial blend modes (so
663 * we're able to read back the destination buffer) */
664
665 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
666 blend[0].is_shader);
667
668 if (!blend[0].is_shader) {
669 fragmeta->blend.equation = *blend[0].equation.equation;
670 fragmeta->blend.constant = blend[0].equation.constant;
671 }
672
673 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
674 !blend[0].no_blending);
675 return;
676 }
677
678 /* Additional blend descriptor tacked on for jobs using MFBD */
679
680 for (unsigned i = 0; i < rt_count; ++i) {
681 if (dev->quirks & IS_BIFROST) {
682 struct bifrost_blend_rt *brts = rts;
683 struct panfrost_shader_state *fs;
684 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
685
686 brts[i].flags = 0x200;
687 if (blend[i].is_shader) {
688 /* The blend shader's address needs to be at
689 * the same top 32 bits as the fragment shader.
690 * TODO: Ensure that's always the case.
691 */
692 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
693 (fs->bo->gpu & (0xffffffffull << 32)));
694 brts[i].shader = blend[i].shader.gpu;
695 brts[i].unk2 = 0x0;
696 } else {
697 enum pipe_format format = ctx->pipe_framebuffer.cbufs[i]->format;
698 const struct util_format_description *format_desc;
699 format_desc = util_format_description(format);
700
701 brts[i].equation = *blend[i].equation.equation;
702
703 /* TODO: this is a bit more complicated */
704 brts[i].constant = blend[i].equation.constant;
705
706 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
707 brts[i].unk2 = 0x19;
708
709 brts[i].shader_type = fs->blend_types[i];
710 }
711 } else {
712 struct midgard_blend_rt *mrts = rts;
713
714 mrts[i].flags = 0x200;
715
716 bool is_srgb = (ctx->pipe_framebuffer.nr_cbufs > i) &&
717 (ctx->pipe_framebuffer.cbufs[i]) &&
718 util_format_is_srgb(ctx->pipe_framebuffer.cbufs[i]->format);
719
720 SET_BIT(mrts[i].flags, MALI_BLEND_MRT_SHADER, blend[i].is_shader);
721 SET_BIT(mrts[i].flags, MALI_BLEND_LOAD_TIB, !blend[i].no_blending);
722 SET_BIT(mrts[i].flags, MALI_BLEND_SRGB, is_srgb);
723 SET_BIT(mrts[i].flags, MALI_BLEND_NO_DITHER, !ctx->blend->base.dither);
724
725 if (blend[i].is_shader) {
726 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
727 } else {
728 mrts[i].blend.equation = *blend[i].equation.equation;
729 mrts[i].blend.constant = blend[i].equation.constant;
730 }
731 }
732 }
733 }
734
735 static void
736 panfrost_frag_shader_meta_init(struct panfrost_context *ctx,
737 struct mali_shader_meta *fragmeta,
738 void *rts)
739 {
740 const struct panfrost_device *dev = pan_device(ctx->base.screen);
741 struct panfrost_shader_state *fs;
742
743 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
744
745 fragmeta->alpha_coverage = ~MALI_ALPHA_COVERAGE(0.000000);
746 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x3010;
747 fragmeta->unknown2_4 = 0x4e0;
748
749 /* unknown2_4 has 0x10 bit set on T6XX and T720. We don't know why this
750 * is required (independent of 32-bit/64-bit descriptors), or why it's
751 * not used on later GPU revisions. Otherwise, all shader jobs fault on
752 * these earlier chips (perhaps this is a chicken bit of some kind).
753 * More investigation is needed. */
754
755 SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
756
757 /* Depending on whether it's legal in the given shader, we try to
758 * enable early-z testing (or forward-pixel kill?) */
759
760 SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
761 !fs->can_discard && !fs->writes_depth);
762
763 /* Add the writes Z/S flags if needed. */
764 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
765 SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
766
767 /* Any time texturing is used, derivatives are implicitly calculated,
768 * so we need to enable helper invocations */
769
770 SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
771 fs->helper_invocations);
772
773 /* CAN_DISCARD should be set if the fragment shader possibly contains a
774 * 'discard' instruction. This is likely related to forward-pixel kill
775 * optimizations, as per "Mali Performance 3: Is
776 * EGL_BUFFER_PRESERVED a good thing?" by Peter Harris */
777
778 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD, fs->can_discard);
779 SET_BIT(fragmeta->midgard1.flags_lo, 0x400, fs->can_discard);
780
781 panfrost_frag_meta_rasterizer_update(ctx, fragmeta);
782 panfrost_frag_meta_zsa_update(ctx, fragmeta);
783 panfrost_frag_meta_blend_update(ctx, fragmeta, rts);
784 }
785
786 void
787 panfrost_emit_shader_meta(struct panfrost_batch *batch,
788 enum pipe_shader_type st,
789 struct mali_vertex_tiler_postfix *postfix)
790 {
791 struct panfrost_context *ctx = batch->ctx;
792 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
793
794 if (!ss) {
795 postfix->shader = 0;
796 return;
797 }
798
799 struct mali_shader_meta meta;
800
801 panfrost_shader_meta_init(ctx, st, &meta);
802
803 /* Add the shader BO to the batch. */
804 panfrost_batch_add_bo(batch, ss->bo,
805 PAN_BO_ACCESS_PRIVATE |
806 PAN_BO_ACCESS_READ |
807 panfrost_bo_access_for_stage(st));
808
809 mali_ptr shader_ptr;
810
811 if (st == PIPE_SHADER_FRAGMENT) {
812 struct panfrost_device *dev = pan_device(ctx->base.screen);
813 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
814 size_t desc_size = sizeof(meta);
815 void *rts = NULL;
816 struct panfrost_transfer xfer;
817 unsigned rt_size;
818
819 if (dev->quirks & MIDGARD_SFBD)
820 rt_size = 0;
821 else if (dev->quirks & IS_BIFROST)
822 rt_size = sizeof(struct bifrost_blend_rt);
823 else
824 rt_size = sizeof(struct midgard_blend_rt);
825
826 desc_size += rt_size * rt_count;
827
828 if (rt_size)
829 rts = rzalloc_size(ctx, rt_size * rt_count);
830
831 panfrost_frag_shader_meta_init(ctx, &meta, rts);
832
833 xfer = panfrost_allocate_transient(batch, desc_size);
834
835 memcpy(xfer.cpu, &meta, sizeof(meta));
836 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
837
838 if (rt_size)
839 ralloc_free(rts);
840
841 shader_ptr = xfer.gpu;
842 } else {
843 shader_ptr = panfrost_upload_transient(batch, &meta,
844 sizeof(meta));
845 }
846
847 postfix->shader = shader_ptr;
848 }
849
850 static void
851 panfrost_mali_viewport_init(struct panfrost_context *ctx,
852 struct mali_viewport *mvp)
853 {
854 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
855
856 /* Clip bounds are encoded as floats. The viewport itself is encoded as
857 * (somewhat) asymmetric ints. */
858
859 const struct pipe_scissor_state *ss = &ctx->scissor;
860
861 memset(mvp, 0, sizeof(*mvp));
862
863 /* By default, do no viewport clipping, i.e. clip to (-inf, inf) in
864 * each direction. Clipping to the viewport in theory should work, but
865 * in practice causes issues when we're not explicitly trying to
866 * scissor */
867
868 *mvp = (struct mali_viewport) {
869 .clip_minx = -INFINITY,
870 .clip_miny = -INFINITY,
871 .clip_maxx = INFINITY,
872 .clip_maxy = INFINITY,
873 };
874
875 /* Always scissor to the viewport by default. */
876 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
877 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
878
879 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
880 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
881
882 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
883 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
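/* For example, a viewport covering 0..800 x 0..600 has
 * translate = (400, 300) and scale = (400, 300), giving
 * vp_minx/vp_maxx = 0/800 and vp_miny/vp_maxy = 0/600. */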
884
885 /* Apply the scissor test */
886
887 unsigned minx, miny, maxx, maxy;
888
889 if (ss && ctx->rasterizer && ctx->rasterizer->base.scissor) {
890 minx = MAX2(ss->minx, vp_minx);
891 miny = MAX2(ss->miny, vp_miny);
892 maxx = MIN2(ss->maxx, vp_maxx);
893 maxy = MIN2(ss->maxy, vp_maxy);
894 } else {
895 minx = vp_minx;
896 miny = vp_miny;
897 maxx = vp_maxx;
898 maxy = vp_maxy;
899 }
900
901 /* Hardware needs the min/max to be strictly ordered, so flip if we
902 * need to. The viewport transformation in the vertex shader will
903 * handle the negatives if we don't */
904
905 if (miny > maxy) {
906 unsigned temp = miny;
907 miny = maxy;
908 maxy = temp;
909 }
910
911 if (minx > maxx) {
912 unsigned temp = minx;
913 minx = maxx;
914 maxx = temp;
915 }
916
917 if (minz > maxz) {
918 float temp = minz;
919 minz = maxz;
920 maxz = temp;
921 }
922
923 /* Clamp to the framebuffer size as a last check */
924
925 minx = MIN2(ctx->pipe_framebuffer.width, minx);
926 maxx = MIN2(ctx->pipe_framebuffer.width, maxx);
927
928 miny = MIN2(ctx->pipe_framebuffer.height, miny);
929 maxy = MIN2(ctx->pipe_framebuffer.height, maxy);
930
931 /* Upload */
932
933 mvp->viewport0[0] = minx;
934 mvp->viewport1[0] = MALI_POSITIVE(maxx);
935
936 mvp->viewport0[1] = miny;
937 mvp->viewport1[1] = MALI_POSITIVE(maxy);
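/* viewport1 holds the inclusive maximum (MALI_POSITIVE stores n - 1),
 * hence the + 1 when the batch scissor is unioned in
 * panfrost_emit_viewport below. */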
938
939 mvp->clip_minz = minz;
940 mvp->clip_maxz = maxz;
941 }
942
943 void
944 panfrost_emit_viewport(struct panfrost_batch *batch,
945 struct mali_vertex_tiler_postfix *tiler_postfix)
946 {
947 struct panfrost_context *ctx = batch->ctx;
948 struct mali_viewport mvp;
949
950 panfrost_mali_viewport_init(batch->ctx, &mvp);
951
952 /* Update the job, unless we're doing wallpapering (whose lack of
953 * scissor we can ignore, since if we "miss" a tile of wallpaper, it'll
954 * just... be faster :) */
955
956 if (!ctx->wallpaper_batch)
957 panfrost_batch_union_scissor(batch, mvp.viewport0[0],
958 mvp.viewport0[1],
959 mvp.viewport1[0] + 1,
960 mvp.viewport1[1] + 1);
961
962 tiler_postfix->viewport = panfrost_upload_transient(batch, &mvp,
963 sizeof(mvp));
964 }
965
966 static mali_ptr
967 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
968 enum pipe_shader_type st,
969 struct panfrost_constant_buffer *buf,
970 unsigned index)
971 {
972 struct pipe_constant_buffer *cb = &buf->cb[index];
973 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
974
975 if (rsrc) {
976 panfrost_batch_add_bo(batch, rsrc->bo,
977 PAN_BO_ACCESS_SHARED |
978 PAN_BO_ACCESS_READ |
979 panfrost_bo_access_for_stage(st));
980
981 /* Alignment guaranteed by
982 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
983 return rsrc->bo->gpu + cb->buffer_offset;
984 } else if (cb->user_buffer) {
985 return panfrost_upload_transient(batch,
986 cb->user_buffer +
987 cb->buffer_offset,
988 cb->buffer_size);
989 } else {
990 unreachable("No constant buffer");
991 }
992 }
993
994 struct sysval_uniform {
995 union {
996 float f[4];
997 int32_t i[4];
998 uint32_t u[4];
999 uint64_t du[2];
1000 };
1001 };
1002
1003 static void
1004 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
1005 struct sysval_uniform *uniform)
1006 {
1007 struct panfrost_context *ctx = batch->ctx;
1008 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1009
1010 uniform->f[0] = vp->scale[0];
1011 uniform->f[1] = vp->scale[1];
1012 uniform->f[2] = vp->scale[2];
1013 }
1014
1015 static void
1016 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
1017 struct sysval_uniform *uniform)
1018 {
1019 struct panfrost_context *ctx = batch->ctx;
1020 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1021
1022 uniform->f[0] = vp->translate[0];
1023 uniform->f[1] = vp->translate[1];
1024 uniform->f[2] = vp->translate[2];
1025 }
1026
1027 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
1028 enum pipe_shader_type st,
1029 unsigned int sysvalid,
1030 struct sysval_uniform *uniform)
1031 {
1032 struct panfrost_context *ctx = batch->ctx;
1033 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
1034 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
1035 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
1036 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
1037
1038 assert(dim);
1039 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
1040
1041 if (dim > 1)
1042 uniform->i[1] = u_minify(tex->texture->height0,
1043 tex->u.tex.first_level);
1044
1045 if (dim > 2)
1046 uniform->i[2] = u_minify(tex->texture->depth0,
1047 tex->u.tex.first_level);
1048
1049 if (is_array)
1050 uniform->i[dim] = tex->texture->array_size;
1051 }
1052
1053 static void
1054 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
1055 enum pipe_shader_type st,
1056 unsigned ssbo_id,
1057 struct sysval_uniform *uniform)
1058 {
1059 struct panfrost_context *ctx = batch->ctx;
1060
1061 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
1062 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
1063
1064 /* Compute address */
1065 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
1066
1067 panfrost_batch_add_bo(batch, bo,
1068 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
1069 panfrost_bo_access_for_stage(st));
1070
1071 /* Upload address and size as sysval */
1072 uniform->du[0] = bo->gpu + sb.buffer_offset;
1073 uniform->u[2] = sb.buffer_size;
1074 }
1075
1076 static void
1077 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1078 enum pipe_shader_type st,
1079 unsigned samp_idx,
1080 struct sysval_uniform *uniform)
1081 {
1082 struct panfrost_context *ctx = batch->ctx;
1083 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1084
1085 uniform->f[0] = sampl->min_lod;
1086 uniform->f[1] = sampl->max_lod;
1087 uniform->f[2] = sampl->lod_bias;
1088
1089 /* Even without any errata, Midgard represents "no mipmapping" as
1090 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1091 * panfrost_create_sampler_state which also explains our choice of
1092 * epsilon value (again to keep behaviour consistent) */
1093
1094 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1095 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1096 }
1097
1098 static void
1099 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1100 struct sysval_uniform *uniform)
1101 {
1102 struct panfrost_context *ctx = batch->ctx;
1103
1104 uniform->u[0] = ctx->compute_grid->grid[0];
1105 uniform->u[1] = ctx->compute_grid->grid[1];
1106 uniform->u[2] = ctx->compute_grid->grid[2];
1107 }
1108
1109 static void
1110 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1111 struct panfrost_shader_state *ss,
1112 enum pipe_shader_type st)
1113 {
1114 struct sysval_uniform *uniforms = (void *)buf;
1115
1116 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1117 int sysval = ss->sysval[i];
1118
1119 switch (PAN_SYSVAL_TYPE(sysval)) {
1120 case PAN_SYSVAL_VIEWPORT_SCALE:
1121 panfrost_upload_viewport_scale_sysval(batch,
1122 &uniforms[i]);
1123 break;
1124 case PAN_SYSVAL_VIEWPORT_OFFSET:
1125 panfrost_upload_viewport_offset_sysval(batch,
1126 &uniforms[i]);
1127 break;
1128 case PAN_SYSVAL_TEXTURE_SIZE:
1129 panfrost_upload_txs_sysval(batch, st,
1130 PAN_SYSVAL_ID(sysval),
1131 &uniforms[i]);
1132 break;
1133 case PAN_SYSVAL_SSBO:
1134 panfrost_upload_ssbo_sysval(batch, st,
1135 PAN_SYSVAL_ID(sysval),
1136 &uniforms[i]);
1137 break;
1138 case PAN_SYSVAL_NUM_WORK_GROUPS:
1139 panfrost_upload_num_work_groups_sysval(batch,
1140 &uniforms[i]);
1141 break;
1142 case PAN_SYSVAL_SAMPLER:
1143 panfrost_upload_sampler_sysval(batch, st,
1144 PAN_SYSVAL_ID(sysval),
1145 &uniforms[i]);
1146 break;
1147 default:
1148 assert(0);
1149 }
1150 }
1151 }
1152
1153 static const void *
1154 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1155 unsigned index)
1156 {
1157 struct pipe_constant_buffer *cb = &buf->cb[index];
1158 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1159
1160 if (rsrc)
1161 return rsrc->bo->cpu;
1162 else if (cb->user_buffer)
1163 return cb->user_buffer;
1164 else
1165 unreachable("No constant buffer");
1166 }
1167
1168 void
1169 panfrost_emit_const_buf(struct panfrost_batch *batch,
1170 enum pipe_shader_type stage,
1171 struct mali_vertex_tiler_postfix *postfix)
1172 {
1173 struct panfrost_context *ctx = batch->ctx;
1174 struct panfrost_shader_variants *all = ctx->shader[stage];
1175
1176 if (!all)
1177 return;
1178
1179 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1180
1181 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1182
1183 /* Uniforms are implicitly UBO #0 */
1184 bool has_uniforms = buf->enabled_mask & (1 << 0);
1185
1186 /* Allocate room for the sysvals and the uniforms */
1187 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1188 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1189 size_t size = sys_size + uniform_size;
1190 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1191 size);
1192
1193 /* Upload sysvals requested by the shader */
1194 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1195
1196 /* Upload uniforms */
1197 if (has_uniforms && uniform_size) {
1198 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1199 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1200 }
1201
1202 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1203 * uploaded */
1204
1205 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1206 assert(ubo_count >= 1);
1207
1208 size_t sz = sizeof(uint64_t) * ubo_count;
1209 uint64_t ubos[PAN_MAX_CONST_BUFFERS];
1210 int uniform_count = ss->uniform_count;
1211
1212 /* Upload uniforms as a UBO */
1213 ubos[0] = MALI_MAKE_UBO(2 + uniform_count, transfer.gpu);
1214
1215 /* The rest are honest-to-goodness UBOs */
1216
1217 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1218 size_t usz = buf->cb[ubo].buffer_size;
1219 bool enabled = buf->enabled_mask & (1 << ubo);
1220 bool empty = usz == 0;
1221
1222 if (!enabled || empty) {
1223 /* Stub out disabled UBOs to catch accesses */
1224 ubos[ubo] = MALI_MAKE_UBO(0, 0xDEAD0000);
1225 continue;
1226 }
1227
1228 mali_ptr gpu = panfrost_map_constant_buffer_gpu(batch, stage,
1229 buf, ubo);
1230
1231 unsigned bytes_per_field = 16;
1232 unsigned aligned = ALIGN_POT(usz, bytes_per_field);
1233 ubos[ubo] = MALI_MAKE_UBO(aligned / bytes_per_field, gpu);
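/* E.g. a 100-byte UBO is padded up to 112 bytes = 7 fields of 16 bytes,
 * so it is encoded as MALI_MAKE_UBO(7, gpu). */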
1234 }
1235
1236 mali_ptr ubufs = panfrost_upload_transient(batch, ubos, sz);
1237 postfix->uniforms = transfer.gpu;
1238 postfix->uniform_buffers = ubufs;
1239
1240 buf->dirty_mask = 0;
1241 }
1242
1243 void
1244 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1245 const struct pipe_grid_info *info,
1246 struct midgard_payload_vertex_tiler *vtp)
1247 {
1248 struct panfrost_context *ctx = batch->ctx;
1249 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1250 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1251 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1252 128));
1253 unsigned shared_size = single_size * info->grid[0] * info->grid[1] *
1254 info->grid[2] * 4;
1255 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1256 shared_size,
1257 1);
1258
1259 struct mali_shared_memory shared = {
1260 .shared_memory = bo->gpu,
1261 .shared_workgroup_count =
1262 util_logbase2_ceil(info->grid[0]) +
1263 util_logbase2_ceil(info->grid[1]) +
1264 util_logbase2_ceil(info->grid[2]),
1265 .shared_unk1 = 0x2,
1266 .shared_shift = util_logbase2(single_size) - 1
1267 };
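/* Sizing illustration: ss->shared_size = 200 rounds up to
 * single_size = 256; a 4x4x1 grid then allocates 256 * 16 * 4 = 16384
 * bytes, with shared_workgroup_count = 2 + 2 + 0 = 4 and
 * shared_shift = 7. */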
1268
1269 vtp->postfix.shared_memory = panfrost_upload_transient(batch, &shared,
1270 sizeof(shared));
1271 }
1272
1273 static mali_ptr
1274 panfrost_get_tex_desc(struct panfrost_batch *batch,
1275 enum pipe_shader_type st,
1276 struct panfrost_sampler_view *view)
1277 {
1278 if (!view)
1279 return (mali_ptr) 0;
1280
1281 struct pipe_sampler_view *pview = &view->base;
1282 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1283
1284 /* Add the BO to the job so it's retained until the job is done. */
1285
1286 panfrost_batch_add_bo(batch, rsrc->bo,
1287 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1288 panfrost_bo_access_for_stage(st));
1289
1290 panfrost_batch_add_bo(batch, view->midgard_bo,
1291 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1292 panfrost_bo_access_for_stage(st));
1293
1294 return view->midgard_bo->gpu;
1295 }
1296
1297 void
1298 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1299 enum pipe_shader_type stage,
1300 struct mali_vertex_tiler_postfix *postfix)
1301 {
1302 struct panfrost_context *ctx = batch->ctx;
1303 struct panfrost_device *device = pan_device(ctx->base.screen);
1304
1305 if (!ctx->sampler_view_count[stage])
1306 return;
1307
1308 if (device->quirks & IS_BIFROST) {
1309 struct bifrost_texture_descriptor *descriptors;
1310
1311 descriptors = malloc(sizeof(struct bifrost_texture_descriptor) *
1312 ctx->sampler_view_count[stage]);
1313
1314 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1315 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1316 struct pipe_sampler_view *pview = &view->base;
1317 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1318
1319 panfrost_batch_add_bo(batch, rsrc->bo,
1320 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1321 panfrost_bo_access_for_stage(stage));
1322
1323 memcpy(&descriptors[i], view->bifrost_descriptor, sizeof(*view->bifrost_descriptor));
1324 }
1325
1326 postfix->textures = panfrost_upload_transient(batch,
1327 descriptors,
1328 sizeof(struct bifrost_texture_descriptor) *
1329 ctx->sampler_view_count[stage]);
1330
1331 free(descriptors);
1332 } else {
1333 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1334
1335 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i)
1336 trampolines[i] = panfrost_get_tex_desc(batch, stage,
1337 ctx->sampler_views[stage][i]);
1338
1339 postfix->textures = panfrost_upload_transient(batch,
1340 trampolines,
1341 sizeof(uint64_t) *
1342 ctx->sampler_view_count[stage]);
1343 }
1344 }
1345
1346 void
1347 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1348 enum pipe_shader_type stage,
1349 struct mali_vertex_tiler_postfix *postfix)
1350 {
1351 struct panfrost_context *ctx = batch->ctx;
1352 struct panfrost_device *device = pan_device(ctx->base.screen);
1353
1354 if (!ctx->sampler_count[stage])
1355 return;
1356
1357 if (device->quirks & IS_BIFROST) {
1358 size_t desc_size = sizeof(struct bifrost_sampler_descriptor);
1359 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1360 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1361 transfer_size);
1362 struct bifrost_sampler_descriptor *desc = (struct bifrost_sampler_descriptor *)transfer.cpu;
1363
1364 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1365 desc[i] = ctx->samplers[stage][i]->bifrost_hw;
1366
1367 postfix->sampler_descriptor = transfer.gpu;
1368 } else {
1369 size_t desc_size = sizeof(struct mali_sampler_descriptor);
1370 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1371 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1372 transfer_size);
1373 struct mali_sampler_descriptor *desc = (struct mali_sampler_descriptor *)transfer.cpu;
1374
1375 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1376 desc[i] = ctx->samplers[stage][i]->midgard_hw;
1377
1378 postfix->sampler_descriptor = transfer.gpu;
1379 }
1380 }
1381
1382 void
1383 panfrost_emit_vertex_attr_meta(struct panfrost_batch *batch,
1384 struct mali_vertex_tiler_postfix *vertex_postfix)
1385 {
1386 struct panfrost_context *ctx = batch->ctx;
1387
1388 if (!ctx->vertex)
1389 return;
1390
1391 struct panfrost_vertex_state *so = ctx->vertex;
1392
1393 panfrost_vertex_state_upd_attr_offs(ctx, vertex_postfix);
1394 vertex_postfix->attribute_meta = panfrost_upload_transient(batch, so->hw,
1395 sizeof(*so->hw) *
1396 PAN_MAX_ATTRIBUTE);
1397 }
1398
1399 void
1400 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1401 struct mali_vertex_tiler_postfix *vertex_postfix)
1402 {
1403 struct panfrost_context *ctx = batch->ctx;
1404 struct panfrost_vertex_state *so = ctx->vertex;
1405
1406 /* Staged mali_attr, and index into them. i =/= k, depending on the
1407 * vertex buffer mask and instancing. Twice as much room is allocated,
1408 * for a worst case of NPOT_DIVIDEs which take up an extra slot */
1409 union mali_attr attrs[PIPE_MAX_ATTRIBS * 2];
1410 unsigned k = 0;
1411
1412 for (unsigned i = 0; i < so->num_elements; ++i) {
1413 /* We map a mali_attr to be 1:1 with the mali_attr_meta, which
1414 * means duplicating some vertex buffers (who cares? aside from
1415 * maybe some caching implications but I somehow doubt that
1416 * matters) */
1417
1418 struct pipe_vertex_element *elem = &so->pipe[i];
1419 unsigned vbi = elem->vertex_buffer_index;
1420
1421 /* The exception to 1:1 mapping is that we can have multiple
1422 * entries (NPOT divisors), so we fixup anyways */
1423
1424 so->hw[i].index = k;
1425
1426 if (!(ctx->vb_mask & (1 << vbi)))
1427 continue;
1428
1429 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1430 struct panfrost_resource *rsrc;
1431
1432 rsrc = pan_resource(buf->buffer.resource);
1433 if (!rsrc)
1434 continue;
1435
1436 /* Align to 64 bytes by masking off the lower bits. This
1437 * will be adjusted back when we fixup the src_offset in
1438 * mali_attr_meta */
1439
1440 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1441 mali_ptr addr = raw_addr & ~63;
1442 unsigned chopped_addr = raw_addr - addr;
1443
1444 /* Add a dependency of the batch on the vertex buffer */
1445 panfrost_batch_add_bo(batch, rsrc->bo,
1446 PAN_BO_ACCESS_SHARED |
1447 PAN_BO_ACCESS_READ |
1448 PAN_BO_ACCESS_VERTEX_TILER);
1449
1450 /* Set common fields */
1451 attrs[k].elements = addr;
1452 attrs[k].stride = buf->stride;
1453
1454 /* Since we advanced the base pointer, we shrink the buffer
1455 * size */
1456 attrs[k].size = rsrc->base.width0 - buf->buffer_offset;
1457
1458 /* We need to add the extra size we masked off (for
1459 * correctness) so the data doesn't get clamped away */
1460 attrs[k].size += chopped_addr;
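/* E.g. raw_addr = base + 0x1034 gives addr = base + 0x1000 and
 * chopped_addr = 0x34; those 0x34 bytes are added back to the size so
 * the unaligned tail is not clamped away. */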
1461
1462 /* For non-instancing make sure we initialize */
1463 attrs[k].shift = attrs[k].extra_flags = 0;
1464
1465 /* Instancing uses a dramatically different code path than
1466 * linear, so dispatch for the actual emission now that the
1467 * common code is finished */
1468
1469 unsigned divisor = elem->instance_divisor;
1470
1471 if (divisor && ctx->instance_count == 1) {
1472 /* Silly corner case where there's a divisor(=1) but
1473 * there's no legitimate instancing. So we want *every*
1474 * attribute to be the same. So set stride to zero so
1475 * we don't go anywhere. */
1476
1477 attrs[k].size = attrs[k].stride + chopped_addr;
1478 attrs[k].stride = 0;
1479 attrs[k++].elements |= MALI_ATTR_LINEAR;
1480 } else if (ctx->instance_count <= 1) {
1481 /* Normal, non-instanced attributes */
1482 attrs[k++].elements |= MALI_ATTR_LINEAR;
1483 } else {
1484 unsigned instance_shift = vertex_postfix->instance_shift;
1485 unsigned instance_odd = vertex_postfix->instance_odd;
1486
1487 k += panfrost_vertex_instanced(ctx->padded_count,
1488 instance_shift,
1489 instance_odd,
1490 divisor, &attrs[k]);
1491 }
1492 }
1493
1494 /* Add special gl_VertexID/gl_InstanceID buffers */
1495
1496 panfrost_vertex_id(ctx->padded_count, &attrs[k]);
1497 so->hw[PAN_VERTEX_ID].index = k++;
1498 panfrost_instance_id(ctx->padded_count, &attrs[k]);
1499 so->hw[PAN_INSTANCE_ID].index = k++;
1500
1501 /* Upload whatever we emitted and go */
1502
1503 vertex_postfix->attributes = panfrost_upload_transient(batch, attrs,
1504 k * sizeof(*attrs));
1505 }
1506
1507 static mali_ptr
1508 panfrost_emit_varyings(struct panfrost_batch *batch, union mali_attr *slot,
1509 unsigned stride, unsigned count)
1510 {
1511 /* Fill out the descriptor */
1512 slot->stride = stride;
1513 slot->size = stride * count;
1514 slot->shift = slot->extra_flags = 0;
1515
1516 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1517 slot->size);
1518
1519 slot->elements = transfer.gpu | MALI_ATTR_LINEAR;
1520
1521 return transfer.gpu;
1522 }
1523
1524 static void
1525 panfrost_emit_streamout(struct panfrost_batch *batch, union mali_attr *slot,
1526 unsigned stride, unsigned offset, unsigned count,
1527 struct pipe_stream_output_target *target)
1528 {
1529 /* Fill out the descriptor */
1530 slot->stride = stride * 4;
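/* Gallium expresses stream-output strides in dwords, hence the * 4 to
 * get bytes (matching the dst_offset * 4 conversion later in this file). */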
1531 slot->shift = slot->extra_flags = 0;
1532
1533 unsigned max_size = target->buffer_size;
1534 unsigned expected_size = slot->stride * count;
1535
1536 slot->size = MIN2(max_size, expected_size);
1537
1538 /* Grab the BO and bind it to the batch */
1539 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1540
1541 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1542 * the perspective of the TILER and FRAGMENT.
1543 */
1544 panfrost_batch_add_bo(batch, bo,
1545 PAN_BO_ACCESS_SHARED |
1546 PAN_BO_ACCESS_RW |
1547 PAN_BO_ACCESS_VERTEX_TILER |
1548 PAN_BO_ACCESS_FRAGMENT);
1549
1550 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * slot->stride);
1551 slot->elements = addr;
1552 }
1553
1554 /* Given a shader and buffer indices, link varying metadata together */
1555
1556 static bool
1557 is_special_varying(gl_varying_slot loc)
1558 {
1559 switch (loc) {
1560 case VARYING_SLOT_POS:
1561 case VARYING_SLOT_PSIZ:
1562 case VARYING_SLOT_PNTC:
1563 case VARYING_SLOT_FACE:
1564 return true;
1565 default:
1566 return false;
1567 }
1568 }
1569
1570 static void
1571 panfrost_emit_varying_meta(void *outptr, struct panfrost_shader_state *ss,
1572 signed general, signed gl_Position,
1573 signed gl_PointSize, signed gl_PointCoord,
1574 signed gl_FrontFacing)
1575 {
1576 struct mali_attr_meta *out = (struct mali_attr_meta *) outptr;
1577
1578 for (unsigned i = 0; i < ss->varying_count; ++i) {
1579 gl_varying_slot location = ss->varyings_loc[i];
1580 int index = -1;
1581
1582 switch (location) {
1583 case VARYING_SLOT_POS:
1584 index = gl_Position;
1585 break;
1586 case VARYING_SLOT_PSIZ:
1587 index = gl_PointSize;
1588 break;
1589 case VARYING_SLOT_PNTC:
1590 index = gl_PointCoord;
1591 break;
1592 case VARYING_SLOT_FACE:
1593 index = gl_FrontFacing;
1594 break;
1595 default:
1596 index = general;
1597 break;
1598 }
1599
1600 assert(index >= 0);
1601 out[i].index = index;
1602 }
1603 }
1604
1605 static bool
1606 has_point_coord(unsigned mask, gl_varying_slot loc)
1607 {
1608 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1609 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1610 else if (loc == VARYING_SLOT_PNTC)
1611 return (mask & (1 << 8));
1612 else
1613 return false;
1614 }
1615
1616 /* Helpers for manipulating stream out information so we can pack varyings
1617 * accordingly. Compute the src_offset for a given captured varying */
1618
1619 static struct pipe_stream_output *
1620 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1621 {
1622 for (unsigned i = 0; i < info->num_outputs; ++i) {
1623 if (info->output[i].register_index == loc)
1624 return &info->output[i];
1625 }
1626
1627 unreachable("Varying not captured");
1628 }
1629
1630 /* TODO: Integers */
1631 static enum mali_format
1632 pan_xfb_format(unsigned nr_components)
1633 {
1634 switch (nr_components) {
1635 case 1: return MALI_R32F;
1636 case 2: return MALI_RG32F;
1637 case 3: return MALI_RGB32F;
1638 case 4: return MALI_RGBA32F;
1639 default: unreachable("Invalid format");
1640 }
1641 }
1642
1643 void
1644 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1645 unsigned vertex_count,
1646 struct mali_vertex_tiler_postfix *vertex_postfix,
1647 struct mali_vertex_tiler_postfix *tiler_postfix,
1648 union midgard_primitive_size *primitive_size)
1649 {
1650 /* Load the shaders */
1651 struct panfrost_context *ctx = batch->ctx;
1652 struct panfrost_shader_state *vs, *fs;
1653 unsigned int num_gen_varyings = 0;
1654 size_t vs_size, fs_size;
1655
1656 /* Allocate the varying descriptor */
1657
1658 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1659 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1660 vs_size = sizeof(struct mali_attr_meta) * vs->varying_count;
1661 fs_size = sizeof(struct mali_attr_meta) * fs->varying_count;
1662
1663 struct panfrost_transfer trans = panfrost_allocate_transient(batch,
1664 vs_size +
1665 fs_size);
1666
1667 struct pipe_stream_output_info *so = &vs->stream_output;
1668
1669 /* Check if this varying is linked by us. This is the case for
1670 * general-purpose, non-captured varyings. If it is, link it. If it's
1671 * not, use the provided stream out information to determine the
1672 * offset, since it was already linked for us. */
1673
1674 for (unsigned i = 0; i < vs->varying_count; i++) {
1675 gl_varying_slot loc = vs->varyings_loc[i];
1676
1677 bool special = is_special_varying(loc);
1678 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1679
1680 if (captured) {
1681 struct pipe_stream_output *o = pan_get_so(so, loc);
1682
1683 unsigned dst_offset = o->dst_offset * 4; /* dwords */
1684 vs->varyings[i].src_offset = dst_offset;
1685 } else if (!special) {
1686 vs->varyings[i].src_offset = 16 * (num_gen_varyings++);
1687 }
1688 }
1689
1690 /* Conversely, we need to set src_offset for the captured varyings.
1691 * Here, the layout is defined by the stream out info, not us */
1692
1693 /* Link up with fragment varyings */
1694 bool reads_point_coord = fs->reads_point_coord;
1695
1696 for (unsigned i = 0; i < fs->varying_count; i++) {
1697 gl_varying_slot loc = fs->varyings_loc[i];
1698 unsigned src_offset;
1699 signed vs_idx = -1;
1700
1701 /* Link up */
1702 for (unsigned j = 0; j < vs->varying_count; ++j) {
1703 if (vs->varyings_loc[j] == loc) {
1704 vs_idx = j;
1705 break;
1706 }
1707 }
1708
1709 /* Either assign or reuse */
1710 if (vs_idx >= 0)
1711 src_offset = vs->varyings[vs_idx].src_offset;
1712 else
1713 src_offset = 16 * (num_gen_varyings++);
1714
1715 fs->varyings[i].src_offset = src_offset;
1716
1717 if (has_point_coord(fs->point_sprite_mask, loc))
1718 reads_point_coord = true;
1719 }
1720
1721 memcpy(trans.cpu, vs->varyings, vs_size);
1722 memcpy(trans.cpu + vs_size, fs->varyings, fs_size);
1723
1724 union mali_attr varyings[PIPE_MAX_ATTRIBS] = {0};
1725
1726 /* Figure out how many streamout buffers could be bound */
1727 unsigned so_count = ctx->streamout.num_targets;
1728 for (unsigned i = 0; i < vs->varying_count; i++) {
1729 gl_varying_slot loc = vs->varyings_loc[i];
1730
1731 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1732 if (!captured) continue;
1733
1734 struct pipe_stream_output *o = pan_get_so(so, loc);
1735 so_count = MAX2(so_count, o->output_buffer + 1);
1736 }
1737
1738 signed idx = so_count;
1739 signed general = idx++;
1740 signed gl_Position = idx++;
1741 signed gl_PointSize = vs->writes_point_size ? (idx++) : -1;
1742 signed gl_PointCoord = reads_point_coord ? (idx++) : -1;
1743 signed gl_FrontFacing = fs->reads_face ? (idx++) : -1;
1744 signed gl_FragCoord = fs->reads_frag_coord ? (idx++) : -1;
1745
1746 /* Emit the stream out buffers */
1747
1748 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1749 ctx->vertex_count);
1750
1751 for (unsigned i = 0; i < so_count; ++i) {
1752 if (i < ctx->streamout.num_targets) {
1753 panfrost_emit_streamout(batch, &varyings[i],
1754 so->stride[i],
1755 ctx->streamout.offsets[i],
1756 out_count,
1757 ctx->streamout.targets[i]);
1758 } else {
1759 /* Emit a dummy buffer */
1760 panfrost_emit_varyings(batch, &varyings[i],
1761 so->stride[i] * 4,
1762 out_count);
1763
1764 /* Clear the attribute type */
1765 varyings[i].elements &= ~0xF;
1766 }
1767 }
1768
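/* The general-purpose varying buffer is packed as one vec4 (16 bytes) per
 * linked varying per vertex, matching the src_offsets assigned above */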
1769 panfrost_emit_varyings(batch, &varyings[general],
1770 num_gen_varyings * 16,
1771 vertex_count);
1772
1773 mali_ptr varyings_p;
1774
1775 /* fp32 vec4 gl_Position */
1776 varyings_p = panfrost_emit_varyings(batch, &varyings[gl_Position],
1777 sizeof(float) * 4, vertex_count);
1778 tiler_postfix->position_varying = varyings_p;
1779
1780
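/* 16-bit gl_PointSize per vertex */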
1781 if (panfrost_writes_point_size(ctx)) {
1782 varyings_p = panfrost_emit_varyings(batch,
1783 &varyings[gl_PointSize],
1784 2, vertex_count);
1785 primitive_size->pointer = varyings_p;
1786 }
1787
1788 if (reads_point_coord)
1789 varyings[gl_PointCoord].elements = MALI_VARYING_POINT_COORD;
1790
1791 if (fs->reads_face)
1792 varyings[gl_FrontFacing].elements = MALI_VARYING_FRONT_FACING;
1793
1794 if (fs->reads_frag_coord)
1795 varyings[gl_FragCoord].elements = MALI_VARYING_FRAG_COORD;
1796
1797 struct panfrost_device *device = pan_device(ctx->base.screen);
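/* Bifrost does not (yet) handle the FS reading these special varyings
 * through this path, as the assert below documents */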
1798 assert(!(device->quirks & IS_BIFROST) || !(reads_point_coord || fs->reads_face || fs->reads_frag_coord));
1799
1800 /* Now that the buffer assignments are known, link the varying meta to
1801 * the buffer in question. VARYING_SLOT_POS is mapped to gl_FragCoord
1802 * for fragment shaders but to gl_Position for vertex shaders */
1804
1805 panfrost_emit_varying_meta(trans.cpu, vs, general, gl_Position,
1806 gl_PointSize, gl_PointCoord,
1807 gl_FrontFacing);
1808
1809 panfrost_emit_varying_meta(trans.cpu + vs_size, fs, general,
1810 gl_FragCoord, gl_PointSize,
1811 gl_PointCoord, gl_FrontFacing);
1812
1813 /* Replace streamout */
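/* Point the captured varyings at their transform feedback buffer (for both
 * the vertex writer and any fragment reader) instead of the general buffer */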
1814
1815 struct mali_attr_meta *ovs = (struct mali_attr_meta *)trans.cpu;
1816 struct mali_attr_meta *ofs = ovs + vs->varying_count;
1817
1818 for (unsigned i = 0; i < vs->varying_count; i++) {
1819 gl_varying_slot loc = vs->varyings_loc[i];
1820
1821 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1822 if (!captured)
1823 continue;
1824
1825 struct pipe_stream_output *o = pan_get_so(so, loc);
1826 ovs[i].index = o->output_buffer;
1827
1828 /* Set the type appropriately. TODO: Integer varyings XXX */
1829 assert(o->stream == 0);
1830 ovs[i].format = pan_xfb_format(o->num_components);
1831 ovs[i].swizzle = panfrost_get_default_swizzle(o->num_components);
1832
1833 /* Link to the fragment */
1834 signed fs_idx = -1;
1835
1836 /* Link up */
1837 for (unsigned j = 0; j < fs->varying_count; ++j) {
1838 if (fs->varyings_loc[j] == loc) {
1839 fs_idx = j;
1840 break;
1841 }
1842 }
1843
1844 if (fs_idx >= 0) {
1845 ofs[fs_idx].index = ovs[i].index;
1846 ofs[fs_idx].format = ovs[i].format;
1847 ofs[fs_idx].swizzle = ovs[i].swizzle;
1848 }
1849 }
1850
1851 /* Replace point sprite */
1852 for (unsigned i = 0; i < fs->varying_count; i++) {
1853 /* If we have a point sprite replacement, handle that here. We
1854 * have to translate location first. TODO: Flip y in shader.
1855 * We're already keying ... just time crunch .. */
1856
1857 if (has_point_coord(fs->point_sprite_mask,
1858 fs->varyings_loc[i])) {
1859 ofs[i].index = gl_PointCoord;
1860
1861 /* Swizzle out the z/w to 0/1 */
1862 ofs[i].format = MALI_RG16F;
1863 ofs[i].swizzle = panfrost_get_default_swizzle(2);
1864 }
1865 }
1866
1867 /* Fix up unaligned addresses */
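/* The low bits of the record address encode the attribute mode, so the base
 * must stay 64-byte aligned; move any misalignment into each referencing
 * varying's src_offset and grow the buffer to match */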
1868 for (unsigned i = 0; i < so_count; ++i) {
1869 if (varyings[i].elements < MALI_RECORD_SPECIAL)
1870 continue;
1871
1872 unsigned align = (varyings[i].elements & 63);
1873
1874 /* While we're at it, the SO buffers are linear */
1875
1876 if (!align) {
1877 varyings[i].elements |= MALI_ATTR_LINEAR;
1878 continue;
1879 }
1880
1881 /* We need to adjust alignment */
1882 varyings[i].elements &= ~63;
1883 varyings[i].elements |= MALI_ATTR_LINEAR;
1884 varyings[i].size += align;
1885
1886 for (unsigned v = 0; v < vs->varying_count; ++v) {
1887 if (ovs[v].index != i)
1888 continue;
1889
1890 ovs[v].src_offset = vs->varyings[v].src_offset + align;
1891 }
1892
1893 for (unsigned f = 0; f < fs->varying_count; ++f) {
1894 if (ofs[f].index != i)
1895 continue;
1896
1897 ofs[f].src_offset = fs->varyings[f].src_offset + align;
1898 }
1899 }
1900
1901 varyings_p = panfrost_upload_transient(batch, varyings,
1902 idx * sizeof(*varyings));
1903 vertex_postfix->varyings = varyings_p;
1904 tiler_postfix->varyings = varyings_p;
1905
1906 vertex_postfix->varying_meta = trans.gpu;
1907 tiler_postfix->varying_meta = trans.gpu + vs_size;
1908 }
1909
1910 void
1911 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
1912 struct mali_vertex_tiler_prefix *vertex_prefix,
1913 struct mali_vertex_tiler_postfix *vertex_postfix,
1914 struct mali_vertex_tiler_prefix *tiler_prefix,
1915 struct mali_vertex_tiler_postfix *tiler_postfix,
1916 union midgard_primitive_size *primitive_size)
1917 {
1918 struct panfrost_context *ctx = batch->ctx;
1919 struct panfrost_device *device = pan_device(ctx->base.screen);
1920 bool wallpapering = ctx->wallpaper_batch && batch->tiler_dep;
1921 struct bifrost_payload_vertex bifrost_vertex = {0,};
1922 struct bifrost_payload_tiler bifrost_tiler = {0,};
1923 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
1924 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
1925 void *vp, *tp;
1926 size_t vp_size, tp_size;
1927
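/* Bifrost and Midgard use different job payload layouts; build whichever
 * applies on the stack and hand it to panfrost_new_job below */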
1928 if (device->quirks & IS_BIFROST) {
1929 bifrost_vertex.prefix = *vertex_prefix;
1930 bifrost_vertex.postfix = *vertex_postfix;
1931 vp = &bifrost_vertex;
1932 vp_size = sizeof(bifrost_vertex);
1933
1934 bifrost_tiler.prefix = *tiler_prefix;
1935 bifrost_tiler.tiler.primitive_size = *primitive_size;
1936 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
1937 bifrost_tiler.postfix = *tiler_postfix;
1938 tp = &bifrost_tiler;
1939 tp_size = sizeof(bifrost_tiler);
1940 } else {
1941 midgard_vertex.prefix = *vertex_prefix;
1942 midgard_vertex.postfix = *vertex_postfix;
1943 vp = &midgard_vertex;
1944 vp_size = sizeof(midgard_vertex);
1945
1946 midgard_tiler.prefix = *tiler_prefix;
1947 midgard_tiler.postfix = *tiler_postfix;
1948 midgard_tiler.primitive_size = *primitive_size;
1949 tp = &midgard_tiler;
1950 tp_size = sizeof(midgard_tiler);
1951 }
1952
1953 if (wallpapering) {
1954 /* Inject in reverse order, with "predicted" job indices.
1955 * THIS IS A HACK XXX */
1956 panfrost_new_job(batch, JOB_TYPE_TILER, false,
1957 batch->job_index + 2, tp, tp_size, true);
1958 panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
1959 vp, vp_size, true);
1960 return;
1961 }
1962
1963 /* If rasterizer discard is enabled, only submit the vertex job */
1964
1965 bool rasterizer_discard = ctx->rasterizer &&
1966 ctx->rasterizer->base.rasterizer_discard;
1967
1968 unsigned vertex = panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
1969 vp, vp_size, false);
1970
1971 if (rasterizer_discard)
1972 return;
1973
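/* The tiler job depends on the vertex job via the returned job index, so
 * shading completes before tiling */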
1974 panfrost_new_job(batch, JOB_TYPE_TILER, false, vertex, tp, tp_size,
1975 false);
1976 }
1977
1978 /* TODO: stop hardcoding this */
1979 mali_ptr
1980 panfrost_emit_sample_locations(struct panfrost_batch *batch)
1981 {
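/* 48 hardcoded (x, y) sample position entries, i.e. the 96 uint16_t
 * uploaded below */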
1982 uint16_t locations[] = {
1983 128, 128,
1984 0, 256,
1985 0, 256,
1986 0, 256,
1987 0, 256,
1988 0, 256,
1989 0, 256,
1990 0, 256,
1991 0, 256,
1992 0, 256,
1993 0, 256,
1994 0, 256,
1995 0, 256,
1996 0, 256,
1997 0, 256,
1998 0, 256,
1999 0, 256,
2000 0, 256,
2001 0, 256,
2002 0, 256,
2003 0, 256,
2004 0, 256,
2005 0, 256,
2006 0, 256,
2007 0, 256,
2008 0, 256,
2009 0, 256,
2010 0, 256,
2011 0, 256,
2012 0, 256,
2013 0, 256,
2014 0, 256,
2015 128, 128,
2016 0, 0,
2017 0, 0,
2018 0, 0,
2019 0, 0,
2020 0, 0,
2021 0, 0,
2022 0, 0,
2023 0, 0,
2024 0, 0,
2025 0, 0,
2026 0, 0,
2027 0, 0,
2028 0, 0,
2029 0, 0,
2030 0, 0,
2031 };
2032
2033 return panfrost_upload_transient(batch, locations, 96 * sizeof(uint16_t));
2034 }