panfrost: Don't leak temporary descriptors array
[mesa.git] / src / gallium / drivers / panfrost / pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_allocate.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 unsigned shift = panfrost_get_stack_shift(batch->stack_size);
62 struct mali_shared_memory shared = {
63 .stack_shift = shift,
64 .scratchpad = panfrost_batch_get_scratchpad(batch, shift, dev->thread_tls_alloc, dev->core_count)->gpu,
65 .shared_workgroup_count = ~0,
66 };
67 postfix->shared_memory = panfrost_upload_transient(batch, &shared, sizeof(shared));
68 }
69
70 static void
71 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
72 struct mali_vertex_tiler_postfix *postfix)
73 {
74 struct panfrost_device *dev = pan_device(ctx->base.screen);
75 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
76
77 /* If we haven't, reserve space for the framebuffer */
78
79 if (!batch->framebuffer.gpu) {
80 unsigned size = (dev->quirks & MIDGARD_SFBD) ?
81 sizeof(struct mali_single_framebuffer) :
82 sizeof(struct mali_framebuffer);
83
84 batch->framebuffer = panfrost_allocate_transient(batch, size);
85
86 /* Tag the pointer */
87 if (!(dev->quirks & MIDGARD_SFBD))
88 batch->framebuffer.gpu |= MALI_MFBD;
89 }
90
91 postfix->shared_memory = batch->framebuffer.gpu;
92 }
93
94 static void
95 panfrost_vt_update_rasterizer(struct panfrost_context *ctx,
96 struct mali_vertex_tiler_prefix *prefix,
97 struct mali_vertex_tiler_postfix *postfix)
98 {
99 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
100
101 postfix->gl_enables |= 0x7;
102 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
103 rasterizer && rasterizer->base.front_ccw);
104 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
105 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_FRONT));
106 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
107 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_BACK));
108 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
109 rasterizer && rasterizer->base.flatshade_first);
110 }
111
112 void
113 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
114 struct mali_vertex_tiler_prefix *prefix,
115 union midgard_primitive_size *primitive_size)
116 {
117 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
118
119 if (!panfrost_writes_point_size(ctx)) {
120 bool points = prefix->draw_mode == MALI_POINTS;
121 float val = 0.0f;
122
123 if (rasterizer)
124 val = points ?
125 rasterizer->base.point_size :
126 rasterizer->base.line_width;
127
128 primitive_size->constant = val;
129 }
130 }
131
132 static void
133 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
134 struct mali_vertex_tiler_postfix *postfix)
135 {
136 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
137 if (ctx->occlusion_query)
138 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
139 else
140 postfix->occlusion_counter = 0;
141 }
142
143 void
144 panfrost_vt_init(struct panfrost_context *ctx,
145 enum pipe_shader_type stage,
146 struct mali_vertex_tiler_prefix *prefix,
147 struct mali_vertex_tiler_postfix *postfix)
148 {
149 struct panfrost_device *device = pan_device(ctx->base.screen);
150
151 if (!ctx->shader[stage])
152 return;
153
154 memset(prefix, 0, sizeof(*prefix));
155 memset(postfix, 0, sizeof(*postfix));
156
157 if (device->quirks & IS_BIFROST) {
158 postfix->gl_enables = 0x2;
159 panfrost_vt_emit_shared_memory(ctx, postfix);
160 } else {
161 postfix->gl_enables = 0x6;
162 panfrost_vt_attach_framebuffer(ctx, postfix);
163 }
164
165 if (stage == PIPE_SHADER_FRAGMENT) {
166 panfrost_vt_update_occlusion_query(ctx, postfix);
167 panfrost_vt_update_rasterizer(ctx, prefix, postfix);
168 }
169 }
170
171 static unsigned
172 panfrost_translate_index_size(unsigned size)
173 {
174 switch (size) {
175 case 1:
176 return MALI_DRAW_INDEXED_UINT8;
177
178 case 2:
179 return MALI_DRAW_INDEXED_UINT16;
180
181 case 4:
182 return MALI_DRAW_INDEXED_UINT32;
183
184 default:
185 unreachable("Invalid index size");
186 }
187 }
188
189 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
190 * good for the duration of the draw (transient), could last longer. Also get
191 * the bounds on the index buffer for the range accessed by the draw. We do
192 * these operations together because there are natural optimizations which
193 * require them to be together. */
194
195 static mali_ptr
196 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
197 const struct pipe_draw_info *info,
198 unsigned *min_index, unsigned *max_index)
199 {
200 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
201 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
202 off_t offset = info->start * info->index_size;
203 bool needs_indices = true;
204 mali_ptr out = 0;
205
206 if (info->max_index != ~0u) {
207 *min_index = info->min_index;
208 *max_index = info->max_index;
209 needs_indices = false;
210 }
211
212 if (!info->has_user_indices) {
213 /* Only resources can be directly mapped */
214 panfrost_batch_add_bo(batch, rsrc->bo,
215 PAN_BO_ACCESS_SHARED |
216 PAN_BO_ACCESS_READ |
217 PAN_BO_ACCESS_VERTEX_TILER);
218 out = rsrc->bo->gpu + offset;
219
220 /* Check the cache */
221 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
222 info->start,
223 info->count,
224 min_index,
225 max_index);
226 } else {
227 /* Otherwise, we need to upload to transient memory */
228 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
229 out = panfrost_upload_transient(batch, ibuf8 + offset,
230 info->count *
231 info->index_size);
232 }
233
234 if (needs_indices) {
235 /* Fallback */
236 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
237
238 if (!info->has_user_indices)
239 panfrost_minmax_cache_add(rsrc->index_cache,
240 info->start, info->count,
241 *min_index, *max_index);
242 }
243
244 return out;
245 }
246
247 void
248 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
249 const struct pipe_draw_info *info,
250 enum mali_draw_mode draw_mode,
251 struct mali_vertex_tiler_postfix *vertex_postfix,
252 struct mali_vertex_tiler_prefix *tiler_prefix,
253 struct mali_vertex_tiler_postfix *tiler_postfix,
254 unsigned *vertex_count,
255 unsigned *padded_count)
256 {
257 tiler_prefix->draw_mode = draw_mode;
258
259 unsigned draw_flags = 0;
260
261 if (panfrost_writes_point_size(ctx))
262 draw_flags |= MALI_DRAW_VARYING_SIZE;
263
264 if (info->primitive_restart)
265 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
266
267 /* These don't make much sense */
268
269 draw_flags |= 0x3000;
270
271 if (info->index_size) {
272 unsigned min_index = 0, max_index = 0;
273
274 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
275 info,
276 &min_index,
277 &max_index);
278
279 /* Use the corresponding values */
280 *vertex_count = max_index - min_index + 1;
281 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
282 tiler_prefix->offset_bias_correction = -min_index;
283 tiler_prefix->index_count = MALI_POSITIVE(info->count);
284 draw_flags |= panfrost_translate_index_size(info->index_size);
285 } else {
286 tiler_prefix->indices = 0;
287 *vertex_count = ctx->vertex_count;
288 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
289 tiler_prefix->offset_bias_correction = 0;
290 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
291 }
292
293 tiler_prefix->unknown_draw = draw_flags;
294
295 /* Encode the padded vertex count */
296
297 if (info->instance_count > 1) {
298 *padded_count = panfrost_padded_vertex_count(*vertex_count);
299
300 unsigned shift = __builtin_ctz(ctx->padded_count);
301 unsigned k = ctx->padded_count >> (shift + 1);
302
303 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
304 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
305 } else {
306 *padded_count = *vertex_count;
307
308 /* Reset instancing state */
309 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
310 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
311 }
312 }
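/* Illustrative note (not from the original source): the instancing fields
 * above encode padded_count as (2k + 1) << shift, with instance_shift = shift
 * and instance_odd = k. For example, a padded_count of 24 gives
 * shift = ctz(24) = 3 and k = 24 >> (3 + 1) = 1, since (2*1 + 1) << 3 = 24.
 * Likewise, for indexed draws the vertex count comes from the index bounds:
 * indices spanning [min_index, max_index] give
 * *vertex_count = max_index - min_index + 1, with offset_bias_correction set
 * to -min_index so the hardware effectively indexes from zero. */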
313
314 static void
315 panfrost_shader_meta_init(struct panfrost_context *ctx,
316 enum pipe_shader_type st,
317 struct mali_shader_meta *meta)
318 {
319 const struct panfrost_device *dev = pan_device(ctx->base.screen);
320 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
321
322 memset(meta, 0, sizeof(*meta));
323 meta->shader = (ss->bo ? ss->bo->gpu : 0) | ss->first_tag;
324 meta->attribute_count = ss->attribute_count;
325 meta->varying_count = ss->varying_count;
326 meta->texture_count = ctx->sampler_view_count[st];
327 meta->sampler_count = ctx->sampler_count[st];
328
329 if (dev->quirks & IS_BIFROST) {
330 meta->bifrost1.unk1 = 0x800200;
331 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
332 meta->bifrost2.preload_regs = 0xC0;
333 meta->bifrost2.uniform_count = MIN2(ss->uniform_count,
334 ss->uniform_cutoff);
335 } else {
336 meta->midgard1.uniform_count = MIN2(ss->uniform_count,
337 ss->uniform_cutoff);
338 meta->midgard1.work_count = ss->work_reg_count;
339 meta->midgard1.flags_hi = 0x8; /* XXX */
340 meta->midgard1.flags_lo = 0x220;
341 meta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
342 }
343
344 }
345
346 static unsigned
347 panfrost_translate_compare_func(enum pipe_compare_func in)
348 {
349 switch (in) {
350 case PIPE_FUNC_NEVER:
351 return MALI_FUNC_NEVER;
352
353 case PIPE_FUNC_LESS:
354 return MALI_FUNC_LESS;
355
356 case PIPE_FUNC_EQUAL:
357 return MALI_FUNC_EQUAL;
358
359 case PIPE_FUNC_LEQUAL:
360 return MALI_FUNC_LEQUAL;
361
362 case PIPE_FUNC_GREATER:
363 return MALI_FUNC_GREATER;
364
365 case PIPE_FUNC_NOTEQUAL:
366 return MALI_FUNC_NOTEQUAL;
367
368 case PIPE_FUNC_GEQUAL:
369 return MALI_FUNC_GEQUAL;
370
371 case PIPE_FUNC_ALWAYS:
372 return MALI_FUNC_ALWAYS;
373
374 default:
375 unreachable("Invalid func");
376 }
377 }
378
379 static unsigned
380 panfrost_translate_stencil_op(enum pipe_stencil_op in)
381 {
382 switch (in) {
383 case PIPE_STENCIL_OP_KEEP:
384 return MALI_STENCIL_KEEP;
385
386 case PIPE_STENCIL_OP_ZERO:
387 return MALI_STENCIL_ZERO;
388
389 case PIPE_STENCIL_OP_REPLACE:
390 return MALI_STENCIL_REPLACE;
391
392 case PIPE_STENCIL_OP_INCR:
393 return MALI_STENCIL_INCR;
394
395 case PIPE_STENCIL_OP_DECR:
396 return MALI_STENCIL_DECR;
397
398 case PIPE_STENCIL_OP_INCR_WRAP:
399 return MALI_STENCIL_INCR_WRAP;
400
401 case PIPE_STENCIL_OP_DECR_WRAP:
402 return MALI_STENCIL_DECR_WRAP;
403
404 case PIPE_STENCIL_OP_INVERT:
405 return MALI_STENCIL_INVERT;
406
407 default:
408 unreachable("Invalid stencil op");
409 }
410 }
411
412 static unsigned
413 translate_tex_wrap(enum pipe_tex_wrap w)
414 {
415 switch (w) {
416 case PIPE_TEX_WRAP_REPEAT:
417 return MALI_WRAP_REPEAT;
418
419 case PIPE_TEX_WRAP_CLAMP:
420 return MALI_WRAP_CLAMP;
421
422 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
423 return MALI_WRAP_CLAMP_TO_EDGE;
424
425 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
426 return MALI_WRAP_CLAMP_TO_BORDER;
427
428 case PIPE_TEX_WRAP_MIRROR_REPEAT:
429 return MALI_WRAP_MIRRORED_REPEAT;
430
431 case PIPE_TEX_WRAP_MIRROR_CLAMP:
432 return MALI_WRAP_MIRRORED_CLAMP;
433
434 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
435 return MALI_WRAP_MIRRORED_CLAMP_TO_EDGE;
436
437 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
438 return MALI_WRAP_MIRRORED_CLAMP_TO_BORDER;
439
440 default:
441 unreachable("Invalid wrap");
442 }
443 }
444
445 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
446 struct mali_sampler_descriptor *hw)
447 {
448 unsigned func = panfrost_translate_compare_func(cso->compare_func);
449 bool min_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
450 bool mag_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
451 bool mip_linear = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR;
452 unsigned min_filter = min_nearest ? MALI_SAMP_MIN_NEAREST : 0;
453 unsigned mag_filter = mag_nearest ? MALI_SAMP_MAG_NEAREST : 0;
454 unsigned mip_filter = mip_linear ?
455 (MALI_SAMP_MIP_LINEAR_1 | MALI_SAMP_MIP_LINEAR_2) : 0;
456 unsigned normalized = cso->normalized_coords ? MALI_SAMP_NORM_COORDS : 0;
457
458 *hw = (struct mali_sampler_descriptor) {
459 .filter_mode = min_filter | mag_filter | mip_filter |
460 normalized,
461 .wrap_s = translate_tex_wrap(cso->wrap_s),
462 .wrap_t = translate_tex_wrap(cso->wrap_t),
463 .wrap_r = translate_tex_wrap(cso->wrap_r),
464 .compare_func = panfrost_flip_compare_func(func),
465 .border_color = {
466 cso->border_color.f[0],
467 cso->border_color.f[1],
468 cso->border_color.f[2],
469 cso->border_color.f[3]
470 },
471 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
472 .max_lod = FIXED_16(cso->max_lod, false),
473 .lod_bias = FIXED_16(cso->lod_bias, true), /* can be negative */
474 .seamless_cube_map = cso->seamless_cube_map,
475 };
476
477 /* If necessary, we disable mipmapping in the sampler descriptor by
478 * clamping the LOD as tight as possible (from 0 to epsilon,
479 * essentially -- remember these are fixed point numbers, so
480 * epsilon=1/256) */
481
482 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
483 hw->max_lod = hw->min_lod + 1;
484 }
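/* Worked example (illustrative): FIXED_16 carries 8 fractional bits, so one
 * fixed-point step is 1/256 of a LOD. With min_mip_filter = NONE and
 * min_lod = 0.0, min_lod encodes to 0 and max_lod is forced to 1, i.e. a
 * [0, 1/256] LOD range that effectively pins sampling to the base level. */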
485
486 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
487 struct bifrost_sampler_descriptor *hw)
488 {
489 *hw = (struct bifrost_sampler_descriptor) {
490 .unk1 = 0x1,
491 .wrap_s = translate_tex_wrap(cso->wrap_s),
492 .wrap_t = translate_tex_wrap(cso->wrap_t),
493 .wrap_r = translate_tex_wrap(cso->wrap_r),
494 .unk8 = 0x8,
495 .unk2 = 0x2,
496 .min_filter = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST,
497 .norm_coords = cso->normalized_coords,
498 .mip_filter = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR,
499 .mag_filter = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR,
500 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
501 .max_lod = FIXED_16(cso->max_lod, false),
502 };
503
504 /* If necessary, we disable mipmapping in the sampler descriptor by
505 * clamping the LOD as tight as possible (from 0 to epsilon,
506 * essentially -- remember these are fixed point numbers, so
507 * epsilon=1/256) */
508
509 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
510 hw->max_lod = hw->min_lod + 1;
511 }
512
513 static void
514 panfrost_make_stencil_state(const struct pipe_stencil_state *in,
515 struct mali_stencil_test *out)
516 {
517 out->ref = 0; /* Gallium gets it from elsewhere */
518
519 out->mask = in->valuemask;
520 out->func = panfrost_translate_compare_func(in->func);
521 out->sfail = panfrost_translate_stencil_op(in->fail_op);
522 out->dpfail = panfrost_translate_stencil_op(in->zfail_op);
523 out->dppass = panfrost_translate_stencil_op(in->zpass_op);
524 }
525
526 static void
527 panfrost_frag_meta_rasterizer_update(struct panfrost_context *ctx,
528 struct mali_shader_meta *fragmeta)
529 {
530 if (!ctx->rasterizer) {
531 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, true);
532 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, false);
533 fragmeta->depth_units = 0.0f;
534 fragmeta->depth_factor = 0.0f;
535 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, false);
536 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, false);
537 return;
538 }
539
540 bool msaa = ctx->rasterizer->base.multisample;
541
542 /* TODO: Sample size */
543 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
544 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
545 fragmeta->depth_units = ctx->rasterizer->base.offset_units * 2.0f;
546 fragmeta->depth_factor = ctx->rasterizer->base.offset_scale;
547
548 /* XXX: Which bit is which? Does this maybe allow offsetting not-tri? */
549
550 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A,
551 ctx->rasterizer->base.offset_tri);
552 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B,
553 ctx->rasterizer->base.offset_tri);
554 }
555
556 static void
557 panfrost_frag_meta_zsa_update(struct panfrost_context *ctx,
558 struct mali_shader_meta *fragmeta)
559 {
560 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
561 int zfunc = PIPE_FUNC_ALWAYS;
562
563 if (!zsa) {
564 struct pipe_stencil_state default_stencil = {
565 .enabled = 0,
566 .func = PIPE_FUNC_ALWAYS,
567 .fail_op = MALI_STENCIL_KEEP,
568 .zfail_op = MALI_STENCIL_KEEP,
569 .zpass_op = MALI_STENCIL_KEEP,
570 .writemask = 0xFF,
571 .valuemask = 0xFF
572 };
573
574 panfrost_make_stencil_state(&default_stencil,
575 &fragmeta->stencil_front);
576 fragmeta->stencil_mask_front = default_stencil.writemask;
577 fragmeta->stencil_back = fragmeta->stencil_front;
578 fragmeta->stencil_mask_back = default_stencil.writemask;
579 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST, false);
580 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK, false);
581 } else {
582 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
583 zsa->stencil[0].enabled);
584 panfrost_make_stencil_state(&zsa->stencil[0],
585 &fragmeta->stencil_front);
586 fragmeta->stencil_mask_front = zsa->stencil[0].writemask;
587 fragmeta->stencil_front.ref = ctx->stencil_ref.ref_value[0];
588
589 /* If back-stencil is not enabled, use the front values */
590
591 if (zsa->stencil[1].enabled) {
592 panfrost_make_stencil_state(&zsa->stencil[1],
593 &fragmeta->stencil_back);
594 fragmeta->stencil_mask_back = zsa->stencil[1].writemask;
595 fragmeta->stencil_back.ref = ctx->stencil_ref.ref_value[1];
596 } else {
597 fragmeta->stencil_back = fragmeta->stencil_front;
598 fragmeta->stencil_mask_back = fragmeta->stencil_mask_front;
599 fragmeta->stencil_back.ref = fragmeta->stencil_front.ref;
600 }
601
602 if (zsa->depth.enabled)
603 zfunc = zsa->depth.func;
604
605 /* Depth state (TODO: Refactor) */
606
607 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
608 zsa->depth.writemask);
609 }
610
611 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
612 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(zfunc));
613 }
614
615 static void
616 panfrost_frag_meta_blend_update(struct panfrost_context *ctx,
617 struct mali_shader_meta *fragmeta,
618 void *rts)
619 {
620 const struct panfrost_device *dev = pan_device(ctx->base.screen);
621
622 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
623 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
624 !ctx->blend->base.dither);
625
626 /* Get blending setup */
627 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
628
629 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
630 unsigned shader_offset = 0;
631 struct panfrost_bo *shader_bo = NULL;
632
633 for (unsigned c = 0; c < rt_count; ++c)
634 blend[c] = panfrost_get_blend_for_context(ctx, c, &shader_bo,
635 &shader_offset);
636
637 /* If there is a blend shader, work registers are shared. XXX: opt */
638
639 for (unsigned c = 0; c < rt_count; ++c) {
640 if (blend[c].is_shader)
641 fragmeta->midgard1.work_count = 16;
642 }
643
644 /* Even on MFBD, the shader descriptor gets blend shaders. It's *also*
645 * copied to the blend_meta appended (by convention), but this is the
646 * field actually read by the hardware. (Or maybe both are read...?).
647 * Specify the last RTi with a blend shader. */
648
649 fragmeta->blend.shader = 0;
650
651 for (signed rt = (rt_count - 1); rt >= 0; --rt) {
652 if (!blend[rt].is_shader)
653 continue;
654
655 fragmeta->blend.shader = blend[rt].shader.gpu |
656 blend[rt].shader.first_tag;
657 break;
658 }
659
660 if (dev->quirks & MIDGARD_SFBD) {
661 /* On single render target (SFBD) platforms, the blend
662 * information is inside the shader meta itself. We additionally
663 * need to signal CAN_DISCARD for nontrivial blend modes (so
664 * we're able to read back the destination buffer) */
665
666 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
667 blend[0].is_shader);
668
669 if (!blend[0].is_shader) {
670 fragmeta->blend.equation = *blend[0].equation.equation;
671 fragmeta->blend.constant = blend[0].equation.constant;
672 }
673
674 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
675 !blend[0].no_blending);
676 return;
677 }
678
679 /* Additional blend descriptor tacked on for jobs using MFBD */
680
681 for (unsigned i = 0; i < rt_count; ++i) {
682 if (dev->quirks & IS_BIFROST) {
683 struct bifrost_blend_rt *brts = rts;
684 struct panfrost_shader_state *fs;
685 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
686
687 brts[i].flags = 0x200;
688 if (blend[i].is_shader) {
689 /* The blend shader's address needs to be at
690 * the same top 32 bits as the fragment shader's.
691 * TODO: Ensure that's always the case.
692 */
693 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
694 (fs->bo->gpu & (0xffffffffull << 32)));
695 brts[i].shader = blend[i].shader.gpu;
696 brts[i].unk2 = 0x0;
697 } else {
698 enum pipe_format format = ctx->pipe_framebuffer.cbufs[i]->format;
699 const struct util_format_description *format_desc;
700 format_desc = util_format_description(format);
701
702 brts[i].equation = *blend[i].equation.equation;
703
704 /* TODO: this is a bit more complicated */
705 brts[i].constant = blend[i].equation.constant;
706
707 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
708 brts[i].unk2 = 0x19;
709
710 brts[i].shader_type = fs->blend_types[i];
711 }
712 } else {
713 struct midgard_blend_rt *mrts = rts;
714
715 mrts[i].flags = 0x200;
716
717 bool is_srgb = (ctx->pipe_framebuffer.nr_cbufs > i) &&
718 (ctx->pipe_framebuffer.cbufs[i]) &&
719 util_format_is_srgb(ctx->pipe_framebuffer.cbufs[i]->format);
720
721 SET_BIT(mrts[i].flags, MALI_BLEND_MRT_SHADER, blend[i].is_shader);
722 SET_BIT(mrts[i].flags, MALI_BLEND_LOAD_TIB, !blend[i].no_blending);
723 SET_BIT(mrts[i].flags, MALI_BLEND_SRGB, is_srgb);
724 SET_BIT(mrts[i].flags, MALI_BLEND_NO_DITHER, !ctx->blend->base.dither);
725
726 if (blend[i].is_shader) {
727 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
728 } else {
729 mrts[i].blend.equation = *blend[i].equation.equation;
730 mrts[i].blend.constant = blend[i].equation.constant;
731 }
732 }
733 }
734 }
735
736 static void
737 panfrost_frag_shader_meta_init(struct panfrost_context *ctx,
738 struct mali_shader_meta *fragmeta,
739 void *rts)
740 {
741 const struct panfrost_device *dev = pan_device(ctx->base.screen);
742 struct panfrost_shader_state *fs;
743
744 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
745
746 fragmeta->alpha_coverage = ~MALI_ALPHA_COVERAGE(0.000000);
747 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x3010;
748 fragmeta->unknown2_4 = 0x4e0;
749
750 /* unknown2_4 has 0x10 bit set on T6XX and T720. We don't know why this
751 * is required (independent of 32-bit/64-bit descriptors), or why it's
752 * not used on later GPU revisions. Otherwise, all shader jobs fault on
753 * these earlier chips (perhaps this is a chicken bit of some kind).
754 * More investigation is needed. */
755
756 SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
757
758 /* Depending on whether it's legal to do so in the given shader, we try to
759 * enable early-z testing (or forward-pixel kill?) */
760
761 SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
762 !fs->can_discard && !fs->writes_depth);
763
764 /* Add the writes Z/S flags if needed. */
765 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
766 SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
767
768 /* Any time texturing is used, derivatives are implicitly calculated,
769 * so we need to enable helper invocations */
770
771 SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
772 fs->helper_invocations);
773
774 /* CAN_DISCARD should be set if the fragment shader possibly contains a
775 * 'discard' instruction. This is likely related to optimizations
776 * around forward-pixel kill, as per "Mali Performance 3: Is
777 * EGL_BUFFER_PRESERVED a good thing?" by Peter Harris */
778
779 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD, fs->can_discard);
780 SET_BIT(fragmeta->midgard1.flags_lo, 0x400, fs->can_discard);
781
782 panfrost_frag_meta_rasterizer_update(ctx, fragmeta);
783 panfrost_frag_meta_zsa_update(ctx, fragmeta);
784 panfrost_frag_meta_blend_update(ctx, fragmeta, rts);
785 }
786
787 void
788 panfrost_emit_shader_meta(struct panfrost_batch *batch,
789 enum pipe_shader_type st,
790 struct mali_vertex_tiler_postfix *postfix)
791 {
792 struct panfrost_context *ctx = batch->ctx;
793 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
794
795 if (!ss) {
796 postfix->shader = 0;
797 return;
798 }
799
800 struct mali_shader_meta meta;
801
802 panfrost_shader_meta_init(ctx, st, &meta);
803
804 /* Add the shader BO to the batch. */
805 panfrost_batch_add_bo(batch, ss->bo,
806 PAN_BO_ACCESS_PRIVATE |
807 PAN_BO_ACCESS_READ |
808 panfrost_bo_access_for_stage(st));
809
810 mali_ptr shader_ptr;
811
812 if (st == PIPE_SHADER_FRAGMENT) {
813 struct panfrost_device *dev = pan_device(ctx->base.screen);
814 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
815 size_t desc_size = sizeof(meta);
816 void *rts = NULL;
817 struct panfrost_transfer xfer;
818 unsigned rt_size;
819
820 if (dev->quirks & MIDGARD_SFBD)
821 rt_size = 0;
822 else if (dev->quirks & IS_BIFROST)
823 rt_size = sizeof(struct bifrost_blend_rt);
824 else
825 rt_size = sizeof(struct midgard_blend_rt);
826
827 desc_size += rt_size * rt_count;
828
829 if (rt_size)
830 rts = rzalloc_size(ctx, rt_size * rt_count);
831
832 panfrost_frag_shader_meta_init(ctx, &meta, rts);
833
834 xfer = panfrost_allocate_transient(batch, desc_size);
835
836 memcpy(xfer.cpu, &meta, sizeof(meta));
837 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
838
839 if (rt_size)
840 ralloc_free(rts);
841
842 shader_ptr = xfer.gpu;
843 } else {
844 shader_ptr = panfrost_upload_transient(batch, &meta,
845 sizeof(meta));
846 }
847
848 postfix->shader = shader_ptr;
849 }
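/* Layout sketch (illustrative): for fragment shaders the transient upload is
 * the shader meta immediately followed by one blend RT descriptor per render
 * target, e.g. a Midgard MFBD job with two colour buffers uses
 * desc_size = sizeof(struct mali_shader_meta) + 2 * sizeof(struct midgard_blend_rt).
 * The temporary rts array exists only to build that tail on the CPU and is
 * freed once the memcpy into transient memory is done. */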
850
851 static void
852 panfrost_mali_viewport_init(struct panfrost_context *ctx,
853 struct mali_viewport *mvp)
854 {
855 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
856
857 /* Clip bounds are encoded as floats. The viewport itself is encoded as
858 * (somewhat) asymmetric ints. */
859
860 const struct pipe_scissor_state *ss = &ctx->scissor;
861
862 memset(mvp, 0, sizeof(*mvp));
863
864 /* By default, do no viewport clipping, i.e. clip to (-inf, inf) in
865 * each direction. Clipping to the viewport in theory should work, but
866 * in practice causes issues when we're not explicitly trying to
867 * scissor */
868
869 *mvp = (struct mali_viewport) {
870 .clip_minx = -INFINITY,
871 .clip_miny = -INFINITY,
872 .clip_maxx = INFINITY,
873 .clip_maxy = INFINITY,
874 };
875
876 /* Always scissor to the viewport by default. */
877 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
878 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
879
880 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
881 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
882
883 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
884 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
885
886 /* Apply the scissor test */
887
888 unsigned minx, miny, maxx, maxy;
889
890 if (ss && ctx->rasterizer && ctx->rasterizer->base.scissor) {
891 minx = MAX2(ss->minx, vp_minx);
892 miny = MAX2(ss->miny, vp_miny);
893 maxx = MIN2(ss->maxx, vp_maxx);
894 maxy = MIN2(ss->maxy, vp_maxy);
895 } else {
896 minx = vp_minx;
897 miny = vp_miny;
898 maxx = vp_maxx;
899 maxy = vp_maxy;
900 }
901
902 /* Hardware needs the min/max to be strictly ordered, so flip if we
903 * need to. The viewport transformation in the vertex shader will
904 * handle the negatives if we don't */
905
906 if (miny > maxy) {
907 unsigned temp = miny;
908 miny = maxy;
909 maxy = temp;
910 }
911
912 if (minx > maxx) {
913 unsigned temp = minx;
914 minx = maxx;
915 maxx = temp;
916 }
917
918 if (minz > maxz) {
919 float temp = minz;
920 minz = maxz;
921 maxz = temp;
922 }
923
924 /* Clamp to the framebuffer size as a last check */
925
926 minx = MIN2(ctx->pipe_framebuffer.width, minx);
927 maxx = MIN2(ctx->pipe_framebuffer.width, maxx);
928
929 miny = MIN2(ctx->pipe_framebuffer.height, miny);
930 maxy = MIN2(ctx->pipe_framebuffer.height, maxy);
931
932 /* Upload */
933
934 mvp->viewport0[0] = minx;
935 mvp->viewport1[0] = MALI_POSITIVE(maxx);
936
937 mvp->viewport0[1] = miny;
938 mvp->viewport1[1] = MALI_POSITIVE(maxy);
939
940 mvp->clip_minz = minz;
941 mvp->clip_maxz = maxz;
942 }
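/* Illustrative example: MALI_POSITIVE(n) stores n - 1, so viewport1 holds
 * inclusive maxima. A 1920x1080 framebuffer with no scissor ends up with
 * viewport0 = (0, 0) and viewport1 = (1919, 1079); panfrost_emit_viewport()
 * below adds the 1 back when unioning the batch scissor. */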
943
944 void
945 panfrost_emit_viewport(struct panfrost_batch *batch,
946 struct mali_vertex_tiler_postfix *tiler_postfix)
947 {
948 struct panfrost_context *ctx = batch->ctx;
949 struct mali_viewport mvp;
950
951 panfrost_mali_viewport_init(batch->ctx, &mvp);
952
953 /* Update the job, unless we're doing wallpapering (whose lack of
954 * scissor we can ignore, since if we "miss" a tile of wallpaper, it'll
955 * just... be faster :) */
956
957 if (!ctx->wallpaper_batch)
958 panfrost_batch_union_scissor(batch, mvp.viewport0[0],
959 mvp.viewport0[1],
960 mvp.viewport1[0] + 1,
961 mvp.viewport1[1] + 1);
962
963 tiler_postfix->viewport = panfrost_upload_transient(batch, &mvp,
964 sizeof(mvp));
965 }
966
967 static mali_ptr
968 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
969 enum pipe_shader_type st,
970 struct panfrost_constant_buffer *buf,
971 unsigned index)
972 {
973 struct pipe_constant_buffer *cb = &buf->cb[index];
974 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
975
976 if (rsrc) {
977 panfrost_batch_add_bo(batch, rsrc->bo,
978 PAN_BO_ACCESS_SHARED |
979 PAN_BO_ACCESS_READ |
980 panfrost_bo_access_for_stage(st));
981
982 /* Alignment guaranteed by
983 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
984 return rsrc->bo->gpu + cb->buffer_offset;
985 } else if (cb->user_buffer) {
986 return panfrost_upload_transient(batch,
987 cb->user_buffer +
988 cb->buffer_offset,
989 cb->buffer_size);
990 } else {
991 unreachable("No constant buffer");
992 }
993 }
994
995 struct sysval_uniform {
996 union {
997 float f[4];
998 int32_t i[4];
999 uint32_t u[4];
1000 uint64_t du[2];
1001 };
1002 };
1003
1004 static void
1005 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
1006 struct sysval_uniform *uniform)
1007 {
1008 struct panfrost_context *ctx = batch->ctx;
1009 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1010
1011 uniform->f[0] = vp->scale[0];
1012 uniform->f[1] = vp->scale[1];
1013 uniform->f[2] = vp->scale[2];
1014 }
1015
1016 static void
1017 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
1018 struct sysval_uniform *uniform)
1019 {
1020 struct panfrost_context *ctx = batch->ctx;
1021 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1022
1023 uniform->f[0] = vp->translate[0];
1024 uniform->f[1] = vp->translate[1];
1025 uniform->f[2] = vp->translate[2];
1026 }
1027
1028 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
1029 enum pipe_shader_type st,
1030 unsigned int sysvalid,
1031 struct sysval_uniform *uniform)
1032 {
1033 struct panfrost_context *ctx = batch->ctx;
1034 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
1035 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
1036 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
1037 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
1038
1039 assert(dim);
1040 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
1041
1042 if (dim > 1)
1043 uniform->i[1] = u_minify(tex->texture->height0,
1044 tex->u.tex.first_level);
1045
1046 if (dim > 2)
1047 uniform->i[2] = u_minify(tex->texture->depth0,
1048 tex->u.tex.first_level);
1049
1050 if (is_array)
1051 uniform->i[dim] = tex->texture->array_size;
1052 }
1053
1054 static void
1055 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
1056 enum pipe_shader_type st,
1057 unsigned ssbo_id,
1058 struct sysval_uniform *uniform)
1059 {
1060 struct panfrost_context *ctx = batch->ctx;
1061
1062 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
1063 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
1064
1065 /* Compute address */
1066 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
1067
1068 panfrost_batch_add_bo(batch, bo,
1069 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
1070 panfrost_bo_access_for_stage(st));
1071
1072 /* Upload address and size as sysval */
1073 uniform->du[0] = bo->gpu + sb.buffer_offset;
1074 uniform->u[2] = sb.buffer_size;
1075 }
1076
1077 static void
1078 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1079 enum pipe_shader_type st,
1080 unsigned samp_idx,
1081 struct sysval_uniform *uniform)
1082 {
1083 struct panfrost_context *ctx = batch->ctx;
1084 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1085
1086 uniform->f[0] = sampl->min_lod;
1087 uniform->f[1] = sampl->max_lod;
1088 uniform->f[2] = sampl->lod_bias;
1089
1090 /* Even without any errata, Midgard represents "no mipmapping" as
1091 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1092 * panfrost_create_sampler_state which also explains our choice of
1093 * epsilon value (again to keep behaviour consistent) */
1094
1095 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1096 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1097 }
1098
1099 static void
1100 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1101 struct sysval_uniform *uniform)
1102 {
1103 struct panfrost_context *ctx = batch->ctx;
1104
1105 uniform->u[0] = ctx->compute_grid->grid[0];
1106 uniform->u[1] = ctx->compute_grid->grid[1];
1107 uniform->u[2] = ctx->compute_grid->grid[2];
1108 }
1109
1110 static void
1111 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1112 struct panfrost_shader_state *ss,
1113 enum pipe_shader_type st)
1114 {
1115 struct sysval_uniform *uniforms = (void *)buf;
1116
1117 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1118 int sysval = ss->sysval[i];
1119
1120 switch (PAN_SYSVAL_TYPE(sysval)) {
1121 case PAN_SYSVAL_VIEWPORT_SCALE:
1122 panfrost_upload_viewport_scale_sysval(batch,
1123 &uniforms[i]);
1124 break;
1125 case PAN_SYSVAL_VIEWPORT_OFFSET:
1126 panfrost_upload_viewport_offset_sysval(batch,
1127 &uniforms[i]);
1128 break;
1129 case PAN_SYSVAL_TEXTURE_SIZE:
1130 panfrost_upload_txs_sysval(batch, st,
1131 PAN_SYSVAL_ID(sysval),
1132 &uniforms[i]);
1133 break;
1134 case PAN_SYSVAL_SSBO:
1135 panfrost_upload_ssbo_sysval(batch, st,
1136 PAN_SYSVAL_ID(sysval),
1137 &uniforms[i]);
1138 break;
1139 case PAN_SYSVAL_NUM_WORK_GROUPS:
1140 panfrost_upload_num_work_groups_sysval(batch,
1141 &uniforms[i]);
1142 break;
1143 case PAN_SYSVAL_SAMPLER:
1144 panfrost_upload_sampler_sysval(batch, st,
1145 PAN_SYSVAL_ID(sysval),
1146 &uniforms[i]);
1147 break;
1148 default:
1149 assert(0);
1150 }
1151 }
1152 }
1153
1154 static const void *
1155 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1156 unsigned index)
1157 {
1158 struct pipe_constant_buffer *cb = &buf->cb[index];
1159 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1160
1161 if (rsrc)
1162 return rsrc->bo->cpu;
1163 else if (cb->user_buffer)
1164 return cb->user_buffer;
1165 else
1166 unreachable("No constant buffer");
1167 }
1168
1169 void
1170 panfrost_emit_const_buf(struct panfrost_batch *batch,
1171 enum pipe_shader_type stage,
1172 struct mali_vertex_tiler_postfix *postfix)
1173 {
1174 struct panfrost_context *ctx = batch->ctx;
1175 struct panfrost_shader_variants *all = ctx->shader[stage];
1176
1177 if (!all)
1178 return;
1179
1180 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1181
1182 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1183
1184 /* Uniforms are implicitly UBO #0 */
1185 bool has_uniforms = buf->enabled_mask & (1 << 0);
1186
1187 /* Allocate room for the sysval and the uniforms */
1188 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1189 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1190 size_t size = sys_size + uniform_size;
1191 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1192 size);
1193
1194 /* Upload sysvals requested by the shader */
1195 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1196
1197 /* Upload uniforms */
1198 if (has_uniforms && uniform_size) {
1199 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1200 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1201 }
1202
1203 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1204 * uploaded */
1205
1206 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1207 assert(ubo_count >= 1);
1208
1209 size_t sz = sizeof(uint64_t) * ubo_count;
1210 uint64_t ubos[PAN_MAX_CONST_BUFFERS];
1211 int uniform_count = ss->uniform_count;
1212
1213 /* Upload uniforms as a UBO */
1214 ubos[0] = MALI_MAKE_UBO(2 + uniform_count, transfer.gpu);
1215
1216 /* The rest are honest-to-goodness UBOs */
1217
1218 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1219 size_t usz = buf->cb[ubo].buffer_size;
1220 bool enabled = buf->enabled_mask & (1 << ubo);
1221 bool empty = usz == 0;
1222
1223 if (!enabled || empty) {
1224 /* Stub out disabled UBOs to catch accesses */
1225 ubos[ubo] = MALI_MAKE_UBO(0, 0xDEAD0000);
1226 continue;
1227 }
1228
1229 mali_ptr gpu = panfrost_map_constant_buffer_gpu(batch, stage,
1230 buf, ubo);
1231
1232 unsigned bytes_per_field = 16;
1233 unsigned aligned = ALIGN_POT(usz, bytes_per_field);
1234 ubos[ubo] = MALI_MAKE_UBO(aligned / bytes_per_field, gpu);
1235 }
1236
1237 mali_ptr ubufs = panfrost_upload_transient(batch, ubos, sz);
1238 postfix->uniforms = transfer.gpu;
1239 postfix->uniform_buffers = ubufs;
1240
1241 buf->dirty_mask = 0;
1242 }
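/* Worked example (illustrative): MALI_MAKE_UBO takes a size in 16-byte
 * fields plus a GPU pointer. A 100-byte UBO is padded to
 * ALIGN_POT(100, 16) = 112 bytes, i.e. 7 fields, so its entry becomes
 * MALI_MAKE_UBO(7, gpu). UBO #0 is the sysval + uniform blob uploaded above,
 * and disabled slots are stubbed with MALI_MAKE_UBO(0, 0xDEAD0000) so stray
 * accesses fault visibly rather than reading stale data. */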
1243
1244 void
1245 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1246 const struct pipe_grid_info *info,
1247 struct midgard_payload_vertex_tiler *vtp)
1248 {
1249 struct panfrost_context *ctx = batch->ctx;
1250 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1251 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1252 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1253 128));
1254 unsigned shared_size = single_size * info->grid[0] * info->grid[1] *
1255 info->grid[2] * 4;
1256 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1257 shared_size,
1258 1);
1259
1260 struct mali_shared_memory shared = {
1261 .shared_memory = bo->gpu,
1262 .shared_workgroup_count =
1263 util_logbase2_ceil(info->grid[0]) +
1264 util_logbase2_ceil(info->grid[1]) +
1265 util_logbase2_ceil(info->grid[2]),
1266 .shared_unk1 = 0x2,
1267 .shared_shift = util_logbase2(single_size) - 1
1268 };
1269
1270 vtp->postfix.shared_memory = panfrost_upload_transient(batch, &shared,
1271 sizeof(shared));
1272 }
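/* Worked example (illustrative, just tracing the arithmetic above): with
 * ss->shared_size = 200 and a (4, 4, 1) grid, single_size =
 * util_next_power_of_two(MAX2(200, 128)) = 256, so the BO covers
 * 256 * 4 * 4 * 1 * 4 = 16384 bytes. shared_workgroup_count =
 * ceil(log2(4)) + ceil(log2(4)) + ceil(log2(1)) = 2 + 2 + 0 = 4, and
 * shared_shift = log2(256) - 1 = 7. */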
1273
1274 static mali_ptr
1275 panfrost_get_tex_desc(struct panfrost_batch *batch,
1276 enum pipe_shader_type st,
1277 struct panfrost_sampler_view *view)
1278 {
1279 if (!view)
1280 return (mali_ptr) 0;
1281
1282 struct pipe_sampler_view *pview = &view->base;
1283 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1284
1285 /* Add the BO to the job so it's retained until the job is done. */
1286
1287 panfrost_batch_add_bo(batch, rsrc->bo,
1288 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1289 panfrost_bo_access_for_stage(st));
1290
1291 panfrost_batch_add_bo(batch, view->midgard_bo,
1292 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1293 panfrost_bo_access_for_stage(st));
1294
1295 return view->midgard_bo->gpu;
1296 }
1297
1298 void
1299 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1300 enum pipe_shader_type stage,
1301 struct mali_vertex_tiler_postfix *postfix)
1302 {
1303 struct panfrost_context *ctx = batch->ctx;
1304 struct panfrost_device *device = pan_device(ctx->base.screen);
1305
1306 if (!ctx->sampler_view_count[stage])
1307 return;
1308
1309 if (device->quirks & IS_BIFROST) {
1310 struct bifrost_texture_descriptor *descriptors;
1311
1312 descriptors = malloc(sizeof(struct bifrost_texture_descriptor) *
1313 ctx->sampler_view_count[stage]);
1314
1315 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1316 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1317 struct pipe_sampler_view *pview = &view->base;
1318 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1319
1320 panfrost_batch_add_bo(batch, rsrc->bo,
1321 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1322 panfrost_bo_access_for_stage(stage));
1323
1324 memcpy(&descriptors[i], view->bifrost_descriptor, sizeof(*view->bifrost_descriptor));
1325 }
1326
1327 postfix->textures = panfrost_upload_transient(batch,
1328 descriptors,
1329 sizeof(struct bifrost_texture_descriptor) *
1330 ctx->sampler_view_count[stage]);
1331
1332 free(descriptors);
1333 } else {
1334 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1335
1336 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i)
1337 trampolines[i] = panfrost_get_tex_desc(batch, stage,
1338 ctx->sampler_views[stage][i]);
1339
1340 postfix->textures = panfrost_upload_transient(batch,
1341 trampolines,
1342 sizeof(uint64_t) *
1343 ctx->sampler_view_count[stage]);
1344 }
1345 }
1346
1347 void
1348 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1349 enum pipe_shader_type stage,
1350 struct mali_vertex_tiler_postfix *postfix)
1351 {
1352 struct panfrost_context *ctx = batch->ctx;
1353 struct panfrost_device *device = pan_device(ctx->base.screen);
1354
1355 if (!ctx->sampler_count[stage])
1356 return;
1357
1358 if (device->quirks & IS_BIFROST) {
1359 size_t desc_size = sizeof(struct bifrost_sampler_descriptor);
1360 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1361 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1362 transfer_size);
1363 struct bifrost_sampler_descriptor *desc = (struct bifrost_sampler_descriptor *)transfer.cpu;
1364
1365 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1366 desc[i] = ctx->samplers[stage][i]->bifrost_hw;
1367
1368 postfix->sampler_descriptor = transfer.gpu;
1369 } else {
1370 size_t desc_size = sizeof(struct mali_sampler_descriptor);
1371 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1372 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1373 transfer_size);
1374 struct mali_sampler_descriptor *desc = (struct mali_sampler_descriptor *)transfer.cpu;
1375
1376 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1377 desc[i] = ctx->samplers[stage][i]->midgard_hw;
1378
1379 postfix->sampler_descriptor = transfer.gpu;
1380 }
1381 }
1382
1383 void
1384 panfrost_emit_vertex_attr_meta(struct panfrost_batch *batch,
1385 struct mali_vertex_tiler_postfix *vertex_postfix)
1386 {
1387 struct panfrost_context *ctx = batch->ctx;
1388
1389 if (!ctx->vertex)
1390 return;
1391
1392 struct panfrost_vertex_state *so = ctx->vertex;
1393
1394 panfrost_vertex_state_upd_attr_offs(ctx, vertex_postfix);
1395 vertex_postfix->attribute_meta = panfrost_upload_transient(batch, so->hw,
1396 sizeof(*so->hw) *
1397 PAN_MAX_ATTRIBUTE);
1398 }
1399
1400 void
1401 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1402 struct mali_vertex_tiler_postfix *vertex_postfix)
1403 {
1404 struct panfrost_context *ctx = batch->ctx;
1405 struct panfrost_vertex_state *so = ctx->vertex;
1406
1407 /* Staged mali_attr, and index into them. i =/= k, depending on the
1408 * vertex buffer mask and instancing. Twice as much room is allocated,
1409 * for a worst case of NPOT_DIVIDEs, which take up an extra slot */
1410 union mali_attr attrs[PIPE_MAX_ATTRIBS * 2];
1411 unsigned k = 0;
1412
1413 for (unsigned i = 0; i < so->num_elements; ++i) {
1414 /* We map a mali_attr to be 1:1 with the mali_attr_meta, which
1415 * means duplicating some vertex buffers (who cares? aside from
1416 * maybe some caching implications but I somehow doubt that
1417 * matters) */
1418
1419 struct pipe_vertex_element *elem = &so->pipe[i];
1420 unsigned vbi = elem->vertex_buffer_index;
1421
1422 /* The exception to 1:1 mapping is that we can have multiple
1423 * entries (NPOT divisors), so we fixup anyways */
1424
1425 so->hw[i].index = k;
1426
1427 if (!(ctx->vb_mask & (1 << vbi)))
1428 continue;
1429
1430 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1431 struct panfrost_resource *rsrc;
1432
1433 rsrc = pan_resource(buf->buffer.resource);
1434 if (!rsrc)
1435 continue;
1436
1437 /* Align to 64 bytes by masking off the lower bits. This
1438 * will be adjusted back when we fixup the src_offset in
1439 * mali_attr_meta */
1440
1441 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1442 mali_ptr addr = raw_addr & ~63;
1443 unsigned chopped_addr = raw_addr - addr;
1444
1445 /* Add a dependency of the batch on the vertex buffer */
1446 panfrost_batch_add_bo(batch, rsrc->bo,
1447 PAN_BO_ACCESS_SHARED |
1448 PAN_BO_ACCESS_READ |
1449 PAN_BO_ACCESS_VERTEX_TILER);
1450
1451 /* Set common fields */
1452 attrs[k].elements = addr;
1453 attrs[k].stride = buf->stride;
1454
1455 /* Since we advanced the base pointer, we shrink the buffer
1456 * size */
1457 attrs[k].size = rsrc->base.width0 - buf->buffer_offset;
1458
1459 /* We need to add the extra size we masked off (for
1460 * correctness) so the data doesn't get clamped away */
1461 attrs[k].size += chopped_addr;
1462
1463 /* For non-instancing make sure we initialize */
1464 attrs[k].shift = attrs[k].extra_flags = 0;
1465
1466 /* Instancing uses a dramatically different code path than
1467 * linear, so dispatch for the actual emission now that the
1468 * common code is finished */
1469
1470 unsigned divisor = elem->instance_divisor;
1471
1472 if (divisor && ctx->instance_count == 1) {
1473 /* Silly corner case where there's a divisor(=1) but
1474 * there's no legitimate instancing. So we want *every*
1475 * attribute to be the same. So set stride to zero so
1476 * we don't go anywhere. */
1477
1478 attrs[k].size = attrs[k].stride + chopped_addr;
1479 attrs[k].stride = 0;
1480 attrs[k++].elements |= MALI_ATTR_LINEAR;
1481 } else if (ctx->instance_count <= 1) {
1482 /* Normal, non-instanced attributes */
1483 attrs[k++].elements |= MALI_ATTR_LINEAR;
1484 } else {
1485 unsigned instance_shift = vertex_postfix->instance_shift;
1486 unsigned instance_odd = vertex_postfix->instance_odd;
1487
1488 k += panfrost_vertex_instanced(ctx->padded_count,
1489 instance_shift,
1490 instance_odd,
1491 divisor, &attrs[k]);
1492 }
1493 }
1494
1495 /* Add special gl_VertexID/gl_InstanceID buffers */
1496
1497 panfrost_vertex_id(ctx->padded_count, &attrs[k]);
1498 so->hw[PAN_VERTEX_ID].index = k++;
1499 panfrost_instance_id(ctx->padded_count, &attrs[k]);
1500 so->hw[PAN_INSTANCE_ID].index = k++;
1501
1502 /* Upload whatever we emitted and go */
1503
1504 vertex_postfix->attributes = panfrost_upload_transient(batch, attrs,
1505 k * sizeof(*attrs));
1506 }
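/* Illustrative example of the 64-byte alignment dance above: if
 * rsrc->bo->gpu + buf->buffer_offset ends in 0x28, addr = raw_addr & ~63
 * drops those low 0x28 bytes and chopped_addr = 0x28. The 0x28 is added back
 * to attrs[k].size so the tail of the buffer isn't clamped away, and the
 * attribute's src_offset is bumped by the same amount when the mali_attr_meta
 * is fixed up (panfrost_vertex_state_upd_attr_offs(), used above). */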
1507
1508 static mali_ptr
1509 panfrost_emit_varyings(struct panfrost_batch *batch, union mali_attr *slot,
1510 unsigned stride, unsigned count)
1511 {
1512 /* Fill out the descriptor */
1513 slot->stride = stride;
1514 slot->size = stride * count;
1515 slot->shift = slot->extra_flags = 0;
1516
1517 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1518 slot->size);
1519
1520 slot->elements = transfer.gpu | MALI_ATTR_LINEAR;
1521
1522 return transfer.gpu;
1523 }
1524
1525 static void
1526 panfrost_emit_streamout(struct panfrost_batch *batch, union mali_attr *slot,
1527 unsigned stride, unsigned offset, unsigned count,
1528 struct pipe_stream_output_target *target)
1529 {
1530 /* Fill out the descriptor */
1531 slot->stride = stride * 4;
1532 slot->shift = slot->extra_flags = 0;
1533
1534 unsigned max_size = target->buffer_size;
1535 unsigned expected_size = slot->stride * count;
1536
1537 slot->size = MIN2(max_size, expected_size);
1538
1539 /* Grab the BO and bind it to the batch */
1540 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1541
1542 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1543 * the perspective of the TILER and FRAGMENT.
1544 */
1545 panfrost_batch_add_bo(batch, bo,
1546 PAN_BO_ACCESS_SHARED |
1547 PAN_BO_ACCESS_RW |
1548 PAN_BO_ACCESS_VERTEX_TILER |
1549 PAN_BO_ACCESS_FRAGMENT);
1550
1551 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * slot->stride);
1552 slot->elements = addr;
1553 }
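/* Illustrative example: Gallium expresses stream-out strides in dwords, so a
 * stride of 4 dwords becomes slot->stride = 16 bytes. With count = 100
 * vertices the expected size is 1600 bytes, clamped against
 * target->buffer_size so the write never runs past the bound buffer. */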
1554
1555 /* Given a shader and buffer indices, link varying metadata together */
1556
1557 static bool
1558 is_special_varying(gl_varying_slot loc)
1559 {
1560 switch (loc) {
1561 case VARYING_SLOT_POS:
1562 case VARYING_SLOT_PSIZ:
1563 case VARYING_SLOT_PNTC:
1564 case VARYING_SLOT_FACE:
1565 return true;
1566 default:
1567 return false;
1568 }
1569 }
1570
1571 static void
1572 panfrost_emit_varying_meta(void *outptr, struct panfrost_shader_state *ss,
1573 signed general, signed gl_Position,
1574 signed gl_PointSize, signed gl_PointCoord,
1575 signed gl_FrontFacing)
1576 {
1577 struct mali_attr_meta *out = (struct mali_attr_meta *) outptr;
1578
1579 for (unsigned i = 0; i < ss->varying_count; ++i) {
1580 gl_varying_slot location = ss->varyings_loc[i];
1581 int index = -1;
1582
1583 switch (location) {
1584 case VARYING_SLOT_POS:
1585 index = gl_Position;
1586 break;
1587 case VARYING_SLOT_PSIZ:
1588 index = gl_PointSize;
1589 break;
1590 case VARYING_SLOT_PNTC:
1591 index = gl_PointCoord;
1592 break;
1593 case VARYING_SLOT_FACE:
1594 index = gl_FrontFacing;
1595 break;
1596 default:
1597 index = general;
1598 break;
1599 }
1600
1601 assert(index >= 0);
1602 out[i].index = index;
1603 }
1604 }
1605
1606 static bool
1607 has_point_coord(unsigned mask, gl_varying_slot loc)
1608 {
1609 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1610 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1611 else if (loc == VARYING_SLOT_PNTC)
1612 return (mask & (1 << 8));
1613 else
1614 return false;
1615 }
1616
1617 /* Helpers for manipulating stream out information so we can pack varyings
1618 * accordingly. Compute the src_offset for a given captured varying */
1619
1620 static struct pipe_stream_output *
1621 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1622 {
1623 for (unsigned i = 0; i < info->num_outputs; ++i) {
1624 if (info->output[i].register_index == loc)
1625 return &info->output[i];
1626 }
1627
1628 unreachable("Varying not captured");
1629 }
1630
1631 /* TODO: Integers */
1632 static enum mali_format
1633 pan_xfb_format(unsigned nr_components)
1634 {
1635 switch (nr_components) {
1636 case 1: return MALI_R32F;
1637 case 2: return MALI_RG32F;
1638 case 3: return MALI_RGB32F;
1639 case 4: return MALI_RGBA32F;
1640 default: unreachable("Invalid format");
1641 }
1642 }
1643
1644 void
1645 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1646 unsigned vertex_count,
1647 struct mali_vertex_tiler_postfix *vertex_postfix,
1648 struct mali_vertex_tiler_postfix *tiler_postfix,
1649 union midgard_primitive_size *primitive_size)
1650 {
1651 /* Load the shaders */
1652 struct panfrost_context *ctx = batch->ctx;
1653 struct panfrost_shader_state *vs, *fs;
1654 unsigned int num_gen_varyings = 0;
1655 size_t vs_size, fs_size;
1656
1657 /* Allocate the varying descriptor */
1658
1659 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1660 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1661 vs_size = sizeof(struct mali_attr_meta) * vs->varying_count;
1662 fs_size = sizeof(struct mali_attr_meta) * fs->varying_count;
1663
1664 struct panfrost_transfer trans = panfrost_allocate_transient(batch,
1665 vs_size +
1666 fs_size);
1667
1668 struct pipe_stream_output_info *so = &vs->stream_output;
1669
1670 /* Check if this varying is linked by us. This is the case for
1671 * general-purpose, non-captured varyings. If it is, link it. If it's
1672 * not, use the provided stream out information to determine the
1673 * offset, since it was already linked for us. */
1674
1675 for (unsigned i = 0; i < vs->varying_count; i++) {
1676 gl_varying_slot loc = vs->varyings_loc[i];
1677
1678 bool special = is_special_varying(loc);
1679 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1680
1681 if (captured) {
1682 struct pipe_stream_output *o = pan_get_so(so, loc);
1683
1684 unsigned dst_offset = o->dst_offset * 4; /* dwords */
1685 vs->varyings[i].src_offset = dst_offset;
1686 } else if (!special) {
1687 vs->varyings[i].src_offset = 16 * (num_gen_varyings++);
1688 }
1689 }
1690
1691 /* Conversely, we need to set src_offset for the captured varyings.
1692 * Here, the layout is defined by the stream out info, not us */
1693
1694 /* Link up with fragment varyings */
1695 bool reads_point_coord = fs->reads_point_coord;
1696
1697 for (unsigned i = 0; i < fs->varying_count; i++) {
1698 gl_varying_slot loc = fs->varyings_loc[i];
1699 unsigned src_offset;
1700 signed vs_idx = -1;
1701
1702 /* Link up */
1703 for (unsigned j = 0; j < vs->varying_count; ++j) {
1704 if (vs->varyings_loc[j] == loc) {
1705 vs_idx = j;
1706 break;
1707 }
1708 }
1709
1710 /* Either assign or reuse */
1711 if (vs_idx >= 0)
1712 src_offset = vs->varyings[vs_idx].src_offset;
1713 else
1714 src_offset = 16 * (num_gen_varyings++);
1715
1716 fs->varyings[i].src_offset = src_offset;
1717
1718 if (has_point_coord(fs->point_sprite_mask, loc))
1719 reads_point_coord = true;
1720 }
1721
1722 memcpy(trans.cpu, vs->varyings, vs_size);
1723 memcpy(trans.cpu + vs_size, fs->varyings, fs_size);
1724
1725 union mali_attr varyings[PIPE_MAX_ATTRIBS] = {0};
1726
1727 /* Figure out how many streamout buffers could be bound */
1728 unsigned so_count = ctx->streamout.num_targets;
1729 for (unsigned i = 0; i < vs->varying_count; i++) {
1730 gl_varying_slot loc = vs->varyings_loc[i];
1731
1732 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1733 if (!captured) continue;
1734
1735 struct pipe_stream_output *o = pan_get_so(so, loc);
1736 so_count = MAX2(so_count, o->output_buffer + 1);
1737 }
1738
1739 signed idx = so_count;
1740 signed general = idx++;
1741 signed gl_Position = idx++;
1742 signed gl_PointSize = vs->writes_point_size ? (idx++) : -1;
1743 signed gl_PointCoord = reads_point_coord ? (idx++) : -1;
1744 signed gl_FrontFacing = fs->reads_face ? (idx++) : -1;
1745 signed gl_FragCoord = fs->reads_frag_coord ? (idx++) : -1;
1746
1747 /* Emit the stream out buffers */
1748
1749 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1750 ctx->vertex_count);
1751
1752 for (unsigned i = 0; i < so_count; ++i) {
1753 if (i < ctx->streamout.num_targets) {
1754 panfrost_emit_streamout(batch, &varyings[i],
1755 so->stride[i],
1756 ctx->streamout.offsets[i],
1757 out_count,
1758 ctx->streamout.targets[i]);
1759 } else {
1760 /* Emit a dummy buffer */
1761 panfrost_emit_varyings(batch, &varyings[i],
1762 so->stride[i] * 4,
1763 out_count);
1764
1765 /* Clear the attribute type */
1766 varyings[i].elements &= ~0xF;
1767 }
1768 }
1769
1770 panfrost_emit_varyings(batch, &varyings[general],
1771 num_gen_varyings * 16,
1772 vertex_count);
1773
1774 mali_ptr varyings_p;
1775
1776 /* fp32 vec4 gl_Position */
1777 varyings_p = panfrost_emit_varyings(batch, &varyings[gl_Position],
1778 sizeof(float) * 4, vertex_count);
1779 tiler_postfix->position_varying = varyings_p;
1780
1781
1782 if (panfrost_writes_point_size(ctx)) {
1783 varyings_p = panfrost_emit_varyings(batch,
1784 &varyings[gl_PointSize],
1785 2, vertex_count);
1786 primitive_size->pointer = varyings_p;
1787 }
1788
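/* gl_PointCoord, gl_FrontFacing and gl_FragCoord are not backed by memory;
 * their records use the special MALI_VARYING_* encodings instead. */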
1789 if (reads_point_coord)
1790 varyings[gl_PointCoord].elements = MALI_VARYING_POINT_COORD;
1791
1792 if (fs->reads_face)
1793 varyings[gl_FrontFacing].elements = MALI_VARYING_FRONT_FACING;
1794
1795 if (fs->reads_frag_coord)
1796 varyings[gl_FragCoord].elements = MALI_VARYING_FRAG_COORD;
1797
1798 struct panfrost_device *device = pan_device(ctx->base.screen);
1799 assert(!(device->quirks & IS_BIFROST) || !(reads_point_coord || fs->reads_face || fs->reads_frag_coord));
1800
1801 /* Now that the buffer assignments are known, link the varying meta
1802 * to the buffer in question. VARYING_SLOT_POS is mapped to
1803 * gl_FragCoord for fragment shaders but to gl_Position for vertex
1804 * shaders */
1805
1806 panfrost_emit_varying_meta(trans.cpu, vs, general, gl_Position,
1807 gl_PointSize, gl_PointCoord,
1808 gl_FrontFacing);
1809
1810 panfrost_emit_varying_meta(trans.cpu + vs_size, fs, general,
1811 gl_FragCoord, gl_PointSize,
1812 gl_PointCoord, gl_FrontFacing);
1813
1814 /* Replace streamout */
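/* For captured varyings, rewrite the descriptors so they read from the
 * transform feedback buffer (index, format and swizzle come from the
 * stream-out info), and mirror the change into any fragment varying linked
 * to the same location. */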
1815
1816 struct mali_attr_meta *ovs = (struct mali_attr_meta *)trans.cpu;
1817 struct mali_attr_meta *ofs = ovs + vs->varying_count;
1818
1819 for (unsigned i = 0; i < vs->varying_count; i++) {
1820 gl_varying_slot loc = vs->varyings_loc[i];
1821
1822 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1823 if (!captured)
1824 continue;
1825
1826 struct pipe_stream_output *o = pan_get_so(so, loc);
1827 ovs[i].index = o->output_buffer;
1828
1829 /* Set the type appropriately. TODO: Integer varyings XXX */
1830 assert(o->stream == 0);
1831 ovs[i].format = pan_xfb_format(o->num_components);
1832 ovs[i].swizzle = panfrost_get_default_swizzle(o->num_components);
1833
1834 /* Link to the fragment */
1835 signed fs_idx = -1;
1836
1837 /* Link up */
1838 for (unsigned j = 0; j < fs->varying_count; ++j) {
1839 if (fs->varyings_loc[j] == loc) {
1840 fs_idx = j;
1841 break;
1842 }
1843 }
1844
1845 if (fs_idx >= 0) {
1846 ofs[fs_idx].index = ovs[i].index;
1847 ofs[fs_idx].format = ovs[i].format;
1848 ofs[fs_idx].swizzle = ovs[i].swizzle;
1849 }
1850 }
1851
1852 /* Replace point sprite */
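/* Varyings covered by the point sprite mask are redirected to the
 * gl_PointCoord record, read back as RG16F with z/w filled in by the
 * default two-component swizzle. */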
1853 for (unsigned i = 0; i < fs->varying_count; i++) {
1854 /* If we have a point sprite replacement, handle that here. We
1855 * have to translate the location first. TODO: flip Y in the
1856 * shader; we already key on this state, it is just a time crunch */
1857
1858 if (has_point_coord(fs->point_sprite_mask,
1859 fs->varyings_loc[i])) {
1860 ofs[i].index = gl_PointCoord;
1861
1862 /* Swizzle out the z/w to 0/1 */
1863 ofs[i].format = MALI_RG16F;
1864 ofs[i].swizzle = panfrost_get_default_swizzle(2);
1865 }
1866 }
1867
1868 /* Fix up unaligned addresses */
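/* Stream-out offsets are only dword-aligned, but the low six bits of the
 * record address appear to be reserved for flags. Fold any misalignment
 * into the descriptors' src_offset instead and grow the record size to
 * compensate. */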
1869 for (unsigned i = 0; i < so_count; ++i) {
1870 if (varyings[i].elements < MALI_RECORD_SPECIAL)
1871 continue;
1872
1873 unsigned align = (varyings[i].elements & 63);
1874
1875 /* While we're at it, mark the SO buffers as linear */
1876
1877 if (!align) {
1878 varyings[i].elements |= MALI_ATTR_LINEAR;
1879 continue;
1880 }
1881
1882 /* We need to adjust alignment */
1883 varyings[i].elements &= ~63;
1884 varyings[i].elements |= MALI_ATTR_LINEAR;
1885 varyings[i].size += align;
1886
1887 for (unsigned v = 0; v < vs->varying_count; ++v) {
1888 if (ovs[v].index != i)
1889 continue;
1890
1891 ovs[v].src_offset = vs->varyings[v].src_offset + align;
1892 }
1893
1894 for (unsigned f = 0; f < fs->varying_count; ++f) {
1895 if (ofs[f].index != i)
1896 continue;
1897
1898 ofs[f].src_offset = fs->varyings[f].src_offset + align;
1899 }
1900 }
1901
1902 varyings_p = panfrost_upload_transient(batch, varyings,
1903 idx * sizeof(*varyings));
1904 vertex_postfix->varyings = varyings_p;
1905 tiler_postfix->varyings = varyings_p;
1906
1907 vertex_postfix->varying_meta = trans.gpu;
1908 tiler_postfix->varying_meta = trans.gpu + vs_size;
1909 }
1910
1911 void
1912 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
1913 struct mali_vertex_tiler_prefix *vertex_prefix,
1914 struct mali_vertex_tiler_postfix *vertex_postfix,
1915 struct mali_vertex_tiler_prefix *tiler_prefix,
1916 struct mali_vertex_tiler_postfix *tiler_postfix,
1917 union midgard_primitive_size *primitive_size)
1918 {
1919 struct panfrost_context *ctx = batch->ctx;
1920 struct panfrost_device *device = pan_device(ctx->base.screen);
1921 bool wallpapering = ctx->wallpaper_batch && batch->tiler_dep;
1922 struct bifrost_payload_vertex bifrost_vertex = {0,};
1923 struct bifrost_payload_tiler bifrost_tiler = {0,};
1924 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
1925 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
1926 void *vp, *tp;
1927 size_t vp_size, tp_size;
1928
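/* Bifrost and Midgard use different payload layouts, so copy the shared
 * prefix/postfix into the appropriate struct and upload that as the job
 * payload. */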
1929 if (device->quirks & IS_BIFROST) {
1930 bifrost_vertex.prefix = *vertex_prefix;
1931 bifrost_vertex.postfix = *vertex_postfix;
1932 vp = &bifrost_vertex;
1933 vp_size = sizeof(bifrost_vertex);
1934
1935 bifrost_tiler.prefix = *tiler_prefix;
1936 bifrost_tiler.tiler.primitive_size = *primitive_size;
1937 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
1938 bifrost_tiler.postfix = *tiler_postfix;
1939 tp = &bifrost_tiler;
1940 tp_size = sizeof(bifrost_tiler);
1941 } else {
1942 midgard_vertex.prefix = *vertex_prefix;
1943 midgard_vertex.postfix = *vertex_postfix;
1944 vp = &midgard_vertex;
1945 vp_size = sizeof(midgard_vertex);
1946
1947 midgard_tiler.prefix = *tiler_prefix;
1948 midgard_tiler.postfix = *tiler_postfix;
1949 midgard_tiler.primitive_size = *primitive_size;
1950 tp = &midgard_tiler;
1951 tp_size = sizeof(midgard_tiler);
1952 }
1953
1954 if (wallpapering) {
1955 /* Inject in reverse order, with "predicted" job indices.
1956 * THIS IS A HACK XXX */
1957 panfrost_new_job(batch, JOB_TYPE_TILER, false,
1958 batch->job_index + 2, tp, tp_size, true);
1959 panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
1960 vp, vp_size, true);
1961 return;
1962 }
1963
1964 /* If rasterizer discard is enabled, only submit the vertex job */
1965
1966 bool rasterizer_discard = ctx->rasterizer &&
1967 ctx->rasterizer->base.rasterizer_discard;
1968
1969 unsigned vertex = panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
1970 vp, vp_size, false);
1971
1972 if (rasterizer_discard)
1973 return;
1974
1975 panfrost_new_job(batch, JOB_TYPE_TILER, false, vertex, tp, tp_size,
1976 false);
1977 }
1978
1979 /* TODO: stop hardcoding this */
1980 mali_ptr
1981 panfrost_emit_sample_locations(struct panfrost_batch *batch)
1982 {
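/* 48 (x, y) pairs uploaded verbatim (96 half-words). The exact encoding is
 * an assumption: presumably fixed-point sample positions with 256 units per
 * pixel, so (128, 128) marks the pixel centre. See the TODO above. */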
1983 uint16_t locations[] = {
1984 128, 128,
1985 0, 256,
1986 0, 256,
1987 0, 256,
1988 0, 256,
1989 0, 256,
1990 0, 256,
1991 0, 256,
1992 0, 256,
1993 0, 256,
1994 0, 256,
1995 0, 256,
1996 0, 256,
1997 0, 256,
1998 0, 256,
1999 0, 256,
2000 0, 256,
2001 0, 256,
2002 0, 256,
2003 0, 256,
2004 0, 256,
2005 0, 256,
2006 0, 256,
2007 0, 256,
2008 0, 256,
2009 0, 256,
2010 0, 256,
2011 0, 256,
2012 0, 256,
2013 0, 256,
2014 0, 256,
2015 0, 256,
2016 128, 128,
2017 0, 0,
2018 0, 0,
2019 0, 0,
2020 0, 0,
2021 0, 0,
2022 0, 0,
2023 0, 0,
2024 0, 0,
2025 0, 0,
2026 0, 0,
2027 0, 0,
2028 0, 0,
2029 0, 0,
2030 0, 0,
2031 0, 0,
2032 };
2033
2034 return panfrost_upload_transient(batch, locations, 96 * sizeof(uint16_t));
2035 }