src/gallium/drivers/panfrost/pan_cmdstream.c

   1 /*
   2  * Copyright (C) 2018 Alyssa Rosenzweig
   3  * Copyright (C) 2020 Collabora Ltd.
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   9  * and/or sell copies of the Software, and to permit persons to whom the
  10  * Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice (including the next
  13  * paragraph) shall be included in all copies or substantial portions of the
  14  * Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22  * SOFTWARE.
  23  */
  24
  25 #include "util/macros.h"
  26 #include "util/u_prim.h"
  27 #include "util/u_vbuf.h"
  28
  29 #include "panfrost-quirks.h"
  30
  31 #include "pan_allocate.h"
  32 #include "pan_bo.h"
  33 #include "pan_cmdstream.h"
  34 #include "pan_context.h"
  35 #include "pan_job.h"
  36
  37 /* If a BO is accessed for a particular shader stage, will it be in the primary
  38  * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
  39  * fragment will be primary, e.g. compute jobs will be considered
  40  * "vertex/tiler" by analogy */
  41
  42 static inline uint32_t
  43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
  44 {
  45         assert(stage == PIPE_SHADER_FRAGMENT ||
  46                stage == PIPE_SHADER_VERTEX ||
  47                stage == PIPE_SHADER_COMPUTE);
  48
  49         return stage == PIPE_SHADER_FRAGMENT ?
  50                PAN_BO_ACCESS_FRAGMENT :
  51                PAN_BO_ACCESS_VERTEX_TILER;
  52 }
  53
  54 static void
  55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
  56                                struct mali_vertex_tiler_postfix *postfix)
  57 {
  58         struct panfrost_device *dev = pan_device(ctx->base.screen);
  59         struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
  60
  61         unsigned shift = panfrost_get_stack_shift(batch->stack_size);
  62         struct mali_shared_memory shared = {
  63                 .stack_shift = shift,
  64                 .scratchpad = panfrost_batch_get_scratchpad(batch, shift, dev->thread_tls_alloc, dev->core_count)->gpu,
  65                 .shared_workgroup_count = ~0,
  66         };
  67         postfix->shared_memory = panfrost_upload_transient(batch, &shared, sizeof(shared));
  68 }
  69
  70 static void
  71 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
  72                                struct mali_vertex_tiler_postfix *postfix)
  73 {
  74         struct panfrost_device *dev = pan_device(ctx->base.screen);
  75         struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
  76
  77         /* If we haven't, reserve space for the framebuffer */
  78
  79         if (!batch->framebuffer.gpu) {
  80                 unsigned size = (dev->quirks & MIDGARD_SFBD) ?
  81                         sizeof(struct mali_single_framebuffer) :
  82                         sizeof(struct mali_framebuffer);
  83
  84                 batch->framebuffer = panfrost_allocate_transient(batch, size);
  85
  86                 /* Tag the pointer */
  87                 if (!(dev->quirks & MIDGARD_SFBD))
  88                         batch->framebuffer.gpu |= MALI_MFBD;
  89         }
  90
  91         postfix->shared_memory = batch->framebuffer.gpu;
  92 }
  93
  94 static void
  95 panfrost_vt_update_rasterizer(struct panfrost_context *ctx,
  96                               struct mali_vertex_tiler_prefix *prefix,
  97                               struct mali_vertex_tiler_postfix *postfix)
  98 {
  99         struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
 100
 101         postfix->gl_enables |= 0x7;
 102         SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
 103                 rasterizer && rasterizer->base.front_ccw);
 104         SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
 105                 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_FRONT));
 106         SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
 107                 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_BACK));
 108         SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
 109                 rasterizer && rasterizer->base.flatshade_first);
 110 }
 111
 112 void
 113 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
 114                                   struct mali_vertex_tiler_prefix *prefix,
 115                                   union midgard_primitive_size *primitive_size)
 116 {
 117         struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
 118
 119         if (!panfrost_writes_point_size(ctx)) {
 120                 bool points = prefix->draw_mode == MALI_POINTS;
 121                 float val = 0.0f;
 122
 123                 if (rasterizer)
 124                         val = points ?
 125                               rasterizer->base.point_size :
 126                               rasterizer->base.line_width;
 127
 128                 primitive_size->constant = val;
 129         }
 130 }
 131
 132 static void
 133 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
 134                                    struct mali_vertex_tiler_postfix *postfix)
 135 {
 136         SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
 137         if (ctx->occlusion_query)
 138                 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
 139         else
 140                 postfix->occlusion_counter = 0;
 141 }
 142
 143 void
 144 panfrost_vt_init(struct panfrost_context *ctx,
 145                  enum pipe_shader_type stage,
 146                  struct mali_vertex_tiler_prefix *prefix,
 147                  struct mali_vertex_tiler_postfix *postfix)
 148 {
 149         struct panfrost_device *device = pan_device(ctx->base.screen);
 150
 151         if (!ctx->shader[stage])
 152                 return;
 153
 154         memset(prefix, 0, sizeof(*prefix));
 155         memset(postfix, 0, sizeof(*postfix));
 156
 157         if (device->quirks & IS_BIFROST) {
 158                 postfix->gl_enables = 0x2;
 159                 panfrost_vt_emit_shared_memory(ctx, postfix);
 160         } else {
 161                 postfix->gl_enables = 0x6;
 162                 panfrost_vt_attach_framebuffer(ctx, postfix);
 163         }
 164
 165         if (stage == PIPE_SHADER_FRAGMENT) {
 166                 panfrost_vt_update_occlusion_query(ctx, postfix);
 167                 panfrost_vt_update_rasterizer(ctx, prefix, postfix);
 168         }
 169 }
 170
 171 static unsigned
 172 panfrost_translate_index_size(unsigned size)
 173 {
 174         switch (size) {
 175         case 1:
 176                 return MALI_DRAW_INDEXED_UINT8;
 177
 178         case 2:
 179                 return MALI_DRAW_INDEXED_UINT16;
 180
 181         case 4:
 182                 return MALI_DRAW_INDEXED_UINT32;
 183
 184         default:
 185                 unreachable("Invalid index size");
 186         }
 187 }
 188
 189 /* Gets a GPU address for the associated index buffer. Only gauranteed to be
 190  * good for the duration of the draw (transient), could last longer. Also get
 191  * the bounds on the index buffer for the range accessed by the draw. We do
 192  * these operations together because there are natural optimizations which
 193  * require them to be together. */
 194
 195 static mali_ptr
 196 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
 197                                   const struct pipe_draw_info *info,
 198                                   unsigned *min_index, unsigned *max_index)
 199 {
 200         struct panfrost_resource *rsrc = pan_resource(info->index.resource);
 201         struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
 202         off_t offset = info->start * info->index_size;
 203         bool needs_indices = true;
 204         mali_ptr out = 0;
 205
 206         if (info->max_index != ~0u) {
 207                 *min_index = info->min_index;
 208                 *max_index = info->max_index;
 209                 needs_indices = false;
 210         }
 211
 212         if (!info->has_user_indices) {
 213                 /* Only resources can be directly mapped */
 214                 panfrost_batch_add_bo(batch, rsrc->bo,
 215                                       PAN_BO_ACCESS_SHARED |
 216                                       PAN_BO_ACCESS_READ |
 217                                       PAN_BO_ACCESS_VERTEX_TILER);
 218                 out = rsrc->bo->gpu + offset;
 219
 220                 /* Check the cache */
 221                 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
 222                                                            info->start,
 223                                                            info->count,
 224                                                            min_index,
 225                                                            max_index);
 226         } else {
 227                 /* Otherwise, we need to upload to transient memory */
 228                 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
 229                 out = panfrost_upload_transient(batch, ibuf8 + offset,
 230                                                 info->count *
 231                                                 info->index_size);
 232         }
 233
 234         if (needs_indices) {
 235                 /* Fallback */
 236                 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
 237
 238                 if (!info->has_user_indices)
 239                         panfrost_minmax_cache_add(rsrc->index_cache,
 240                                                   info->start, info->count,
 241                                                   *min_index, *max_index);
 242         }
 243
 244         return out;
 245 }
 246
 247 void
 248 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
 249                           const struct pipe_draw_info *info,
 250                           enum mali_draw_mode draw_mode,
 251                           struct mali_vertex_tiler_postfix *vertex_postfix,
 252                           struct mali_vertex_tiler_prefix *tiler_prefix,
 253                           struct mali_vertex_tiler_postfix *tiler_postfix,
 254                           unsigned *vertex_count,
 255                           unsigned *padded_count)
 256 {
 257         tiler_prefix->draw_mode = draw_mode;
 258
 259         unsigned draw_flags = 0;
 260
 261         if (panfrost_writes_point_size(ctx))
 262                 draw_flags |= MALI_DRAW_VARYING_SIZE;
 263
 264         if (info->primitive_restart)
 265                 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
 266
 267         /* These doesn't make much sense */
 268
 269         draw_flags |= 0x3000;
 270
 271         if (info->index_size) {
 272                 unsigned min_index = 0, max_index = 0;
 273
 274                 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
 275                                                                        info,
 276                                                                        &min_index,
 277                                                                        &max_index);
 278
 279                 /* Use the corresponding values */
 280                 *vertex_count = max_index - min_index + 1;
 281                 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
 282                 tiler_prefix->offset_bias_correction = -min_index;
 283                 tiler_prefix->index_count = MALI_POSITIVE(info->count);
 284                 draw_flags |= panfrost_translate_index_size(info->index_size);
 285         } else {
 286                 tiler_prefix->indices = 0;
 287                 *vertex_count = ctx->vertex_count;
 288                 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
 289                 tiler_prefix->offset_bias_correction = 0;
 290                 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
 291         }
 292
 293         tiler_prefix->unknown_draw = draw_flags;
 294
 295         /* Encode the padded vertex count */
 296
 297         if (info->instance_count > 1) {
 298                 *padded_count = panfrost_padded_vertex_count(*vertex_count);
 299
 300                 unsigned shift = __builtin_ctz(ctx->padded_count);
 301                 unsigned k = ctx->padded_count >> (shift + 1);
 302
 303                 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
 304                 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
 305         } else {
 306                 *padded_count = *vertex_count;
 307
 308                 /* Reset instancing state */
 309                 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
 310                 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
 311         }
 312 }
 313
 314 static void
 315 panfrost_shader_meta_init(struct panfrost_context *ctx,
 316                           enum pipe_shader_type st,
 317                           struct mali_shader_meta *meta)
 318 {
 319         const struct panfrost_device *dev = pan_device(ctx->base.screen);
 320         struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
 321
 322         memset(meta, 0, sizeof(*meta));
 323         meta->shader = (ss->bo ? ss->bo->gpu : 0) | ss->first_tag;
 324         meta->attribute_count = ss->attribute_count;
 325         meta->varying_count = ss->varying_count;
 326         meta->texture_count = ctx->sampler_view_count[st];
 327         meta->sampler_count = ctx->sampler_count[st];
 328
 329         if (dev->quirks & IS_BIFROST) {
 330                 meta->bifrost1.unk1 = 0x800200;
 331                 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
 332                 meta->bifrost2.preload_regs = 0xC0;
 333                 meta->bifrost2.uniform_count = MIN2(ss->uniform_count,
 334                                                     ss->uniform_cutoff);
 335         } else {
 336                 meta->midgard1.uniform_count = MIN2(ss->uniform_count,
 337                                                     ss->uniform_cutoff);
 338                 meta->midgard1.work_count = ss->work_reg_count;
 339                 meta->midgard1.flags_hi = 0x8; /* XXX */
 340                 meta->midgard1.flags_lo = 0x220;
 341                 meta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
 342         }
 343
 344 }
 345
 346 static unsigned
 347 panfrost_translate_compare_func(enum pipe_compare_func in)
 348 {
 349         switch (in) {
 350         case PIPE_FUNC_NEVER:
 351                 return MALI_FUNC_NEVER;
 352
 353         case PIPE_FUNC_LESS:
 354                 return MALI_FUNC_LESS;
 355
 356         case PIPE_FUNC_EQUAL:
 357                 return MALI_FUNC_EQUAL;
 358
 359         case PIPE_FUNC_LEQUAL:
 360                 return MALI_FUNC_LEQUAL;
 361
 362         case PIPE_FUNC_GREATER:
 363                 return MALI_FUNC_GREATER;
 364
 365         case PIPE_FUNC_NOTEQUAL:
 366                 return MALI_FUNC_NOTEQUAL;
 367
 368         case PIPE_FUNC_GEQUAL:
 369                 return MALI_FUNC_GEQUAL;
 370
 371         case PIPE_FUNC_ALWAYS:
 372                 return MALI_FUNC_ALWAYS;
 373
 374         default:
 375                 unreachable("Invalid func");
 376         }
 377 }
 378
 379 static unsigned
 380 panfrost_translate_stencil_op(enum pipe_stencil_op in)
 381 {
 382         switch (in) {
 383         case PIPE_STENCIL_OP_KEEP:
 384                 return MALI_STENCIL_KEEP;
 385
 386         case PIPE_STENCIL_OP_ZERO:
 387                 return MALI_STENCIL_ZERO;
 388
 389         case PIPE_STENCIL_OP_REPLACE:
 390                return MALI_STENCIL_REPLACE;
 391
 392         case PIPE_STENCIL_OP_INCR:
 393                 return MALI_STENCIL_INCR;
 394
 395         case PIPE_STENCIL_OP_DECR:
 396                 return MALI_STENCIL_DECR;
 397
 398         case PIPE_STENCIL_OP_INCR_WRAP:
 399                 return MALI_STENCIL_INCR_WRAP;
 400
 401         case PIPE_STENCIL_OP_DECR_WRAP:
 402                 return MALI_STENCIL_DECR_WRAP;
 403
 404         case PIPE_STENCIL_OP_INVERT:
 405                 return MALI_STENCIL_INVERT;
 406
 407         default:
 408                 unreachable("Invalid stencil op");
 409         }
 410 }
 411
 412 static unsigned
 413 translate_tex_wrap(enum pipe_tex_wrap w)
 414 {
 415         switch (w) {
 416         case PIPE_TEX_WRAP_REPEAT:
 417                 return MALI_WRAP_REPEAT;
 418
 419         case PIPE_TEX_WRAP_CLAMP:
 420                 return MALI_WRAP_CLAMP;
 421
 422         case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
 423                 return MALI_WRAP_CLAMP_TO_EDGE;
 424
 425         case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
 426                 return MALI_WRAP_CLAMP_TO_BORDER;
 427
 428         case PIPE_TEX_WRAP_MIRROR_REPEAT:
 429                 return MALI_WRAP_MIRRORED_REPEAT;
 430
 431         case PIPE_TEX_WRAP_MIRROR_CLAMP:
 432                 return MALI_WRAP_MIRRORED_CLAMP;
 433
 434         case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
 435                 return MALI_WRAP_MIRRORED_CLAMP_TO_EDGE;
 436
 437         case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
 438                 return MALI_WRAP_MIRRORED_CLAMP_TO_BORDER;
 439
 440         default:
 441                 unreachable("Invalid wrap");
 442         }
 443 }
 444
 445 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
 446                                 struct mali_sampler_descriptor *hw)
 447 {
 448         unsigned func = panfrost_translate_compare_func(cso->compare_func);
 449         bool min_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
 450         bool mag_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
 451         bool mip_linear  = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR;
 452         unsigned min_filter = min_nearest ? MALI_SAMP_MIN_NEAREST : 0;
 453         unsigned mag_filter = mag_nearest ? MALI_SAMP_MAG_NEAREST : 0;
 454         unsigned mip_filter = mip_linear  ?
 455                               (MALI_SAMP_MIP_LINEAR_1 | MALI_SAMP_MIP_LINEAR_2) : 0;
 456         unsigned normalized = cso->normalized_coords ? MALI_SAMP_NORM_COORDS : 0;
 457
 458         *hw = (struct mali_sampler_descriptor) {
 459                 .filter_mode = min_filter | mag_filter | mip_filter |
 460                                normalized,
 461                 .wrap_s = translate_tex_wrap(cso->wrap_s),
 462                 .wrap_t = translate_tex_wrap(cso->wrap_t),
 463                 .wrap_r = translate_tex_wrap(cso->wrap_r),
 464                 .compare_func = panfrost_flip_compare_func(func),
 465                 .border_color = {
 466                         cso->border_color.f[0],
 467                         cso->border_color.f[1],
 468                         cso->border_color.f[2],
 469                         cso->border_color.f[3]
 470                 },
 471                 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
 472                 .max_lod = FIXED_16(cso->max_lod, false),
 473                 .lod_bias = FIXED_16(cso->lod_bias, true), /* can be negative */
 474                 .seamless_cube_map = cso->seamless_cube_map,
 475         };
 476
 477         /* If necessary, we disable mipmapping in the sampler descriptor by
 478          * clamping the LOD as tight as possible (from 0 to epsilon,
 479          * essentially -- remember these are fixed point numbers, so
 480          * epsilon=1/256) */
 481
 482         if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
 483                 hw->max_lod = hw->min_lod + 1;
 484 }
 485
 486 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
 487                                         struct bifrost_sampler_descriptor *hw)
 488 {
 489         *hw = (struct bifrost_sampler_descriptor) {
 490                 .unk1 = 0x1,
 491                 .wrap_s = translate_tex_wrap(cso->wrap_s),
 492                 .wrap_t = translate_tex_wrap(cso->wrap_t),
 493                 .wrap_r = translate_tex_wrap(cso->wrap_r),
 494                 .unk8 = 0x8,
 495                 .unk2 = 0x2,
 496                 .min_filter = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST,
 497                 .norm_coords = cso->normalized_coords,
 498                 .mip_filter = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR,
 499                 .mag_filter = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR,
 500                 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
 501                 .max_lod = FIXED_16(cso->max_lod, false),
 502         };
 503
 504         /* If necessary, we disable mipmapping in the sampler descriptor by
 505          * clamping the LOD as tight as possible (from 0 to epsilon,
 506          * essentially -- remember these are fixed point numbers, so
 507          * epsilon=1/256) */
 508
 509         if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
 510                 hw->max_lod = hw->min_lod + 1;
 511 }
 512
 513 static void
 514 panfrost_make_stencil_state(const struct pipe_stencil_state *in,
 515                             struct mali_stencil_test *out)
 516 {
 517         out->ref = 0; /* Gallium gets it from elsewhere */
 518
 519         out->mask = in->valuemask;
 520         out->func = panfrost_translate_compare_func(in->func);
 521         out->sfail = panfrost_translate_stencil_op(in->fail_op);
 522         out->dpfail = panfrost_translate_stencil_op(in->zfail_op);
 523         out->dppass = panfrost_translate_stencil_op(in->zpass_op);
 524 }
 525
 526 static void
 527 panfrost_frag_meta_rasterizer_update(struct panfrost_context *ctx,
 528                                      struct mali_shader_meta *fragmeta)
 529 {
 530         if (!ctx->rasterizer) {
 531                 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, true);
 532                 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, false);
 533                 fragmeta->depth_units = 0.0f;
 534                 fragmeta->depth_factor = 0.0f;
 535                 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, false);
 536                 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, false);
 537                 return;
 538         }
 539
 540         bool msaa = ctx->rasterizer->base.multisample;
 541
 542         /* TODO: Sample size */
 543         SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
 544         SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
 545         fragmeta->depth_units = ctx->rasterizer->base.offset_units * 2.0f;
 546         fragmeta->depth_factor = ctx->rasterizer->base.offset_scale;
 547
 548         /* XXX: Which bit is which? Does this maybe allow offseting not-tri? */
 549
 550         SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A,
 551                 ctx->rasterizer->base.offset_tri);
 552         SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B,
 553                 ctx->rasterizer->base.offset_tri);
 554 }
 555
 556 static void
 557 panfrost_frag_meta_zsa_update(struct panfrost_context *ctx,
 558                               struct mali_shader_meta *fragmeta)
 559 {
 560         const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
 561         int zfunc = PIPE_FUNC_ALWAYS;
 562
 563         if (!zsa) {
 564                 struct pipe_stencil_state default_stencil = {
 565                         .enabled = 0,
 566                         .func = PIPE_FUNC_ALWAYS,
 567                         .fail_op = MALI_STENCIL_KEEP,
 568                         .zfail_op = MALI_STENCIL_KEEP,
 569                         .zpass_op = MALI_STENCIL_KEEP,
 570                         .writemask = 0xFF,
 571                         .valuemask = 0xFF
 572                 };
 573
 574                 panfrost_make_stencil_state(&default_stencil,
 575                                             &fragmeta->stencil_front);
 576                 fragmeta->stencil_mask_front = default_stencil.writemask;
 577                 fragmeta->stencil_back = fragmeta->stencil_front;
 578                 fragmeta->stencil_mask_back = default_stencil.writemask;
 579                 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST, false);
 580                 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK, false);
 581         } else {
 582                 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
 583                         zsa->stencil[0].enabled);
 584                 panfrost_make_stencil_state(&zsa->stencil[0],
 585                                             &fragmeta->stencil_front);
 586                 fragmeta->stencil_mask_front = zsa->stencil[0].writemask;
 587                 fragmeta->stencil_front.ref = ctx->stencil_ref.ref_value[0];
 588
 589                 /* If back-stencil is not enabled, use the front values */
 590
 591                 if (zsa->stencil[1].enabled) {
 592                         panfrost_make_stencil_state(&zsa->stencil[1],
 593                                                     &fragmeta->stencil_back);
 594                         fragmeta->stencil_mask_back = zsa->stencil[1].writemask;
 595                         fragmeta->stencil_back.ref = ctx->stencil_ref.ref_value[1];
 596                 } else {
 597                         fragmeta->stencil_back = fragmeta->stencil_front;
 598                         fragmeta->stencil_mask_back = fragmeta->stencil_mask_front;
 599                         fragmeta->stencil_back.ref = fragmeta->stencil_front.ref;
 600                 }
 601
 602                 if (zsa->depth.enabled)
 603                         zfunc = zsa->depth.func;
 604
 605                 /* Depth state (TODO: Refactor) */
 606
 607                 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
 608                         zsa->depth.writemask);
 609         }
 610
 611         fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
 612         fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(zfunc));
 613 }
 614
/* Applies blend state to the fragment shader descriptor and, on devices
 * without the MIDGARD_SFBD quirk, to the per-render-target blend descriptors
 * in rts.
 *
 * ctx:      context supplying the blend state and framebuffer
 * fragmeta: fragment shader descriptor to update
 * rts:      array of per-render-target blend descriptors, interpreted as
 *           bifrost_blend_rt on IS_BIFROST devices and midgard_blend_rt
 *           otherwise; not accessed on MIDGARD_SFBD devices */
static void
panfrost_frag_meta_blend_update(struct panfrost_context *ctx,
                                struct mali_shader_meta *fragmeta,
                                void *rts)
{
        const struct panfrost_device *dev = pan_device(ctx->base.screen);

        /* On SFBD, dithering is controlled from the shader descriptor; a
         * missing blend state leaves the bit cleared */
        SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
                (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
                !ctx->blend->base.dither);

        /* Get blending setup. At least one render target is always
         * processed, even if the framebuffer has no colour buffers. */
        unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);

        struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
        unsigned shader_offset = 0;
        struct panfrost_bo *shader_bo = NULL;

        /* shader_bo and shader_offset are shared across all the calls */
        for (unsigned c = 0; c < rt_count; ++c)
                blend[c] = panfrost_get_blend_for_context(ctx, c, &shader_bo,
                                                          &shader_offset);

        /* If there is a blend shader, work registers are shared. XXX: opt */

        for (unsigned c = 0; c < rt_count; ++c) {
                if (blend[c].is_shader)
                        fragmeta->midgard1.work_count = 16;
        }

        /* Even on MFBD, the shader descriptor gets blend shaders. It's *also*
         * copied to the blend_meta appended (by convention), but this is the
         * field actually read by the hardware. (Or maybe both are read...?).
         * Specify the last RTi with a blend shader. */

        fragmeta->blend.shader = 0;

        /* Walk backwards so the first hit is the last RT with a shader */
        for (signed rt = (rt_count - 1); rt >= 0; --rt) {
                if (!blend[rt].is_shader)
                        continue;

                fragmeta->blend.shader = blend[rt].shader.gpu |
                                         blend[rt].shader.first_tag;
                break;
        }

        if (dev->quirks & MIDGARD_SFBD) {
                /* When only a single render target platform is used, the blend
                 * information is inside the shader meta itself. We additionally
                 * need to signal CAN_DISCARD for nontrivial blend modes (so
                 * we're able to read back the destination buffer) */

                SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
                        blend[0].is_shader);

                if (!blend[0].is_shader) {
                        fragmeta->blend.equation = *blend[0].equation.equation;
                        fragmeta->blend.constant = blend[0].equation.constant;
                }

                SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
                        !blend[0].no_blending);
                return;
        }

        /* Additional blend descriptor tacked on for jobs using MFBD */

        for (unsigned i = 0; i < rt_count; ++i) {
                if (dev->quirks & IS_BIFROST) {
                        struct bifrost_blend_rt *brts = rts;
                        struct panfrost_shader_state *fs;
                        fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);

                        brts[i].flags = 0x200;
                        if (blend[i].is_shader) {
                                /* The blend shader's address needs to be at
                                 * the same top 32 bit as the fragment shader.
                                 * TODO: Ensure that's always the case.
                                 */
                                assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
                                       (fs->bo->gpu & (0xffffffffull << 32)));
                                brts[i].shader = blend[i].shader.gpu;
                                brts[i].unk2 = 0x0;
                        } else {
                                /* NOTE(review): cbufs[i] is dereferenced
                                 * without the nr_cbufs / NULL check that the
                                 * Midgard path below performs, although
                                 * rt_count is 1 even when nr_cbufs is 0 --
                                 * confirm a colour buffer is always bound
                                 * here. */
                                enum pipe_format format = ctx->pipe_framebuffer.cbufs[i]->format;
                                const struct util_format_description *format_desc;
                                format_desc = util_format_description(format);

                                brts[i].equation = *blend[i].equation.equation;

                                /* TODO: this is a bit more complicated */
                                brts[i].constant = blend[i].equation.constant;

                                brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
                                brts[i].unk2 = 0x19;

                                brts[i].shader_type = fs->blend_types[i];
                        }
                } else {
                        struct midgard_blend_rt *mrts = rts;

                        mrts[i].flags = 0x200;

                        /* A missing or unbound colour buffer counts as
                         * non-sRGB */
                        bool is_srgb = (ctx->pipe_framebuffer.nr_cbufs > i) &&
                                       (ctx->pipe_framebuffer.cbufs[i]) &&
                                       util_format_is_srgb(ctx->pipe_framebuffer.cbufs[i]->format);

                        /* NOTE(review): ctx->blend is dereferenced
                         * unconditionally for the dither flag, whereas the
                         * SFBD dither check at the top of this function
                         * guards against a NULL ctx->blend -- confirm it
                         * cannot be NULL here. */
                        SET_BIT(mrts[i].flags, MALI_BLEND_MRT_SHADER, blend[i].is_shader);
                        SET_BIT(mrts[i].flags, MALI_BLEND_LOAD_TIB, !blend[i].no_blending);
                        SET_BIT(mrts[i].flags, MALI_BLEND_SRGB, is_srgb);
                        SET_BIT(mrts[i].flags, MALI_BLEND_NO_DITHER, !ctx->blend->base.dither);

                        if (blend[i].is_shader) {
                                mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
                        } else {
                                mrts[i].blend.equation = *blend[i].equation.equation;
                                mrts[i].blend.constant = blend[i].equation.constant;
                        }
                }
        }
}
 735
 736 static void
 737 panfrost_frag_shader_meta_init(struct panfrost_context *ctx,
 738                                struct mali_shader_meta *fragmeta,
 739                                void *rts)
 740 {
 741         const struct panfrost_device *dev = pan_device(ctx->base.screen);
 742         struct panfrost_shader_state *fs;
 743
 744         fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
 745
 746         fragmeta->alpha_coverage = ~MALI_ALPHA_COVERAGE(0.000000);
 747         fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x3010;
 748         fragmeta->unknown2_4 = 0x4e0;
 749
 750         /* unknown2_4 has 0x10 bit set on T6XX and T720. We don't know why this
 751          * is required (independent of 32-bit/64-bit descriptors), or why it's
 752          * not used on later GPU revisions. Otherwise, all shader jobs fault on
 753          * these earlier chips (perhaps this is a chicken bit of some kind).
 754          * More investigation is needed. */
 755
 756         SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
 757
 758         /* Depending on whether it's legal to in the given shader, we try to
 759          * enable early-z testing (or forward-pixel kill?) */
 760
 761         SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
 762                 !fs->can_discard && !fs->writes_depth);
 763
 764         /* Add the writes Z/S flags if needed. */
 765         SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
 766         SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
 767
 768         /* Any time texturing is used, derivatives are implicitly calculated,
 769          * so we need to enable helper invocations */
 770
 771         SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
 772                 fs->helper_invocations);
 773
 774         /* CAN_DISCARD should be set if the fragment shader possibly contains a
 775          * 'discard' instruction. It is likely this is related to optimizations
 776          * related to forward-pixel kill, as per "Mali Performance 3: Is
 777          * EGL_BUFFER_PRESERVED a good thing?" by Peter Harris */
 778
 779         SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD, fs->can_discard);
 780         SET_BIT(fragmeta->midgard1.flags_lo, 0x400, fs->can_discard);
 781
 782         panfrost_frag_meta_rasterizer_update(ctx, fragmeta);
 783         panfrost_frag_meta_zsa_update(ctx, fragmeta);
 784         panfrost_frag_meta_blend_update(ctx, fragmeta, rts);
 785 }
 786
 787 void
 788 panfrost_emit_shader_meta(struct panfrost_batch *batch,
 789                           enum pipe_shader_type st,
 790                           struct mali_vertex_tiler_postfix *postfix)
 791 {
 792         struct panfrost_context *ctx = batch->ctx;
 793         struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
 794
 795         if (!ss) {
 796                 postfix->shader = 0;
 797                 return;
 798         }
 799
 800         struct mali_shader_meta meta;
 801
 802         panfrost_shader_meta_init(ctx, st, &meta);
 803
 804         /* Add the shader BO to the batch. */
 805         panfrost_batch_add_bo(batch, ss->bo,
 806                               PAN_BO_ACCESS_PRIVATE |
 807                               PAN_BO_ACCESS_READ |
 808                               panfrost_bo_access_for_stage(st));
 809
 810         mali_ptr shader_ptr;
 811
 812         if (st == PIPE_SHADER_FRAGMENT) {
 813                 struct panfrost_device *dev = pan_device(ctx->base.screen);
 814                 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
 815                 size_t desc_size = sizeof(meta);
 816                 void *rts = NULL;
 817                 struct panfrost_transfer xfer;
 818                 unsigned rt_size;
 819
 820                 if (dev->quirks & MIDGARD_SFBD)
 821                         rt_size = 0;
 822                 else if (dev->quirks & IS_BIFROST)
 823                         rt_size = sizeof(struct bifrost_blend_rt);
 824                 else
 825                         rt_size = sizeof(struct midgard_blend_rt);
 826
 827                 desc_size += rt_size * rt_count;
 828
 829                 if (rt_size)
 830                         rts = rzalloc_size(ctx, rt_size * rt_count);
 831
 832                 panfrost_frag_shader_meta_init(ctx, &meta, rts);
 833
 834                 xfer = panfrost_allocate_transient(batch, desc_size);
 835
 836                 memcpy(xfer.cpu, &meta, sizeof(meta));
 837                 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
 838
 839                 if (rt_size)
 840                         ralloc_free(rts);
 841
 842                 shader_ptr = xfer.gpu;
 843         } else {
 844                 shader_ptr = panfrost_upload_transient(batch, &meta,
 845                                                        sizeof(meta));
 846         }
 847
 848         postfix->shader = shader_ptr;
 849 }
 850
 851 static void
 852 panfrost_mali_viewport_init(struct panfrost_context *ctx,
 853                             struct mali_viewport *mvp)
 854 {
 855         const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
 856
 857         /* Clip bounds are encoded as floats. The viewport itself is encoded as
 858          * (somewhat) asymmetric ints. */
 859
 860         const struct pipe_scissor_state *ss = &ctx->scissor;
 861
 862         memset(mvp, 0, sizeof(*mvp));
 863
 864         /* By default, do no viewport clipping, i.e. clip to (-inf, inf) in
 865          * each direction. Clipping to the viewport in theory should work, but
 866          * in practice causes issues when we're not explicitly trying to
 867          * scissor */
 868
 869         *mvp = (struct mali_viewport) {
 870                 .clip_minx = -INFINITY,
 871                 .clip_miny = -INFINITY,
 872                 .clip_maxx = INFINITY,
 873                 .clip_maxy = INFINITY,
 874         };
 875
 876         /* Always scissor to the viewport by default. */
 877         float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
 878         float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
 879
 880         float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
 881         float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
 882
 883         float minz = (vp->translate[2] - fabsf(vp->scale[2]));
 884         float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
 885
 886         /* Apply the scissor test */
 887
 888         unsigned minx, miny, maxx, maxy;
 889
 890         if (ss && ctx->rasterizer && ctx->rasterizer->base.scissor) {
 891                 minx = MAX2(ss->minx, vp_minx);
 892                 miny = MAX2(ss->miny, vp_miny);
 893                 maxx = MIN2(ss->maxx, vp_maxx);
 894                 maxy = MIN2(ss->maxy, vp_maxy);
 895         } else {
 896                 minx = vp_minx;
 897                 miny = vp_miny;
 898                 maxx = vp_maxx;
 899                 maxy = vp_maxy;
 900         }
 901
 902         /* Hardware needs the min/max to be strictly ordered, so flip if we
 903          * need to. The viewport transformation in the vertex shader will
 904          * handle the negatives if we don't */
 905
 906         if (miny > maxy) {
 907                 unsigned temp = miny;
 908                 miny = maxy;
 909                 maxy = temp;
 910         }
 911
 912         if (minx > maxx) {
 913                 unsigned temp = minx;
 914                 minx = maxx;
 915                 maxx = temp;
 916         }
 917
 918         if (minz > maxz) {
 919                 float temp = minz;
 920                 minz = maxz;
 921                 maxz = temp;
 922         }
 923
 924         /* Clamp to the framebuffer size as a last check */
 925
 926         minx = MIN2(ctx->pipe_framebuffer.width, minx);
 927         maxx = MIN2(ctx->pipe_framebuffer.width, maxx);
 928
 929         miny = MIN2(ctx->pipe_framebuffer.height, miny);
 930         maxy = MIN2(ctx->pipe_framebuffer.height, maxy);
 931
 932         /* Upload */
 933
 934         mvp->viewport0[0] = minx;
 935         mvp->viewport1[0] = MALI_POSITIVE(maxx);
 936
 937         mvp->viewport0[1] = miny;
 938         mvp->viewport1[1] = MALI_POSITIVE(maxy);
 939
 940         mvp->clip_minz = minz;
 941         mvp->clip_maxz = maxz;
 942 }
 943
 944 void
 945 panfrost_emit_viewport(struct panfrost_batch *batch,
 946                        struct mali_vertex_tiler_postfix *tiler_postfix)
 947 {
 948         struct panfrost_context *ctx = batch->ctx;
 949         struct mali_viewport mvp;
 950
 951         panfrost_mali_viewport_init(batch->ctx,  &mvp);
 952
 953         /* Update the job, unless we're doing wallpapering (whose lack of
 954          * scissor we can ignore, since if we "miss" a tile of wallpaper, it'll
 955          * just... be faster :) */
 956
 957         if (!ctx->wallpaper_batch)
 958                 panfrost_batch_union_scissor(batch, mvp.viewport0[0],
 959                                              mvp.viewport0[1],
 960                                              mvp.viewport1[0] + 1,
 961                                              mvp.viewport1[1] + 1);
 962
 963         tiler_postfix->viewport = panfrost_upload_transient(batch, &mvp,
 964                                                             sizeof(mvp));
 965 }
 966
 967 static mali_ptr
 968 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
 969                                  enum pipe_shader_type st,
 970                                  struct panfrost_constant_buffer *buf,
 971                                  unsigned index)
 972 {
 973         struct pipe_constant_buffer *cb = &buf->cb[index];
 974         struct panfrost_resource *rsrc = pan_resource(cb->buffer);
 975
 976         if (rsrc) {
 977                 panfrost_batch_add_bo(batch, rsrc->bo,
 978                                       PAN_BO_ACCESS_SHARED |
 979                                       PAN_BO_ACCESS_READ |
 980                                       panfrost_bo_access_for_stage(st));
 981
 982                 /* Alignment gauranteed by
 983                  * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
 984                 return rsrc->bo->gpu + cb->buffer_offset;
 985         } else if (cb->user_buffer) {
 986                 return panfrost_upload_transient(batch,
 987                                                  cb->user_buffer +
 988                                                  cb->buffer_offset,
 989                                                  cb->buffer_size);
 990         } else {
 991                 unreachable("No constant buffer");
 992         }
 993 }
 994
/* One sysval slot as written into the uniform area. The same storage is
 * viewed through whichever member suits the sysval being uploaded. */
struct sysval_uniform {
        union {
                /* Float view: viewport scale/offset, sampler LOD values */
                float f[4];
                /* Signed integer view: texture sizes */
                int32_t i[4];
                /* Unsigned integer view: work group counts, SSBO size */
                uint32_t u[4];
                /* 64-bit view: du[0] holds the SSBO GPU address */
                uint64_t du[2];
        };
};
1003
1004 static void
1005 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
1006                                       struct sysval_uniform *uniform)
1007 {
1008         struct panfrost_context *ctx = batch->ctx;
1009         const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1010
1011         uniform->f[0] = vp->scale[0];
1012         uniform->f[1] = vp->scale[1];
1013         uniform->f[2] = vp->scale[2];
1014 }
1015
1016 static void
1017 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
1018                                        struct sysval_uniform *uniform)
1019 {
1020         struct panfrost_context *ctx = batch->ctx;
1021         const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1022
1023         uniform->f[0] = vp->translate[0];
1024         uniform->f[1] = vp->translate[1];
1025         uniform->f[2] = vp->translate[2];
1026 }
1027
1028 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
1029                                        enum pipe_shader_type st,
1030                                        unsigned int sysvalid,
1031                                        struct sysval_uniform *uniform)
1032 {
1033         struct panfrost_context *ctx = batch->ctx;
1034         unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
1035         unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
1036         bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
1037         struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
1038
1039         assert(dim);
1040         uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
1041
1042         if (dim > 1)
1043                 uniform->i[1] = u_minify(tex->texture->height0,
1044                                          tex->u.tex.first_level);
1045
1046         if (dim > 2)
1047                 uniform->i[2] = u_minify(tex->texture->depth0,
1048                                          tex->u.tex.first_level);
1049
1050         if (is_array)
1051                 uniform->i[dim] = tex->texture->array_size;
1052 }
1053
1054 static void
1055 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
1056                             enum pipe_shader_type st,
1057                             unsigned ssbo_id,
1058                             struct sysval_uniform *uniform)
1059 {
1060         struct panfrost_context *ctx = batch->ctx;
1061
1062         assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
1063         struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
1064
1065         /* Compute address */
1066         struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
1067
1068         panfrost_batch_add_bo(batch, bo,
1069                               PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
1070                               panfrost_bo_access_for_stage(st));
1071
1072         /* Upload address and size as sysval */
1073         uniform->du[0] = bo->gpu + sb.buffer_offset;
1074         uniform->u[2] = sb.buffer_size;
1075 }
1076
1077 static void
1078 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1079                                enum pipe_shader_type st,
1080                                unsigned samp_idx,
1081                                struct sysval_uniform *uniform)
1082 {
1083         struct panfrost_context *ctx = batch->ctx;
1084         struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1085
1086         uniform->f[0] = sampl->min_lod;
1087         uniform->f[1] = sampl->max_lod;
1088         uniform->f[2] = sampl->lod_bias;
1089
1090         /* Even without any errata, Midgard represents "no mipmapping" as
1091          * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1092          * panfrost_create_sampler_state which also explains our choice of
1093          * epsilon value (again to keep behaviour consistent) */
1094
1095         if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1096                 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1097 }
1098
1099 static void
1100 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1101                                        struct sysval_uniform *uniform)
1102 {
1103         struct panfrost_context *ctx = batch->ctx;
1104
1105         uniform->u[0] = ctx->compute_grid->grid[0];
1106         uniform->u[1] = ctx->compute_grid->grid[1];
1107         uniform->u[2] = ctx->compute_grid->grid[2];
1108 }
1109
1110 static void
1111 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1112                         struct panfrost_shader_state *ss,
1113                         enum pipe_shader_type st)
1114 {
1115         struct sysval_uniform *uniforms = (void *)buf;
1116
1117         for (unsigned i = 0; i < ss->sysval_count; ++i) {
1118                 int sysval = ss->sysval[i];
1119
1120                 switch (PAN_SYSVAL_TYPE(sysval)) {
1121                 case PAN_SYSVAL_VIEWPORT_SCALE:
1122                         panfrost_upload_viewport_scale_sysval(batch,
1123                                                               &uniforms[i]);
1124                         break;
1125                 case PAN_SYSVAL_VIEWPORT_OFFSET:
1126                         panfrost_upload_viewport_offset_sysval(batch,
1127                                                                &uniforms[i]);
1128                         break;
1129                 case PAN_SYSVAL_TEXTURE_SIZE:
1130                         panfrost_upload_txs_sysval(batch, st,
1131                                                    PAN_SYSVAL_ID(sysval),
1132                                                    &uniforms[i]);
1133                         break;
1134                 case PAN_SYSVAL_SSBO:
1135                         panfrost_upload_ssbo_sysval(batch, st,
1136                                                     PAN_SYSVAL_ID(sysval),
1137                                                     &uniforms[i]);
1138                         break;
1139                 case PAN_SYSVAL_NUM_WORK_GROUPS:
1140                         panfrost_upload_num_work_groups_sysval(batch,
1141                                                                &uniforms[i]);
1142                         break;
1143                 case PAN_SYSVAL_SAMPLER:
1144                         panfrost_upload_sampler_sysval(batch, st,
1145                                                        PAN_SYSVAL_ID(sysval),
1146                                                        &uniforms[i]);
1147                         break;
1148                 default:
1149                         assert(0);
1150                 }
1151         }
1152 }
1153
1154 static const void *
1155 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1156                                  unsigned index)
1157 {
1158         struct pipe_constant_buffer *cb = &buf->cb[index];
1159         struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1160
1161         if (rsrc)
1162                 return rsrc->bo->cpu;
1163         else if (cb->user_buffer)
1164                 return cb->user_buffer;
1165         else
1166                 unreachable("No constant buffer");
1167 }
1168
1169 void
1170 panfrost_emit_const_buf(struct panfrost_batch *batch,
1171                         enum pipe_shader_type stage,
1172                         struct mali_vertex_tiler_postfix *postfix)
1173 {
1174         struct panfrost_context *ctx = batch->ctx;
1175         struct panfrost_shader_variants *all = ctx->shader[stage];
1176
1177         if (!all)
1178                 return;
1179
1180         struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1181
1182         struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1183
1184         /* Uniforms are implicitly UBO #0 */
1185         bool has_uniforms = buf->enabled_mask & (1 << 0);
1186
1187         /* Allocate room for the sysval and the uniforms */
1188         size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1189         size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1190         size_t size = sys_size + uniform_size;
1191         struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1192                                                                         size);
1193
1194         /* Upload sysvals requested by the shader */
1195         panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1196
1197         /* Upload uniforms */
1198         if (has_uniforms && uniform_size) {
1199                 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1200                 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1201         }
1202
1203         /* Next up, attach UBOs. UBO #0 is the uniforms we just
1204          * uploaded */
1205
1206         unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1207         assert(ubo_count >= 1);
1208
1209         size_t sz = sizeof(uint64_t) * ubo_count;
1210         uint64_t ubos[PAN_MAX_CONST_BUFFERS];
1211         int uniform_count = ss->uniform_count;
1212
1213         /* Upload uniforms as a UBO */
1214         ubos[0] = MALI_MAKE_UBO(2 + uniform_count, transfer.gpu);
1215
1216         /* The rest are honest-to-goodness UBOs */
1217
1218         for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1219                 size_t usz = buf->cb[ubo].buffer_size;
1220                 bool enabled = buf->enabled_mask & (1 << ubo);
1221                 bool empty = usz == 0;
1222
1223                 if (!enabled || empty) {
1224                         /* Stub out disabled UBOs to catch accesses */
1225                         ubos[ubo] = MALI_MAKE_UBO(0, 0xDEAD0000);
1226                         continue;
1227                 }
1228
1229                 mali_ptr gpu = panfrost_map_constant_buffer_gpu(batch, stage,
1230                                                                 buf, ubo);
1231
1232                 unsigned bytes_per_field = 16;
1233                 unsigned aligned = ALIGN_POT(usz, bytes_per_field);
1234                 ubos[ubo] = MALI_MAKE_UBO(aligned / bytes_per_field, gpu);
1235         }
1236
1237         mali_ptr ubufs = panfrost_upload_transient(batch, ubos, sz);
1238         postfix->uniforms = transfer.gpu;
1239         postfix->uniform_buffers = ubufs;
1240
1241         buf->dirty_mask = 0;
1242 }
1243
1244 void
1245 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1246                             const struct pipe_grid_info *info,
1247                             struct midgard_payload_vertex_tiler *vtp)
1248 {
1249         struct panfrost_context *ctx = batch->ctx;
1250         struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1251         struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1252         unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1253                                                            128));
1254         unsigned shared_size = single_size * info->grid[0] * info->grid[1] *
1255                                info->grid[2] * 4;
1256         struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1257                                                                   shared_size,
1258                                                                   1);
1259
1260         struct mali_shared_memory shared = {
1261                 .shared_memory = bo->gpu,
1262                 .shared_workgroup_count =
1263                         util_logbase2_ceil(info->grid[0]) +
1264                         util_logbase2_ceil(info->grid[1]) +
1265                         util_logbase2_ceil(info->grid[2]),
1266                 .shared_unk1 = 0x2,
1267                 .shared_shift = util_logbase2(single_size) - 1
1268         };
1269
1270         vtp->postfix.shared_memory = panfrost_upload_transient(batch, &shared,
1271                                                                sizeof(shared));
1272 }
1273
1274 static mali_ptr
1275 panfrost_get_tex_desc(struct panfrost_batch *batch,
1276                       enum pipe_shader_type st,
1277                       struct panfrost_sampler_view *view)
1278 {
1279         if (!view)
1280                 return (mali_ptr) 0;
1281
1282         struct pipe_sampler_view *pview = &view->base;
1283         struct panfrost_resource *rsrc = pan_resource(pview->texture);
1284
1285         /* Add the BO to the job so it's retained until the job is done. */
1286
1287         panfrost_batch_add_bo(batch, rsrc->bo,
1288                               PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1289                               panfrost_bo_access_for_stage(st));
1290
1291         panfrost_batch_add_bo(batch, view->midgard_bo,
1292                               PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1293                               panfrost_bo_access_for_stage(st));
1294
1295         return view->midgard_bo->gpu;
1296 }
1297
1298 void
1299 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1300                                   enum pipe_shader_type stage,
1301                                   struct mali_vertex_tiler_postfix *postfix)
1302 {
1303         struct panfrost_context *ctx = batch->ctx;
1304         struct panfrost_device *device = pan_device(ctx->base.screen);
1305
1306         if (!ctx->sampler_view_count[stage])
1307                 return;
1308
1309         if (device->quirks & IS_BIFROST) {
1310                 struct bifrost_texture_descriptor *descriptors;
1311
1312                 descriptors = malloc(sizeof(struct bifrost_texture_descriptor) *
1313                                      ctx->sampler_view_count[stage]);
1314
1315                 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1316                         struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1317                         struct pipe_sampler_view *pview = &view->base;
1318                         struct panfrost_resource *rsrc = pan_resource(pview->texture);
1319
1320                         panfrost_batch_add_bo(batch, rsrc->bo,
1321                                               PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1322                                               panfrost_bo_access_for_stage(stage));
1323
1324                         memcpy(&descriptors[i], view->bifrost_descriptor, sizeof(*view->bifrost_descriptor));
1325                 }
1326
1327                 postfix->textures = panfrost_upload_transient(batch,
1328                                                               descriptors,
1329                                                               sizeof(struct bifrost_texture_descriptor) *
1330                                                                       ctx->sampler_view_count[stage]);
1331         } else {
1332                 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1333
1334                 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i)
1335                         trampolines[i] = panfrost_get_tex_desc(batch, stage,
1336                                                                ctx->sampler_views[stage][i]);
1337
1338                 postfix->textures = panfrost_upload_transient(batch,
1339                                                               trampolines,
1340                                                               sizeof(uint64_t) *
1341                                                               ctx->sampler_view_count[stage]);
1342         }
1343 }
1344
1345 void
1346 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1347                                   enum pipe_shader_type stage,
1348                                   struct mali_vertex_tiler_postfix *postfix)
1349 {
1350         struct panfrost_context *ctx = batch->ctx;
1351         struct panfrost_device *device = pan_device(ctx->base.screen);
1352
1353         if (!ctx->sampler_count[stage])
1354                 return;
1355
1356         if (device->quirks & IS_BIFROST) {
1357                 size_t desc_size = sizeof(struct bifrost_sampler_descriptor);
1358                 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1359                 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1360                                                                                 transfer_size);
1361                 struct bifrost_sampler_descriptor *desc = (struct bifrost_sampler_descriptor *)transfer.cpu;
1362
1363                 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1364                         desc[i] = ctx->samplers[stage][i]->bifrost_hw;
1365
1366                 postfix->sampler_descriptor = transfer.gpu;
1367         } else {
1368                 size_t desc_size = sizeof(struct mali_sampler_descriptor);
1369                 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1370                 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1371                                                                                 transfer_size);
1372                 struct mali_sampler_descriptor *desc = (struct mali_sampler_descriptor *)transfer.cpu;
1373
1374                 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1375                         desc[i] = ctx->samplers[stage][i]->midgard_hw;
1376
1377                 postfix->sampler_descriptor = transfer.gpu;
1378         }
1379 }
1380
1381 void
1382 panfrost_emit_vertex_attr_meta(struct panfrost_batch *batch,
1383                                struct mali_vertex_tiler_postfix *vertex_postfix)
1384 {
1385         struct panfrost_context *ctx = batch->ctx;
1386
1387         if (!ctx->vertex)
1388                 return;
1389
1390         struct panfrost_vertex_state *so = ctx->vertex;
1391
1392         panfrost_vertex_state_upd_attr_offs(ctx, vertex_postfix);
1393         vertex_postfix->attribute_meta = panfrost_upload_transient(batch, so->hw,
1394                                                                sizeof(*so->hw) *
1395                                                                PAN_MAX_ATTRIBUTE);
1396 }
1397
1398 void
1399 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1400                           struct mali_vertex_tiler_postfix *vertex_postfix)
1401 {
1402         struct panfrost_context *ctx = batch->ctx;
1403         struct panfrost_vertex_state *so = ctx->vertex;
1404
1405         /* Staged mali_attr, and index into them. i =/= k, depending on the
1406          * vertex buffer mask and instancing. Twice as much room is allocated,
1407          * for a worst case of NPOT_DIVIDEs which take up extra slot */
1408         union mali_attr attrs[PIPE_MAX_ATTRIBS * 2];
1409         unsigned k = 0;
1410
1411         for (unsigned i = 0; i < so->num_elements; ++i) {
1412                 /* We map a mali_attr to be 1:1 with the mali_attr_meta, which
1413                  * means duplicating some vertex buffers (who cares? aside from
1414                  * maybe some caching implications but I somehow doubt that
1415                  * matters) */
1416
1417                 struct pipe_vertex_element *elem = &so->pipe[i];
1418                 unsigned vbi = elem->vertex_buffer_index;
1419
1420                 /* The exception to 1:1 mapping is that we can have multiple
1421                  * entries (NPOT divisors), so we fixup anyways */
1422
1423                 so->hw[i].index = k;
1424
1425                 if (!(ctx->vb_mask & (1 << vbi)))
1426                         continue;
1427
1428                 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1429                 struct panfrost_resource *rsrc;
1430
1431                 rsrc = pan_resource(buf->buffer.resource);
1432                 if (!rsrc)
1433                         continue;
1434
1435                 /* Align to 64 bytes by masking off the lower bits. This
1436                  * will be adjusted back when we fixup the src_offset in
1437                  * mali_attr_meta */
1438
1439                 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1440                 mali_ptr addr = raw_addr & ~63;
1441                 unsigned chopped_addr = raw_addr - addr;
1442
1443                 /* Add a dependency of the batch on the vertex buffer */
1444                 panfrost_batch_add_bo(batch, rsrc->bo,
1445                                       PAN_BO_ACCESS_SHARED |
1446                                       PAN_BO_ACCESS_READ |
1447                                       PAN_BO_ACCESS_VERTEX_TILER);
1448
1449                 /* Set common fields */
1450                 attrs[k].elements = addr;
1451                 attrs[k].stride = buf->stride;
1452
1453                 /* Since we advanced the base pointer, we shrink the buffer
1454                  * size */
1455                 attrs[k].size = rsrc->base.width0 - buf->buffer_offset;
1456
1457                 /* We need to add the extra size we masked off (for
1458                  * correctness) so the data doesn't get clamped away */
1459                 attrs[k].size += chopped_addr;
1460
1461                 /* For non-instancing make sure we initialize */
1462                 attrs[k].shift = attrs[k].extra_flags = 0;
1463
1464                 /* Instancing uses a dramatically different code path than
1465                  * linear, so dispatch for the actual emission now that the
1466                  * common code is finished */
1467
1468                 unsigned divisor = elem->instance_divisor;
1469
1470                 if (divisor && ctx->instance_count == 1) {
1471                         /* Silly corner case where there's a divisor(=1) but
1472                          * there's no legitimate instancing. So we want *every*
1473                          * attribute to be the same. So set stride to zero so
1474                          * we don't go anywhere. */
1475
1476                         attrs[k].size = attrs[k].stride + chopped_addr;
1477                         attrs[k].stride = 0;
1478                         attrs[k++].elements |= MALI_ATTR_LINEAR;
1479                 } else if (ctx->instance_count <= 1) {
1480                         /* Normal, non-instanced attributes */
1481                         attrs[k++].elements |= MALI_ATTR_LINEAR;
1482                 } else {
1483                         unsigned instance_shift = vertex_postfix->instance_shift;
1484                         unsigned instance_odd = vertex_postfix->instance_odd;
1485
1486                         k += panfrost_vertex_instanced(ctx->padded_count,
1487                                                        instance_shift,
1488                                                        instance_odd,
1489                                                        divisor, &attrs[k]);
1490                 }
1491         }
1492
1493         /* Add special gl_VertexID/gl_InstanceID buffers */
1494
1495         panfrost_vertex_id(ctx->padded_count, &attrs[k]);
1496         so->hw[PAN_VERTEX_ID].index = k++;
1497         panfrost_instance_id(ctx->padded_count, &attrs[k]);
1498         so->hw[PAN_INSTANCE_ID].index = k++;
1499
1500         /* Upload whatever we emitted and go */
1501
1502         vertex_postfix->attributes = panfrost_upload_transient(batch, attrs,
1503                                                            k * sizeof(*attrs));
1504 }
1505
1506 static mali_ptr
1507 panfrost_emit_varyings(struct panfrost_batch *batch, union mali_attr *slot,
1508                        unsigned stride, unsigned count)
1509 {
1510         /* Fill out the descriptor */
1511         slot->stride = stride;
1512         slot->size = stride * count;
1513         slot->shift = slot->extra_flags = 0;
1514
1515         struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1516                                                                         slot->size);
1517
1518         slot->elements = transfer.gpu | MALI_ATTR_LINEAR;
1519
1520         return transfer.gpu;
1521 }
1522
1523 static void
1524 panfrost_emit_streamout(struct panfrost_batch *batch, union mali_attr *slot,
1525                         unsigned stride, unsigned offset, unsigned count,
1526                         struct pipe_stream_output_target *target)
1527 {
1528         /* Fill out the descriptor */
1529         slot->stride = stride * 4;
1530         slot->shift = slot->extra_flags = 0;
1531
1532         unsigned max_size = target->buffer_size;
1533         unsigned expected_size = slot->stride * count;
1534
1535         slot->size = MIN2(max_size, expected_size);
1536
1537         /* Grab the BO and bind it to the batch */
1538         struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1539
1540         /* Varyings are WRITE from the perspective of the VERTEX but READ from
1541          * the perspective of the TILER and FRAGMENT.
1542          */
1543         panfrost_batch_add_bo(batch, bo,
1544                               PAN_BO_ACCESS_SHARED |
1545                               PAN_BO_ACCESS_RW |
1546                               PAN_BO_ACCESS_VERTEX_TILER |
1547                               PAN_BO_ACCESS_FRAGMENT);
1548
1549         mali_ptr addr = bo->gpu + target->buffer_offset + (offset * slot->stride);
1550         slot->elements = addr;
1551 }
1552
1553 /* Given a shader and buffer indices, link varying metadata together */
1554
1555 static bool
1556 is_special_varying(gl_varying_slot loc)
1557 {
1558         switch (loc) {
1559         case VARYING_SLOT_POS:
1560         case VARYING_SLOT_PSIZ:
1561         case VARYING_SLOT_PNTC:
1562         case VARYING_SLOT_FACE:
1563                 return true;
1564         default:
1565                 return false;
1566         }
1567 }
1568
1569 static void
1570 panfrost_emit_varying_meta(void *outptr, struct panfrost_shader_state *ss,
1571                            signed general, signed gl_Position,
1572                            signed gl_PointSize, signed gl_PointCoord,
1573                            signed gl_FrontFacing)
1574 {
1575         struct mali_attr_meta *out = (struct mali_attr_meta *) outptr;
1576
1577         for (unsigned i = 0; i < ss->varying_count; ++i) {
1578                 gl_varying_slot location = ss->varyings_loc[i];
1579                 int index = -1;
1580
1581                 switch (location) {
1582                 case VARYING_SLOT_POS:
1583                         index = gl_Position;
1584                         break;
1585                 case VARYING_SLOT_PSIZ:
1586                         index = gl_PointSize;
1587                         break;
1588                 case VARYING_SLOT_PNTC:
1589                         index = gl_PointCoord;
1590                         break;
1591                 case VARYING_SLOT_FACE:
1592                         index = gl_FrontFacing;
1593                         break;
1594                 default:
1595                         index = general;
1596                         break;
1597                 }
1598
1599                 assert(index >= 0);
1600                 out[i].index = index;
1601         }
1602 }
1603
1604 static bool
1605 has_point_coord(unsigned mask, gl_varying_slot loc)
1606 {
1607         if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1608                 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1609         else if (loc == VARYING_SLOT_PNTC)
1610                 return (mask & (1 << 8));
1611         else
1612                 return false;
1613 }
1614
1615 /* Helpers for manipulating stream out information so we can pack varyings
1616  * accordingly. Compute the src_offset for a given captured varying */
1617
1618 static struct pipe_stream_output *
1619 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1620 {
1621         for (unsigned i = 0; i < info->num_outputs; ++i) {
1622                 if (info->output[i].register_index == loc)
1623                         return &info->output[i];
1624         }
1625
1626         unreachable("Varying not captured");
1627 }
1628
1629 /* TODO: Integers */
1630 static enum mali_format
1631 pan_xfb_format(unsigned nr_components)
1632 {
1633         switch (nr_components) {
1634                 case 1: return MALI_R32F;
1635                 case 2: return MALI_RG32F;
1636                 case 3: return MALI_RGB32F;
1637                 case 4: return MALI_RGBA32F;
1638                 default: unreachable("Invalid format");
1639         }
1640 }
1641
1642 void
1643 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1644                                  unsigned vertex_count,
1645                                  struct mali_vertex_tiler_postfix *vertex_postfix,
1646                                  struct mali_vertex_tiler_postfix *tiler_postfix,
1647                                  union midgard_primitive_size *primitive_size)
1648 {
1649         /* Load the shaders */
1650         struct panfrost_context *ctx = batch->ctx;
1651         struct panfrost_shader_state *vs, *fs;
1652         unsigned int num_gen_varyings = 0;
1653         size_t vs_size, fs_size;
1654
1655         /* Allocate the varying descriptor */
1656
1657         vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1658         fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1659         vs_size = sizeof(struct mali_attr_meta) * vs->varying_count;
1660         fs_size = sizeof(struct mali_attr_meta) * fs->varying_count;
1661
1662         struct panfrost_transfer trans = panfrost_allocate_transient(batch,
1663                                                                      vs_size +
1664                                                                      fs_size);
1665
1666         struct pipe_stream_output_info *so = &vs->stream_output;
1667
1668         /* Check if this varying is linked by us. This is the case for
1669          * general-purpose, non-captured varyings. If it is, link it. If it's
1670          * not, use the provided stream out information to determine the
1671          * offset, since it was already linked for us. */
1672
1673         for (unsigned i = 0; i < vs->varying_count; i++) {
1674                 gl_varying_slot loc = vs->varyings_loc[i];
1675
1676                 bool special = is_special_varying(loc);
1677                 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1678
1679                 if (captured) {
1680                         struct pipe_stream_output *o = pan_get_so(so, loc);
1681
1682                         unsigned dst_offset = o->dst_offset * 4; /* dwords */
1683                         vs->varyings[i].src_offset = dst_offset;
1684                 } else if (!special) {
1685                         vs->varyings[i].src_offset = 16 * (num_gen_varyings++);
1686                 }
1687         }
1688
1689         /* Conversely, we need to set src_offset for the captured varyings.
1690          * Here, the layout is defined by the stream out info, not us */
1691
1692         /* Link up with fragment varyings */
1693         bool reads_point_coord = fs->reads_point_coord;
1694
1695         for (unsigned i = 0; i < fs->varying_count; i++) {
1696                 gl_varying_slot loc = fs->varyings_loc[i];
1697                 unsigned src_offset;
1698                 signed vs_idx = -1;
1699
1700                 /* Link up */
1701                 for (unsigned j = 0; j < vs->varying_count; ++j) {
1702                         if (vs->varyings_loc[j] == loc) {
1703                                 vs_idx = j;
1704                                 break;
1705                         }
1706                 }
1707
1708                 /* Either assign or reuse */
1709                 if (vs_idx >= 0)
1710                         src_offset = vs->varyings[vs_idx].src_offset;
1711                 else
1712                         src_offset = 16 * (num_gen_varyings++);
1713
1714                 fs->varyings[i].src_offset = src_offset;
1715
1716                 if (has_point_coord(fs->point_sprite_mask, loc))
1717                         reads_point_coord = true;
1718         }
1719
1720         memcpy(trans.cpu, vs->varyings, vs_size);
1721         memcpy(trans.cpu + vs_size, fs->varyings, fs_size);
1722
1723         union mali_attr varyings[PIPE_MAX_ATTRIBS] = {0};
1724
1725         /* Figure out how many streamout buffers could be bound */
1726         unsigned so_count = ctx->streamout.num_targets;
1727         for (unsigned i = 0; i < vs->varying_count; i++) {
1728                 gl_varying_slot loc = vs->varyings_loc[i];
1729
1730                 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1731                 if (!captured) continue;
1732
1733                 struct pipe_stream_output *o = pan_get_so(so, loc);
1734                 so_count = MAX2(so_count, o->output_buffer + 1);
1735         }
1736
1737         signed idx = so_count;
1738         signed general = idx++;
1739         signed gl_Position = idx++;
1740         signed gl_PointSize = vs->writes_point_size ? (idx++) : -1;
1741         signed gl_PointCoord = reads_point_coord ? (idx++) : -1;
1742         signed gl_FrontFacing = fs->reads_face ? (idx++) : -1;
1743         signed gl_FragCoord = fs->reads_frag_coord ? (idx++) : -1;
1744
1745         /* Emit the stream out buffers */
1746
1747         unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1748                                                            ctx->vertex_count);
1749
1750         for (unsigned i = 0; i < so_count; ++i) {
1751                 if (i < ctx->streamout.num_targets) {
1752                         panfrost_emit_streamout(batch, &varyings[i],
1753                                                 so->stride[i],
1754                                                 ctx->streamout.offsets[i],
1755                                                 out_count,
1756                                                 ctx->streamout.targets[i]);
1757                 } else {
1758                         /* Emit a dummy buffer */
1759                         panfrost_emit_varyings(batch, &varyings[i],
1760                                                so->stride[i] * 4,
1761                                                out_count);
1762
1763                         /* Clear the attribute type */
1764                         varyings[i].elements &= ~0xF;
1765                 }
1766         }
1767
1768         panfrost_emit_varyings(batch, &varyings[general],
1769                                num_gen_varyings * 16,
1770                                vertex_count);
1771
1772         mali_ptr varyings_p;
1773
1774         /* fp32 vec4 gl_Position */
1775         varyings_p = panfrost_emit_varyings(batch, &varyings[gl_Position],
1776                                             sizeof(float) * 4, vertex_count);
1777         tiler_postfix->position_varying = varyings_p;
1778
1779
1780         if (panfrost_writes_point_size(ctx)) {
1781                 varyings_p = panfrost_emit_varyings(batch,
1782                                                     &varyings[gl_PointSize],
1783                                                     2, vertex_count);
1784                 primitive_size->pointer = varyings_p;
1785         }
1786
1787         if (reads_point_coord)
1788                 varyings[gl_PointCoord].elements = MALI_VARYING_POINT_COORD;
1789
1790         if (fs->reads_face)
1791                 varyings[gl_FrontFacing].elements = MALI_VARYING_FRONT_FACING;
1792
1793         if (fs->reads_frag_coord)
1794                 varyings[gl_FragCoord].elements = MALI_VARYING_FRAG_COORD;
1795
1796         struct panfrost_device *device = pan_device(ctx->base.screen);
1797         assert(!(device->quirks & IS_BIFROST) || !(reads_point_coord || fs->reads_face || fs->reads_frag_coord));
1798
1799         /* Let's go ahead and link varying meta to the buffer in question, now
1800          * that that information is available. VARYING_SLOT_POS is mapped to
1801          * gl_FragCoord for fragment shaders but gl_Positionf or vertex shaders
1802          * */
1803
1804         panfrost_emit_varying_meta(trans.cpu, vs, general, gl_Position,
1805                                    gl_PointSize, gl_PointCoord,
1806                                    gl_FrontFacing);
1807
1808         panfrost_emit_varying_meta(trans.cpu + vs_size, fs, general,
1809                                    gl_FragCoord, gl_PointSize,
1810                                    gl_PointCoord, gl_FrontFacing);
1811
1812         /* Replace streamout */
1813
1814         struct mali_attr_meta *ovs = (struct mali_attr_meta *)trans.cpu;
1815         struct mali_attr_meta *ofs = ovs + vs->varying_count;
1816
1817         for (unsigned i = 0; i < vs->varying_count; i++) {
1818                 gl_varying_slot loc = vs->varyings_loc[i];
1819
1820                 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1821                 if (!captured)
1822                         continue;
1823
1824                 struct pipe_stream_output *o = pan_get_so(so, loc);
1825                 ovs[i].index = o->output_buffer;
1826
1827                 /* Set the type appropriately. TODO: Integer varyings XXX */
1828                 assert(o->stream == 0);
1829                 ovs[i].format = pan_xfb_format(o->num_components);
1830                 ovs[i].swizzle = panfrost_get_default_swizzle(o->num_components);
1831
1832                 /* Link to the fragment */
1833                 signed fs_idx = -1;
1834
1835                 /* Link up */
1836                 for (unsigned j = 0; j < fs->varying_count; ++j) {
1837                         if (fs->varyings_loc[j] == loc) {
1838                                 fs_idx = j;
1839                                 break;
1840                         }
1841                 }
1842
1843                 if (fs_idx >= 0) {
1844                         ofs[fs_idx].index = ovs[i].index;
1845                         ofs[fs_idx].format = ovs[i].format;
1846                         ofs[fs_idx].swizzle = ovs[i].swizzle;
1847                 }
1848         }
1849
1850         /* Replace point sprite */
1851         for (unsigned i = 0; i < fs->varying_count; i++) {
1852                 /* If we have a point sprite replacement, handle that here. We
1853                  * have to translate location first.  TODO: Flip y in shader.
1854                  * We're already keying ... just time crunch .. */
1855
1856                 if (has_point_coord(fs->point_sprite_mask,
1857                                     fs->varyings_loc[i])) {
1858                         ofs[i].index = gl_PointCoord;
1859
1860                         /* Swizzle out the z/w to 0/1 */
1861                         ofs[i].format = MALI_RG16F;
1862                         ofs[i].swizzle = panfrost_get_default_swizzle(2);
1863                 }
1864         }
1865
1866         /* Fix up unaligned addresses */
1867         for (unsigned i = 0; i < so_count; ++i) {
1868                 if (varyings[i].elements < MALI_RECORD_SPECIAL)
1869                         continue;
1870
1871                 unsigned align = (varyings[i].elements & 63);
1872
1873                 /* While we're at it, the SO buffers are linear */
1874
1875                 if (!align) {
1876                         varyings[i].elements |= MALI_ATTR_LINEAR;
1877                         continue;
1878                 }
1879
1880                 /* We need to adjust alignment */
1881                 varyings[i].elements &= ~63;
1882                 varyings[i].elements |= MALI_ATTR_LINEAR;
1883                 varyings[i].size += align;
1884
1885                 for (unsigned v = 0; v < vs->varying_count; ++v) {
1886                         if (ovs[v].index != i)
1887                                 continue;
1888
1889                         ovs[v].src_offset = vs->varyings[v].src_offset + align;
1890                 }
1891
1892                 for (unsigned f = 0; f < fs->varying_count; ++f) {
1893                         if (ofs[f].index != i)
1894                                 continue;
1895
1896                         ofs[f].src_offset = fs->varyings[f].src_offset + align;
1897                 }
1898         }
1899
1900         varyings_p = panfrost_upload_transient(batch, varyings,
1901                                                idx * sizeof(*varyings));
1902         vertex_postfix->varyings = varyings_p;
1903         tiler_postfix->varyings = varyings_p;
1904
1905         vertex_postfix->varying_meta = trans.gpu;
1906         tiler_postfix->varying_meta = trans.gpu + vs_size;
1907 }
1908
1909 void
1910 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
1911                                 struct mali_vertex_tiler_prefix *vertex_prefix,
1912                                 struct mali_vertex_tiler_postfix *vertex_postfix,
1913                                 struct mali_vertex_tiler_prefix *tiler_prefix,
1914                                 struct mali_vertex_tiler_postfix *tiler_postfix,
1915                                 union midgard_primitive_size *primitive_size)
1916 {
1917         struct panfrost_context *ctx = batch->ctx;
1918         struct panfrost_device *device = pan_device(ctx->base.screen);
1919         bool wallpapering = ctx->wallpaper_batch && batch->tiler_dep;
1920         struct bifrost_payload_vertex bifrost_vertex = {0,};
1921         struct bifrost_payload_tiler bifrost_tiler = {0,};
1922         struct midgard_payload_vertex_tiler midgard_vertex = {0,};
1923         struct midgard_payload_vertex_tiler midgard_tiler = {0,};
1924         void *vp, *tp;
1925         size_t vp_size, tp_size;
1926
1927         if (device->quirks & IS_BIFROST) {
1928                 bifrost_vertex.prefix = *vertex_prefix;
1929                 bifrost_vertex.postfix = *vertex_postfix;
1930                 vp = &bifrost_vertex;
1931                 vp_size = sizeof(bifrost_vertex);
1932
1933                 bifrost_tiler.prefix = *tiler_prefix;
1934                 bifrost_tiler.tiler.primitive_size = *primitive_size;
1935                 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
1936                 bifrost_tiler.postfix = *tiler_postfix;
1937                 tp = &bifrost_tiler;
1938                 tp_size = sizeof(bifrost_tiler);
1939         } else {
1940                 midgard_vertex.prefix = *vertex_prefix;
1941                 midgard_vertex.postfix = *vertex_postfix;
1942                 vp = &midgard_vertex;
1943                 vp_size = sizeof(midgard_vertex);
1944
1945                 midgard_tiler.prefix = *tiler_prefix;
1946                 midgard_tiler.postfix = *tiler_postfix;
1947                 midgard_tiler.primitive_size = *primitive_size;
1948                 tp = &midgard_tiler;
1949                 tp_size = sizeof(midgard_tiler);
1950         }
1951
1952         if (wallpapering) {
1953                 /* Inject in reverse order, with "predicted" job indices.
1954                  * THIS IS A HACK XXX */
1955                 panfrost_new_job(batch, JOB_TYPE_TILER, false,
1956                                  batch->job_index + 2, tp, tp_size, true);
1957                 panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
1958                                  vp, vp_size, true);
1959                 return;
1960         }
1961
1962         /* If rasterizer discard is enable, only submit the vertex */
1963
1964         bool rasterizer_discard = ctx->rasterizer &&
1965                                   ctx->rasterizer->base.rasterizer_discard;
1966
1967         unsigned vertex = panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
1968                                            vp, vp_size, false);
1969
1970         if (rasterizer_discard)
1971                 return;
1972
1973         panfrost_new_job(batch, JOB_TYPE_TILER, false, vertex, tp, tp_size,
1974                          false);
1975 }
1976
1977 /* TODO: stop hardcoding this */
1978 mali_ptr
1979 panfrost_emit_sample_locations(struct panfrost_batch *batch)
1980 {
1981         uint16_t locations[] = {
1982             128, 128,
1983             0, 256,
1984             0, 256,
1985             0, 256,
1986             0, 256,
1987             0, 256,
1988             0, 256,
1989             0, 256,
1990             0, 256,
1991             0, 256,
1992             0, 256,
1993             0, 256,
1994             0, 256,
1995             0, 256,
1996             0, 256,
1997             0, 256,
1998             0, 256,
1999             0, 256,
2000             0, 256,
2001             0, 256,
2002             0, 256,
2003             0, 256,
2004             0, 256,
2005             0, 256,
2006             0, 256,
2007             0, 256,
2008             0, 256,
2009             0, 256,
2010             0, 256,
2011             0, 256,
2012             0, 256,
2013             0, 256,
2014             128, 128,
2015             0, 0,
2016             0, 0,
2017             0, 0,
2018             0, 0,
2019             0, 0,
2020             0, 0,
2021             0, 0,
2022             0, 0,
2023             0, 0,
2024             0, 0,
2025             0, 0,
2026             0, 0,
2027             0, 0,
2028             0, 0,
2029             0, 0,
2030         };
2031
2032         return panfrost_upload_transient(batch, locations, 96 * sizeof(uint16_t));
2033 }