panfrost: Emit texture descriptor on bifrost
[mesa.git] / src/gallium/drivers/panfrost/pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_allocate.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
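/* On Bifrost, the vertex/tiler postfix points at a shared memory
 * descriptor (holding the per-thread scratchpad sized from the batch's
 * stack usage) rather than a framebuffer pointer, so build one and
 * upload it transiently */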
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 unsigned shift = panfrost_get_stack_shift(batch->stack_size);
62 struct mali_shared_memory shared = {
63 .stack_shift = shift,
64 .scratchpad = panfrost_batch_get_scratchpad(batch, shift, dev->thread_tls_alloc, dev->core_count)->gpu,
65 .shared_workgroup_count = ~0,
66 };
67 postfix->shared_memory = panfrost_upload_transient(batch, &shared, sizeof(shared));
68 }
69
70 static void
71 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
72 struct mali_vertex_tiler_postfix *postfix)
73 {
74 struct panfrost_device *dev = pan_device(ctx->base.screen);
75 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
76
77 /* If we haven't already, reserve space for the framebuffer */
78
79 if (!batch->framebuffer.gpu) {
80 unsigned size = (dev->quirks & MIDGARD_SFBD) ?
81 sizeof(struct mali_single_framebuffer) :
82 sizeof(struct mali_framebuffer);
83
84 batch->framebuffer = panfrost_allocate_transient(batch, size);
85
86 /* Tag the pointer */
87 if (!(dev->quirks & MIDGARD_SFBD))
88 batch->framebuffer.gpu |= MALI_MFBD;
89 }
90
91 postfix->shared_memory = batch->framebuffer.gpu;
92 }
93
94 static void
95 panfrost_vt_update_rasterizer(struct panfrost_context *ctx,
96 struct mali_vertex_tiler_prefix *prefix,
97 struct mali_vertex_tiler_postfix *postfix)
98 {
99 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
100
101 postfix->gl_enables |= 0x7;
102 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
103 rasterizer && rasterizer->base.front_ccw);
104 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
105 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_FRONT));
106 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
107 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_BACK));
108 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
109 rasterizer && rasterizer->base.flatshade_first);
110 }
111
112 void
113 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
114 struct mali_vertex_tiler_prefix *prefix,
115 union midgard_primitive_size *primitive_size)
116 {
117 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
118
119 if (!panfrost_writes_point_size(ctx)) {
120 bool points = prefix->draw_mode == MALI_POINTS;
121 float val = 0.0f;
122
123 if (rasterizer)
124 val = points ?
125 rasterizer->base.point_size :
126 rasterizer->base.line_width;
127
128 primitive_size->constant = val;
129 }
130 }
131
132 static void
133 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
134 struct mali_vertex_tiler_postfix *postfix)
135 {
136 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
137 if (ctx->occlusion_query)
138 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
139 else
140 postfix->occlusion_counter = 0;
141 }
142
143 void
144 panfrost_vt_init(struct panfrost_context *ctx,
145 enum pipe_shader_type stage,
146 struct mali_vertex_tiler_prefix *prefix,
147 struct mali_vertex_tiler_postfix *postfix)
148 {
149 struct panfrost_device *device = pan_device(ctx->base.screen);
150
151 if (!ctx->shader[stage])
152 return;
153
154 memset(prefix, 0, sizeof(*prefix));
155 memset(postfix, 0, sizeof(*postfix));
156
157 if (device->quirks & IS_BIFROST) {
158 postfix->gl_enables = 0x2;
159 panfrost_vt_emit_shared_memory(ctx, postfix);
160 } else {
161 postfix->gl_enables = 0x6;
162 panfrost_vt_attach_framebuffer(ctx, postfix);
163 }
164
165 if (stage == PIPE_SHADER_FRAGMENT) {
166 panfrost_vt_update_occlusion_query(ctx, postfix);
167 panfrost_vt_update_rasterizer(ctx, prefix, postfix);
168 }
169 }
170
171 static unsigned
172 panfrost_translate_index_size(unsigned size)
173 {
174 switch (size) {
175 case 1:
176 return MALI_DRAW_INDEXED_UINT8;
177
178 case 2:
179 return MALI_DRAW_INDEXED_UINT16;
180
181 case 4:
182 return MALI_DRAW_INDEXED_UINT32;
183
184 default:
185 unreachable("Invalid index size");
186 }
187 }
188
189 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
190 * good for the duration of the draw (transient), could last longer. Also get
191 * the bounds on the index buffer for the range accessed by the draw. We do
192 * these operations together because there are natural optimizations which
193 * require them to be together. */
194
195 static mali_ptr
196 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
197 const struct pipe_draw_info *info,
198 unsigned *min_index, unsigned *max_index)
199 {
200 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
201 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
202 off_t offset = info->start * info->index_size;
203 bool needs_indices = true;
204 mali_ptr out = 0;
205
206 if (info->max_index != ~0u) {
207 *min_index = info->min_index;
208 *max_index = info->max_index;
209 needs_indices = false;
210 }
211
212 if (!info->has_user_indices) {
213 /* Only resources can be directly mapped */
214 panfrost_batch_add_bo(batch, rsrc->bo,
215 PAN_BO_ACCESS_SHARED |
216 PAN_BO_ACCESS_READ |
217 PAN_BO_ACCESS_VERTEX_TILER);
218 out = rsrc->bo->gpu + offset;
219
220 /* Check the cache */
221 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
222 info->start,
223 info->count,
224 min_index,
225 max_index);
226 } else {
227 /* Otherwise, we need to upload to transient memory */
228 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
229 out = panfrost_upload_transient(batch, ibuf8 + offset,
230 info->count *
231 info->index_size);
232 }
233
234 if (needs_indices) {
235 /* Fallback */
236 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
237
238 if (!info->has_user_indices)
239 panfrost_minmax_cache_add(rsrc->index_cache,
240 info->start, info->count,
241 *min_index, *max_index);
242 }
243
244 return out;
245 }
246
247 void
248 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
249 const struct pipe_draw_info *info,
250 enum mali_draw_mode draw_mode,
251 struct mali_vertex_tiler_postfix *vertex_postfix,
252 struct mali_vertex_tiler_prefix *tiler_prefix,
253 struct mali_vertex_tiler_postfix *tiler_postfix,
254 unsigned *vertex_count,
255 unsigned *padded_count)
256 {
257 tiler_prefix->draw_mode = draw_mode;
258
259 unsigned draw_flags = 0;
260
261 if (panfrost_writes_point_size(ctx))
262 draw_flags |= MALI_DRAW_VARYING_SIZE;
263
264 if (info->primitive_restart)
265 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
266
267 /* These don't make much sense */
268
269 draw_flags |= 0x3000;
270
271 if (info->index_size) {
272 unsigned min_index = 0, max_index = 0;
273
274 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
275 info,
276 &min_index,
277 &max_index);
278
279 /* Use the corresponding values */
280 *vertex_count = max_index - min_index + 1;
281 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
282 tiler_prefix->offset_bias_correction = -min_index;
283 tiler_prefix->index_count = MALI_POSITIVE(info->count);
284 draw_flags |= panfrost_translate_index_size(info->index_size);
285 } else {
286 tiler_prefix->indices = 0;
287 *vertex_count = ctx->vertex_count;
288 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
289 tiler_prefix->offset_bias_correction = 0;
290 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
291 }
292
293 tiler_prefix->unknown_draw = draw_flags;
294
295 /* Encode the padded vertex count */
296
297 if (info->instance_count > 1) {
298 *padded_count = panfrost_padded_vertex_count(*vertex_count);
299
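/* instance_shift/instance_odd together encode the padded count as
 * (2 * instance_odd + 1) << instance_shift, which is how the instancing
 * hardware appears to consume it; the shift comes from the trailing zero
 * bits and the odd factor from what remains (a reading of the arithmetic
 * below; see also panfrost_padded_vertex_count) */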
300 unsigned shift = __builtin_ctz(ctx->padded_count);
301 unsigned k = ctx->padded_count >> (shift + 1);
302
303 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
304 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
305 } else {
306 *padded_count = *vertex_count;
307
308 /* Reset instancing state */
309 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
310 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
311 }
312 }
313
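/* Fill in the shader descriptor fields common to every stage: the shader
 * pointer and first tag, the attribute/varying/texture/sampler counts,
 * and the Bifrost- or Midgard-specific uniform and flag words */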
314 static void
315 panfrost_shader_meta_init(struct panfrost_context *ctx,
316 enum pipe_shader_type st,
317 struct mali_shader_meta *meta)
318 {
319 const struct panfrost_device *dev = pan_device(ctx->base.screen);
320 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
321
322 memset(meta, 0, sizeof(*meta));
323 meta->shader = (ss->bo ? ss->bo->gpu : 0) | ss->first_tag;
324 meta->attribute_count = ss->attribute_count;
325 meta->varying_count = ss->varying_count;
326 meta->texture_count = ctx->sampler_view_count[st];
327 meta->sampler_count = ctx->sampler_count[st];
328
329 if (dev->quirks & IS_BIFROST) {
330 meta->bifrost1.unk1 = 0x800200;
331 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
332 meta->bifrost2.preload_regs = 0xC0;
333 meta->bifrost2.uniform_count = MIN2(ss->uniform_count,
334 ss->uniform_cutoff);
335 } else {
336 meta->midgard1.uniform_count = MIN2(ss->uniform_count,
337 ss->uniform_cutoff);
338 meta->midgard1.work_count = ss->work_reg_count;
339 meta->midgard1.flags_hi = 0x8; /* XXX */
340 meta->midgard1.flags_lo = 0x220;
341 meta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
342 }
343
344 }
345
346 static unsigned
347 panfrost_translate_compare_func(enum pipe_compare_func in)
348 {
349 switch (in) {
350 case PIPE_FUNC_NEVER:
351 return MALI_FUNC_NEVER;
352
353 case PIPE_FUNC_LESS:
354 return MALI_FUNC_LESS;
355
356 case PIPE_FUNC_EQUAL:
357 return MALI_FUNC_EQUAL;
358
359 case PIPE_FUNC_LEQUAL:
360 return MALI_FUNC_LEQUAL;
361
362 case PIPE_FUNC_GREATER:
363 return MALI_FUNC_GREATER;
364
365 case PIPE_FUNC_NOTEQUAL:
366 return MALI_FUNC_NOTEQUAL;
367
368 case PIPE_FUNC_GEQUAL:
369 return MALI_FUNC_GEQUAL;
370
371 case PIPE_FUNC_ALWAYS:
372 return MALI_FUNC_ALWAYS;
373
374 default:
375 unreachable("Invalid func");
376 }
377 }
378
379 static unsigned
380 panfrost_translate_stencil_op(enum pipe_stencil_op in)
381 {
382 switch (in) {
383 case PIPE_STENCIL_OP_KEEP:
384 return MALI_STENCIL_KEEP;
385
386 case PIPE_STENCIL_OP_ZERO:
387 return MALI_STENCIL_ZERO;
388
389 case PIPE_STENCIL_OP_REPLACE:
390 return MALI_STENCIL_REPLACE;
391
392 case PIPE_STENCIL_OP_INCR:
393 return MALI_STENCIL_INCR;
394
395 case PIPE_STENCIL_OP_DECR:
396 return MALI_STENCIL_DECR;
397
398 case PIPE_STENCIL_OP_INCR_WRAP:
399 return MALI_STENCIL_INCR_WRAP;
400
401 case PIPE_STENCIL_OP_DECR_WRAP:
402 return MALI_STENCIL_DECR_WRAP;
403
404 case PIPE_STENCIL_OP_INVERT:
405 return MALI_STENCIL_INVERT;
406
407 default:
408 unreachable("Invalid stencil op");
409 }
410 }
411
412 static unsigned
413 translate_tex_wrap(enum pipe_tex_wrap w)
414 {
415 switch (w) {
416 case PIPE_TEX_WRAP_REPEAT:
417 return MALI_WRAP_REPEAT;
418
419 case PIPE_TEX_WRAP_CLAMP:
420 return MALI_WRAP_CLAMP;
421
422 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
423 return MALI_WRAP_CLAMP_TO_EDGE;
424
425 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
426 return MALI_WRAP_CLAMP_TO_BORDER;
427
428 case PIPE_TEX_WRAP_MIRROR_REPEAT:
429 return MALI_WRAP_MIRRORED_REPEAT;
430
431 case PIPE_TEX_WRAP_MIRROR_CLAMP:
432 return MALI_WRAP_MIRRORED_CLAMP;
433
434 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
435 return MALI_WRAP_MIRRORED_CLAMP_TO_EDGE;
436
437 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
438 return MALI_WRAP_MIRRORED_CLAMP_TO_BORDER;
439
440 default:
441 unreachable("Invalid wrap");
442 }
443 }
444
445 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
446 struct mali_sampler_descriptor *hw)
447 {
448 unsigned func = panfrost_translate_compare_func(cso->compare_func);
449 bool min_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
450 bool mag_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
451 bool mip_linear = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR;
452 unsigned min_filter = min_nearest ? MALI_SAMP_MIN_NEAREST : 0;
453 unsigned mag_filter = mag_nearest ? MALI_SAMP_MAG_NEAREST : 0;
454 unsigned mip_filter = mip_linear ?
455 (MALI_SAMP_MIP_LINEAR_1 | MALI_SAMP_MIP_LINEAR_2) : 0;
456 unsigned normalized = cso->normalized_coords ? MALI_SAMP_NORM_COORDS : 0;
457
458 *hw = (struct mali_sampler_descriptor) {
459 .filter_mode = min_filter | mag_filter | mip_filter |
460 normalized,
461 .wrap_s = translate_tex_wrap(cso->wrap_s),
462 .wrap_t = translate_tex_wrap(cso->wrap_t),
463 .wrap_r = translate_tex_wrap(cso->wrap_r),
464 .compare_func = panfrost_flip_compare_func(func),
465 .border_color = {
466 cso->border_color.f[0],
467 cso->border_color.f[1],
468 cso->border_color.f[2],
469 cso->border_color.f[3]
470 },
471 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
472 .max_lod = FIXED_16(cso->max_lod, false),
473 .lod_bias = FIXED_16(cso->lod_bias, true), /* can be negative */
474 .seamless_cube_map = cso->seamless_cube_map,
475 };
476
477 /* If necessary, we disable mipmapping in the sampler descriptor by
478 * clamping the LOD as tight as possible (from 0 to epsilon,
479 * essentially -- remember these are fixed point numbers, so
480 * epsilon=1/256) */
481
482 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
483 hw->max_lod = hw->min_lod + 1;
484 }
485
486 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
487 struct bifrost_sampler_descriptor *hw)
488 {
489 *hw = (struct bifrost_sampler_descriptor) {
490 .unk1 = 0x1,
491 .wrap_s = translate_tex_wrap(cso->wrap_s),
492 .wrap_t = translate_tex_wrap(cso->wrap_t),
493 .wrap_r = translate_tex_wrap(cso->wrap_r),
494 .unk8 = 0x8,
495 .unk2 = 0x2,
496 .min_filter = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST,
497 .norm_coords = cso->normalized_coords,
498 .mip_filter = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR,
499 .mag_filter = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR,
500 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
501 .max_lod = FIXED_16(cso->max_lod, false),
502 };
503
504 /* If necessary, we disable mipmapping in the sampler descriptor by
505 * clamping the LOD as tight as possible (from 0 to epsilon,
506 * essentially -- remember these are fixed point numbers, so
507 * epsilon=1/256) */
508
509 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
510 hw->max_lod = hw->min_lod + 1;
511 }
512
513 static void
514 panfrost_make_stencil_state(const struct pipe_stencil_state *in,
515 struct mali_stencil_test *out)
516 {
517 out->ref = 0; /* Gallium gets it from elsewhere */
518
519 out->mask = in->valuemask;
520 out->func = panfrost_translate_compare_func(in->func);
521 out->sfail = panfrost_translate_stencil_op(in->fail_op);
522 out->dpfail = panfrost_translate_stencil_op(in->zfail_op);
523 out->dppass = panfrost_translate_stencil_op(in->zpass_op);
524 }
525
526 static void
527 panfrost_frag_meta_rasterizer_update(struct panfrost_context *ctx,
528 struct mali_shader_meta *fragmeta)
529 {
530 if (!ctx->rasterizer) {
531 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, true);
532 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, false);
533 fragmeta->depth_units = 0.0f;
534 fragmeta->depth_factor = 0.0f;
535 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, false);
536 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, false);
537 return;
538 }
539
540 bool msaa = ctx->rasterizer->base.multisample;
541
542 /* TODO: Sample size */
543 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
544 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
545 fragmeta->depth_units = ctx->rasterizer->base.offset_units * 2.0f;
546 fragmeta->depth_factor = ctx->rasterizer->base.offset_scale;
547
548 /* XXX: Which bit is which? Does this maybe allow offsetting not-tri? */
549
550 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A,
551 ctx->rasterizer->base.offset_tri);
552 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B,
553 ctx->rasterizer->base.offset_tri);
554 }
555
556 static void
557 panfrost_frag_meta_zsa_update(struct panfrost_context *ctx,
558 struct mali_shader_meta *fragmeta)
559 {
560 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
561 int zfunc = PIPE_FUNC_ALWAYS;
562
563 if (!zsa) {
564 struct pipe_stencil_state default_stencil = {
565 .enabled = 0,
566 .func = PIPE_FUNC_ALWAYS,
567 .fail_op = MALI_STENCIL_KEEP,
568 .zfail_op = MALI_STENCIL_KEEP,
569 .zpass_op = MALI_STENCIL_KEEP,
570 .writemask = 0xFF,
571 .valuemask = 0xFF
572 };
573
574 panfrost_make_stencil_state(&default_stencil,
575 &fragmeta->stencil_front);
576 fragmeta->stencil_mask_front = default_stencil.writemask;
577 fragmeta->stencil_back = fragmeta->stencil_front;
578 fragmeta->stencil_mask_back = default_stencil.writemask;
579 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST, false);
580 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK, false);
581 } else {
582 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
583 zsa->stencil[0].enabled);
584 panfrost_make_stencil_state(&zsa->stencil[0],
585 &fragmeta->stencil_front);
586 fragmeta->stencil_mask_front = zsa->stencil[0].writemask;
587 fragmeta->stencil_front.ref = ctx->stencil_ref.ref_value[0];
588
589 /* If back-stencil is not enabled, use the front values */
590
591 if (zsa->stencil[1].enabled) {
592 panfrost_make_stencil_state(&zsa->stencil[1],
593 &fragmeta->stencil_back);
594 fragmeta->stencil_mask_back = zsa->stencil[1].writemask;
595 fragmeta->stencil_back.ref = ctx->stencil_ref.ref_value[1];
596 } else {
597 fragmeta->stencil_back = fragmeta->stencil_front;
598 fragmeta->stencil_mask_back = fragmeta->stencil_mask_front;
599 fragmeta->stencil_back.ref = fragmeta->stencil_front.ref;
600 }
601
602 if (zsa->depth.enabled)
603 zfunc = zsa->depth.func;
604
605 /* Depth state (TODO: Refactor) */
606
607 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
608 zsa->depth.writemask);
609 }
610
611 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
612 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(zfunc));
613 }
614
615 static void
616 panfrost_frag_meta_blend_update(struct panfrost_context *ctx,
617 struct mali_shader_meta *fragmeta,
618 struct midgard_blend_rt *rts)
619 {
620 const struct panfrost_device *dev = pan_device(ctx->base.screen);
621
622 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
623 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
624 !ctx->blend->base.dither);
625
626 /* Get blending setup */
627 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
628
629 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
630 unsigned shader_offset = 0;
631 struct panfrost_bo *shader_bo = NULL;
632
633 for (unsigned c = 0; c < rt_count; ++c)
634 blend[c] = panfrost_get_blend_for_context(ctx, c, &shader_bo,
635 &shader_offset);
636
637 /* If there is a blend shader, work registers are shared. XXX: opt */
638
639 for (unsigned c = 0; c < rt_count; ++c) {
640 if (blend[c].is_shader)
641 fragmeta->midgard1.work_count = 16;
642 }
643
644 /* Even on MFBD, the shader descriptor gets blend shaders. It's *also*
645 * copied to the blend_meta appended (by convention), but this is the
646 * field actually read by the hardware. (Or maybe both are read...?).
647 * Specify the last RTi with a blend shader. */
648
649 fragmeta->blend.shader = 0;
650
651 for (signed rt = (rt_count - 1); rt >= 0; --rt) {
652 if (!blend[rt].is_shader)
653 continue;
654
655 fragmeta->blend.shader = blend[rt].shader.gpu |
656 blend[rt].shader.first_tag;
657 break;
658 }
659
660 if (dev->quirks & MIDGARD_SFBD) {
661 /* On platforms with only a single render target (SFBD), the blend
662 * information is stored inside the shader meta itself. We additionally
663 * need to signal CAN_DISCARD for nontrivial blend modes (so
664 * we're able to read back the destination buffer) */
665
666 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
667 blend[0].is_shader);
668
669 if (!blend[0].is_shader) {
670 fragmeta->blend.equation = *blend[0].equation.equation;
671 fragmeta->blend.constant = blend[0].equation.constant;
672 }
673
674 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
675 !blend[0].no_blending);
676 return;
677 }
678
679 /* Additional blend descriptor tacked on for jobs using MFBD */
680
681 for (unsigned i = 0; i < rt_count; ++i) {
682 rts[i].flags = 0x200;
683
684 bool is_srgb = (ctx->pipe_framebuffer.nr_cbufs > i) &&
685 (ctx->pipe_framebuffer.cbufs[i]) &&
686 util_format_is_srgb(ctx->pipe_framebuffer.cbufs[i]->format);
687
688 SET_BIT(rts[i].flags, MALI_BLEND_MRT_SHADER, blend[i].is_shader);
689 SET_BIT(rts[i].flags, MALI_BLEND_LOAD_TIB, !blend[i].no_blending);
690 SET_BIT(rts[i].flags, MALI_BLEND_SRGB, is_srgb);
691 SET_BIT(rts[i].flags, MALI_BLEND_NO_DITHER, !ctx->blend->base.dither);
692
693 if (blend[i].is_shader) {
694 rts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
695 } else {
696 rts[i].blend.equation = *blend[i].equation.equation;
697 rts[i].blend.constant = blend[i].equation.constant;
698 }
699 }
700 }
701
702 static void
703 panfrost_frag_shader_meta_init(struct panfrost_context *ctx,
704 struct mali_shader_meta *fragmeta,
705 struct midgard_blend_rt *rts)
706 {
707 const struct panfrost_device *dev = pan_device(ctx->base.screen);
708 struct panfrost_shader_state *fs;
709
710 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
711
712 fragmeta->alpha_coverage = ~MALI_ALPHA_COVERAGE(0.000000);
713 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x3010;
714 fragmeta->unknown2_4 = 0x4e0;
715
716 /* unknown2_4 has 0x10 bit set on T6XX and T720. We don't know why this
717 * is required (independent of 32-bit/64-bit descriptors), or why it's
718 * not used on later GPU revisions. Otherwise, all shader jobs fault on
719 * these earlier chips (perhaps this is a chicken bit of some kind).
720 * More investigation is needed. */
721
722 SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
723
724 /* Depending on whether it's legal to do so in the given shader, we try to
725 * enable early-z testing (or forward-pixel kill?) */
726
727 SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
728 !fs->can_discard && !fs->writes_depth);
729
730 /* Add the writes Z/S flags if needed. */
731 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
732 SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
733
734 /* Any time texturing is used, derivatives are implicitly calculated,
735 * so we need to enable helper invocations */
736
737 SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
738 fs->helper_invocations);
739
740 /* CAN_DISCARD should be set if the fragment shader possibly contains a
741 * 'discard' instruction. It is likely this is related to optimizations
742 * related to forward-pixel kill, as per "Mali Performance 3: Is
743 * EGL_BUFFER_PRESERVED a good thing?" by Peter Harris */
744
745 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD, fs->can_discard);
746 SET_BIT(fragmeta->midgard1.flags_lo, 0x400, fs->can_discard);
747
748 panfrost_frag_meta_rasterizer_update(ctx, fragmeta);
749 panfrost_frag_meta_zsa_update(ctx, fragmeta);
750 panfrost_frag_meta_blend_update(ctx, fragmeta, rts);
751 }
752
753 void
754 panfrost_emit_shader_meta(struct panfrost_batch *batch,
755 enum pipe_shader_type st,
756 struct mali_vertex_tiler_postfix *postfix)
757 {
758 struct panfrost_context *ctx = batch->ctx;
759 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
760
761 if (!ss) {
762 postfix->shader = 0;
763 return;
764 }
765
766 struct mali_shader_meta meta;
767
768 panfrost_shader_meta_init(ctx, st, &meta);
769
770 /* Add the shader BO to the batch. */
771 panfrost_batch_add_bo(batch, ss->bo,
772 PAN_BO_ACCESS_PRIVATE |
773 PAN_BO_ACCESS_READ |
774 panfrost_bo_access_for_stage(st));
775
776 mali_ptr shader_ptr;
777
778 if (st == PIPE_SHADER_FRAGMENT) {
779 struct panfrost_device *dev = pan_device(ctx->base.screen);
780 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
781 size_t desc_size = sizeof(meta);
782 struct midgard_blend_rt rts[4];
783 struct panfrost_transfer xfer;
784
785 assert(rt_count <= ARRAY_SIZE(rts));
786
787 panfrost_frag_shader_meta_init(ctx, &meta, rts);
788
789 if (!(dev->quirks & MIDGARD_SFBD))
790 desc_size += sizeof(*rts) * rt_count;
791
792 xfer = panfrost_allocate_transient(batch, desc_size);
793
794 memcpy(xfer.cpu, &meta, sizeof(meta));
795 memcpy(xfer.cpu + sizeof(meta), rts, sizeof(*rts) * rt_count);
796
797 shader_ptr = xfer.gpu;
798 } else {
799 shader_ptr = panfrost_upload_transient(batch, &meta,
800 sizeof(meta));
801 }
802
803 postfix->shader = shader_ptr;
804 }
805
806 static void
807 panfrost_mali_viewport_init(struct panfrost_context *ctx,
808 struct mali_viewport *mvp)
809 {
810 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
811
812 /* Clip bounds are encoded as floats. The viewport itself is encoded as
813 * (somewhat) asymmetric ints. */
814
815 const struct pipe_scissor_state *ss = &ctx->scissor;
816
817 memset(mvp, 0, sizeof(*mvp));
818
819 /* By default, do no viewport clipping, i.e. clip to (-inf, inf) in
820 * each direction. Clipping to the viewport in theory should work, but
821 * in practice causes issues when we're not explicitly trying to
822 * scissor */
823
824 *mvp = (struct mali_viewport) {
825 .clip_minx = -INFINITY,
826 .clip_miny = -INFINITY,
827 .clip_maxx = INFINITY,
828 .clip_maxy = INFINITY,
829 };
830
831 /* Always scissor to the viewport by default. */
832 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
833 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
834
835 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
836 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
837
838 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
839 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
840
841 /* Apply the scissor test */
842
843 unsigned minx, miny, maxx, maxy;
844
845 if (ss && ctx->rasterizer && ctx->rasterizer->base.scissor) {
846 minx = MAX2(ss->minx, vp_minx);
847 miny = MAX2(ss->miny, vp_miny);
848 maxx = MIN2(ss->maxx, vp_maxx);
849 maxy = MIN2(ss->maxy, vp_maxy);
850 } else {
851 minx = vp_minx;
852 miny = vp_miny;
853 maxx = vp_maxx;
854 maxy = vp_maxy;
855 }
856
857 /* Hardware needs the min/max to be strictly ordered, so flip if we
858 * need to. The viewport transformation in the vertex shader will
859 * handle the negatives if we don't */
860
861 if (miny > maxy) {
862 unsigned temp = miny;
863 miny = maxy;
864 maxy = temp;
865 }
866
867 if (minx > maxx) {
868 unsigned temp = minx;
869 minx = maxx;
870 maxx = temp;
871 }
872
873 if (minz > maxz) {
874 float temp = minz;
875 minz = maxz;
876 maxz = temp;
877 }
878
879 /* Clamp to the framebuffer size as a last check */
880
881 minx = MIN2(ctx->pipe_framebuffer.width, minx);
882 maxx = MIN2(ctx->pipe_framebuffer.width, maxx);
883
884 miny = MIN2(ctx->pipe_framebuffer.height, miny);
885 maxy = MIN2(ctx->pipe_framebuffer.height, maxy);
886
887 /* Upload */
888
889 mvp->viewport0[0] = minx;
890 mvp->viewport1[0] = MALI_POSITIVE(maxx);
891
892 mvp->viewport0[1] = miny;
893 mvp->viewport1[1] = MALI_POSITIVE(maxy);
894
895 mvp->clip_minz = minz;
896 mvp->clip_maxz = maxz;
897 }
898
899 void
900 panfrost_emit_viewport(struct panfrost_batch *batch,
901 struct mali_vertex_tiler_postfix *tiler_postfix)
902 {
903 struct panfrost_context *ctx = batch->ctx;
904 struct mali_viewport mvp;
905
906 panfrost_mali_viewport_init(batch->ctx, &mvp);
907
908 /* Update the job, unless we're doing wallpapering (whose lack of
909 * scissor we can ignore, since if we "miss" a tile of wallpaper, it'll
910 * just... be faster :) */
911
912 if (!ctx->wallpaper_batch)
913 panfrost_batch_union_scissor(batch, mvp.viewport0[0],
914 mvp.viewport0[1],
915 mvp.viewport1[0] + 1,
916 mvp.viewport1[1] + 1);
917
918 tiler_postfix->viewport = panfrost_upload_transient(batch, &mvp,
919 sizeof(mvp));
920 }
921
922 static mali_ptr
923 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
924 enum pipe_shader_type st,
925 struct panfrost_constant_buffer *buf,
926 unsigned index)
927 {
928 struct pipe_constant_buffer *cb = &buf->cb[index];
929 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
930
931 if (rsrc) {
932 panfrost_batch_add_bo(batch, rsrc->bo,
933 PAN_BO_ACCESS_SHARED |
934 PAN_BO_ACCESS_READ |
935 panfrost_bo_access_for_stage(st));
936
937 /* Alignment guaranteed by
938 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
939 return rsrc->bo->gpu + cb->buffer_offset;
940 } else if (cb->user_buffer) {
941 return panfrost_upload_transient(batch,
942 cb->user_buffer +
943 cb->buffer_offset,
944 cb->buffer_size);
945 } else {
946 unreachable("No constant buffer");
947 }
948 }
949
950 struct sysval_uniform {
951 union {
952 float f[4];
953 int32_t i[4];
954 uint32_t u[4];
955 uint64_t du[2];
956 };
957 };
958
959 static void
960 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
961 struct sysval_uniform *uniform)
962 {
963 struct panfrost_context *ctx = batch->ctx;
964 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
965
966 uniform->f[0] = vp->scale[0];
967 uniform->f[1] = vp->scale[1];
968 uniform->f[2] = vp->scale[2];
969 }
970
971 static void
972 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
973 struct sysval_uniform *uniform)
974 {
975 struct panfrost_context *ctx = batch->ctx;
976 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
977
978 uniform->f[0] = vp->translate[0];
979 uniform->f[1] = vp->translate[1];
980 uniform->f[2] = vp->translate[2];
981 }
982
983 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
984 enum pipe_shader_type st,
985 unsigned int sysvalid,
986 struct sysval_uniform *uniform)
987 {
988 struct panfrost_context *ctx = batch->ctx;
989 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
990 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
991 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
992 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
993
994 assert(dim);
995 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
996
997 if (dim > 1)
998 uniform->i[1] = u_minify(tex->texture->height0,
999 tex->u.tex.first_level);
1000
1001 if (dim > 2)
1002 uniform->i[2] = u_minify(tex->texture->depth0,
1003 tex->u.tex.first_level);
1004
1005 if (is_array)
1006 uniform->i[dim] = tex->texture->array_size;
1007 }
1008
1009 static void
1010 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
1011 enum pipe_shader_type st,
1012 unsigned ssbo_id,
1013 struct sysval_uniform *uniform)
1014 {
1015 struct panfrost_context *ctx = batch->ctx;
1016
1017 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
1018 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
1019
1020 /* Compute address */
1021 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
1022
1023 panfrost_batch_add_bo(batch, bo,
1024 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
1025 panfrost_bo_access_for_stage(st));
1026
1027 /* Upload address and size as sysval */
1028 uniform->du[0] = bo->gpu + sb.buffer_offset;
1029 uniform->u[2] = sb.buffer_size;
1030 }
1031
1032 static void
1033 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1034 enum pipe_shader_type st,
1035 unsigned samp_idx,
1036 struct sysval_uniform *uniform)
1037 {
1038 struct panfrost_context *ctx = batch->ctx;
1039 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1040
1041 uniform->f[0] = sampl->min_lod;
1042 uniform->f[1] = sampl->max_lod;
1043 uniform->f[2] = sampl->lod_bias;
1044
1045 /* Even without any errata, Midgard represents "no mipmapping" as
1046 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1047 * panfrost_create_sampler_state which also explains our choice of
1048 * epsilon value (again to keep behaviour consistent) */
1049
1050 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1051 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1052 }
1053
1054 static void
1055 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1056 struct sysval_uniform *uniform)
1057 {
1058 struct panfrost_context *ctx = batch->ctx;
1059
1060 uniform->u[0] = ctx->compute_grid->grid[0];
1061 uniform->u[1] = ctx->compute_grid->grid[1];
1062 uniform->u[2] = ctx->compute_grid->grid[2];
1063 }
1064
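/* Walk the shader's sysval table and write one 16-byte (vec4-sized) slot
 * per sysval at the start of the uniform buffer, dispatching on the
 * sysval type */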
1065 static void
1066 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1067 struct panfrost_shader_state *ss,
1068 enum pipe_shader_type st)
1069 {
1070 struct sysval_uniform *uniforms = (void *)buf;
1071
1072 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1073 int sysval = ss->sysval[i];
1074
1075 switch (PAN_SYSVAL_TYPE(sysval)) {
1076 case PAN_SYSVAL_VIEWPORT_SCALE:
1077 panfrost_upload_viewport_scale_sysval(batch,
1078 &uniforms[i]);
1079 break;
1080 case PAN_SYSVAL_VIEWPORT_OFFSET:
1081 panfrost_upload_viewport_offset_sysval(batch,
1082 &uniforms[i]);
1083 break;
1084 case PAN_SYSVAL_TEXTURE_SIZE:
1085 panfrost_upload_txs_sysval(batch, st,
1086 PAN_SYSVAL_ID(sysval),
1087 &uniforms[i]);
1088 break;
1089 case PAN_SYSVAL_SSBO:
1090 panfrost_upload_ssbo_sysval(batch, st,
1091 PAN_SYSVAL_ID(sysval),
1092 &uniforms[i]);
1093 break;
1094 case PAN_SYSVAL_NUM_WORK_GROUPS:
1095 panfrost_upload_num_work_groups_sysval(batch,
1096 &uniforms[i]);
1097 break;
1098 case PAN_SYSVAL_SAMPLER:
1099 panfrost_upload_sampler_sysval(batch, st,
1100 PAN_SYSVAL_ID(sysval),
1101 &uniforms[i]);
1102 break;
1103 default:
1104 assert(0);
1105 }
1106 }
1107 }
1108
1109 static const void *
1110 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1111 unsigned index)
1112 {
1113 struct pipe_constant_buffer *cb = &buf->cb[index];
1114 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1115
1116 if (rsrc)
1117 return rsrc->bo->cpu;
1118 else if (cb->user_buffer)
1119 return cb->user_buffer;
1120 else
1121 unreachable("No constant buffer");
1122 }
1123
1124 void
1125 panfrost_emit_const_buf(struct panfrost_batch *batch,
1126 enum pipe_shader_type stage,
1127 struct mali_vertex_tiler_postfix *postfix)
1128 {
1129 struct panfrost_context *ctx = batch->ctx;
1130 struct panfrost_shader_variants *all = ctx->shader[stage];
1131
1132 if (!all)
1133 return;
1134
1135 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1136
1137 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1138
1139 /* Uniforms are implicitly UBO #0 */
1140 bool has_uniforms = buf->enabled_mask & (1 << 0);
1141
1142 /* Allocate room for the sysval and the uniforms */
1143 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1144 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1145 size_t size = sys_size + uniform_size;
1146 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1147 size);
1148
1149 /* Upload sysvals requested by the shader */
1150 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1151
1152 /* Upload uniforms */
1153 if (has_uniforms && uniform_size) {
1154 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1155 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1156 }
1157
1158 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1159 * uploaded */
1160
1161 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1162 assert(ubo_count >= 1);
1163
1164 size_t sz = sizeof(uint64_t) * ubo_count;
1165 uint64_t ubos[PAN_MAX_CONST_BUFFERS];
1166 int uniform_count = ss->uniform_count;
1167
1168 /* Upload uniforms as a UBO */
1169 ubos[0] = MALI_MAKE_UBO(2 + uniform_count, transfer.gpu);
1170
1171 /* The rest are honest-to-goodness UBOs */
1172
1173 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1174 size_t usz = buf->cb[ubo].buffer_size;
1175 bool enabled = buf->enabled_mask & (1 << ubo);
1176 bool empty = usz == 0;
1177
1178 if (!enabled || empty) {
1179 /* Stub out disabled UBOs to catch accesses */
1180 ubos[ubo] = MALI_MAKE_UBO(0, 0xDEAD0000);
1181 continue;
1182 }
1183
1184 mali_ptr gpu = panfrost_map_constant_buffer_gpu(batch, stage,
1185 buf, ubo);
1186
1187 unsigned bytes_per_field = 16;
1188 unsigned aligned = ALIGN_POT(usz, bytes_per_field);
1189 ubos[ubo] = MALI_MAKE_UBO(aligned / bytes_per_field, gpu);
1190 }
1191
1192 mali_ptr ubufs = panfrost_upload_transient(batch, ubos, sz);
1193 postfix->uniforms = transfer.gpu;
1194 postfix->uniform_buffers = ubufs;
1195
1196 buf->dirty_mask = 0;
1197 }
1198
1199 void
1200 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1201 const struct pipe_grid_info *info,
1202 struct midgard_payload_vertex_tiler *vtp)
1203 {
1204 struct panfrost_context *ctx = batch->ctx;
1205 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1206 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1207 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1208 128));
1209 unsigned shared_size = single_size * info->grid[0] * info->grid[1] *
1210 info->grid[2] * 4;
1211 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1212 shared_size,
1213 1);
1214
1215 struct mali_shared_memory shared = {
1216 .shared_memory = bo->gpu,
1217 .shared_workgroup_count =
1218 util_logbase2_ceil(info->grid[0]) +
1219 util_logbase2_ceil(info->grid[1]) +
1220 util_logbase2_ceil(info->grid[2]),
1221 .shared_unk1 = 0x2,
1222 .shared_shift = util_logbase2(single_size) - 1
1223 };
1224
1225 vtp->postfix.shared_memory = panfrost_upload_transient(batch, &shared,
1226 sizeof(shared));
1227 }
1228
1229 static mali_ptr
1230 panfrost_get_tex_desc(struct panfrost_batch *batch,
1231 enum pipe_shader_type st,
1232 struct panfrost_sampler_view *view)
1233 {
1234 if (!view)
1235 return (mali_ptr) 0;
1236
1237 struct pipe_sampler_view *pview = &view->base;
1238 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1239
1240 /* Add the BO to the job so it's retained until the job is done. */
1241
1242 panfrost_batch_add_bo(batch, rsrc->bo,
1243 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1244 panfrost_bo_access_for_stage(st));
1245
1246 panfrost_batch_add_bo(batch, view->midgard_bo,
1247 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1248 panfrost_bo_access_for_stage(st));
1249
1250 return view->midgard_bo->gpu;
1251 }
1252
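/* Upload texture descriptors for a stage. On Bifrost the descriptors are
 * gathered into one contiguous transient allocation; on Midgard we upload
 * an array of pointers (trampolines) to the per-view descriptors instead */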
1253 void
1254 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1255 enum pipe_shader_type stage,
1256 struct mali_vertex_tiler_postfix *postfix)
1257 {
1258 struct panfrost_context *ctx = batch->ctx;
1259 struct panfrost_device *device = pan_device(ctx->base.screen);
1260
1261 if (!ctx->sampler_view_count[stage])
1262 return;
1263
1264 if (device->quirks & IS_BIFROST) {
1265 struct bifrost_texture_descriptor *descriptors;
1266
1267 descriptors = malloc(sizeof(struct bifrost_texture_descriptor) *
1268 ctx->sampler_view_count[stage]);
1269
1270 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1271 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1272 struct pipe_sampler_view *pview = &view->base;
1273 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1274
1275 panfrost_batch_add_bo(batch, rsrc->bo,
1276 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1277 panfrost_bo_access_for_stage(stage));
1278
1279 memcpy(&descriptors[i], view->bifrost_descriptor, sizeof(*view->bifrost_descriptor));
1280 }
1281
1282 postfix->textures = panfrost_upload_transient(batch,
1283 descriptors,
1284 sizeof(struct bifrost_texture_descriptor) *
1285 ctx->sampler_view_count[stage]);
1286 } else {
1287 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1288
1289 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i)
1290 trampolines[i] = panfrost_get_tex_desc(batch, stage,
1291 ctx->sampler_views[stage][i]);
1292
1293 postfix->textures = panfrost_upload_transient(batch,
1294 trampolines,
1295 sizeof(uint64_t) *
1296 ctx->sampler_view_count[stage]);
1297 }
1298 }
1299
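/* Samplers follow the same pattern on both architectures: copy each bound
 * sampler's prebuilt hardware descriptor into a single transient array and
 * point the postfix at it */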
1300 void
1301 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1302 enum pipe_shader_type stage,
1303 struct mali_vertex_tiler_postfix *postfix)
1304 {
1305 struct panfrost_context *ctx = batch->ctx;
1306 struct panfrost_device *device = pan_device(ctx->base.screen);
1307
1308 if (!ctx->sampler_count[stage])
1309 return;
1310
1311 if (device->quirks & IS_BIFROST) {
1312 size_t desc_size = sizeof(struct bifrost_sampler_descriptor);
1313 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1314 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1315 transfer_size);
1316 struct bifrost_sampler_descriptor *desc = (struct bifrost_sampler_descriptor *)transfer.cpu;
1317
1318 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1319 desc[i] = ctx->samplers[stage][i]->bifrost_hw;
1320
1321 postfix->sampler_descriptor = transfer.gpu;
1322 } else {
1323 size_t desc_size = sizeof(struct mali_sampler_descriptor);
1324 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1325 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1326 transfer_size);
1327 struct mali_sampler_descriptor *desc = (struct mali_sampler_descriptor *)transfer.cpu;
1328
1329 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1330 desc[i] = ctx->samplers[stage][i]->midgard_hw;
1331
1332 postfix->sampler_descriptor = transfer.gpu;
1333 }
1334 }
1335
1336 void
1337 panfrost_emit_vertex_attr_meta(struct panfrost_batch *batch,
1338 struct mali_vertex_tiler_postfix *vertex_postfix)
1339 {
1340 struct panfrost_context *ctx = batch->ctx;
1341
1342 if (!ctx->vertex)
1343 return;
1344
1345 struct panfrost_vertex_state *so = ctx->vertex;
1346
1347 panfrost_vertex_state_upd_attr_offs(ctx, vertex_postfix);
1348 vertex_postfix->attribute_meta = panfrost_upload_transient(batch, so->hw,
1349 sizeof(*so->hw) *
1350 PAN_MAX_ATTRIBUTE);
1351 }
1352
1353 void
1354 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1355 struct mali_vertex_tiler_postfix *vertex_postfix)
1356 {
1357 struct panfrost_context *ctx = batch->ctx;
1358 struct panfrost_vertex_state *so = ctx->vertex;
1359
1360 /* Staged mali_attr, and index into them. i =/= k, depending on the
1361 * vertex buffer mask and instancing. Twice as much room is allocated,
1362 * for a worst case of NPOT_DIVIDEs, which take up an extra slot */
1363 union mali_attr attrs[PIPE_MAX_ATTRIBS * 2];
1364 unsigned k = 0;
1365
1366 for (unsigned i = 0; i < so->num_elements; ++i) {
1367 /* We map a mali_attr to be 1:1 with the mali_attr_meta, which
1368 * means duplicating some vertex buffers (who cares? aside from
1369 * maybe some caching implications but I somehow doubt that
1370 * matters) */
1371
1372 struct pipe_vertex_element *elem = &so->pipe[i];
1373 unsigned vbi = elem->vertex_buffer_index;
1374
1375 /* The exception to 1:1 mapping is that we can have multiple
1376 * entries (NPOT divisors), so we fix up anyway */
1377
1378 so->hw[i].index = k;
1379
1380 if (!(ctx->vb_mask & (1 << vbi)))
1381 continue;
1382
1383 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1384 struct panfrost_resource *rsrc;
1385
1386 rsrc = pan_resource(buf->buffer.resource);
1387 if (!rsrc)
1388 continue;
1389
1390 /* Align to 64 bytes by masking off the lower bits. This
1391 * will be adjusted back when we fixup the src_offset in
1392 * mali_attr_meta */
1393
1394 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1395 mali_ptr addr = raw_addr & ~63;
1396 unsigned chopped_addr = raw_addr - addr;
1397
1398 /* Add a dependency of the batch on the vertex buffer */
1399 panfrost_batch_add_bo(batch, rsrc->bo,
1400 PAN_BO_ACCESS_SHARED |
1401 PAN_BO_ACCESS_READ |
1402 PAN_BO_ACCESS_VERTEX_TILER);
1403
1404 /* Set common fields */
1405 attrs[k].elements = addr;
1406 attrs[k].stride = buf->stride;
1407
1408 /* Since we advanced the base pointer, we shrink the buffer
1409 * size */
1410 attrs[k].size = rsrc->base.width0 - buf->buffer_offset;
1411
1412 /* We need to add the extra size we masked off (for
1413 * correctness) so the data doesn't get clamped away */
1414 attrs[k].size += chopped_addr;
1415
1416 /* For non-instancing make sure we initialize */
1417 attrs[k].shift = attrs[k].extra_flags = 0;
1418
1419 /* Instancing uses a dramatically different code path than
1420 * linear, so dispatch for the actual emission now that the
1421 * common code is finished */
1422
1423 unsigned divisor = elem->instance_divisor;
1424
1425 if (divisor && ctx->instance_count == 1) {
1426 /* Silly corner case where there's a divisor(=1) but
1427 * there's no legitimate instancing. So we want *every*
1428 * attribute to be the same. So set stride to zero so
1429 * we don't go anywhere. */
1430
1431 attrs[k].size = attrs[k].stride + chopped_addr;
1432 attrs[k].stride = 0;
1433 attrs[k++].elements |= MALI_ATTR_LINEAR;
1434 } else if (ctx->instance_count <= 1) {
1435 /* Normal, non-instanced attributes */
1436 attrs[k++].elements |= MALI_ATTR_LINEAR;
1437 } else {
1438 unsigned instance_shift = vertex_postfix->instance_shift;
1439 unsigned instance_odd = vertex_postfix->instance_odd;
1440
1441 k += panfrost_vertex_instanced(ctx->padded_count,
1442 instance_shift,
1443 instance_odd,
1444 divisor, &attrs[k]);
1445 }
1446 }
1447
1448 /* Add special gl_VertexID/gl_InstanceID buffers */
1449
1450 panfrost_vertex_id(ctx->padded_count, &attrs[k]);
1451 so->hw[PAN_VERTEX_ID].index = k++;
1452 panfrost_instance_id(ctx->padded_count, &attrs[k]);
1453 so->hw[PAN_INSTANCE_ID].index = k++;
1454
1455 /* Upload whatever we emitted and go */
1456
1457 vertex_postfix->attributes = panfrost_upload_transient(batch, attrs,
1458 k * sizeof(*attrs));
1459 }
1460
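/* Allocate transient storage for a linearly-addressed varying buffer with
 * the given per-vertex stride and count, fill out the attribute record,
 * and return the GPU address of the storage */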
1461 static mali_ptr
1462 panfrost_emit_varyings(struct panfrost_batch *batch, union mali_attr *slot,
1463 unsigned stride, unsigned count)
1464 {
1465 /* Fill out the descriptor */
1466 slot->stride = stride;
1467 slot->size = stride * count;
1468 slot->shift = slot->extra_flags = 0;
1469
1470 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1471 slot->size);
1472
1473 slot->elements = transfer.gpu | MALI_ATTR_LINEAR;
1474
1475 return transfer.gpu;
1476 }
1477
1478 static void
1479 panfrost_emit_streamout(struct panfrost_batch *batch, union mali_attr *slot,
1480 unsigned stride, unsigned offset, unsigned count,
1481 struct pipe_stream_output_target *target)
1482 {
1483 /* Fill out the descriptor */
1484 slot->stride = stride * 4;
1485 slot->shift = slot->extra_flags = 0;
1486
1487 unsigned max_size = target->buffer_size;
1488 unsigned expected_size = slot->stride * count;
1489
1490 slot->size = MIN2(max_size, expected_size);
1491
1492 /* Grab the BO and bind it to the batch */
1493 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1494
1495 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1496 * the perspective of the TILER and FRAGMENT.
1497 */
1498 panfrost_batch_add_bo(batch, bo,
1499 PAN_BO_ACCESS_SHARED |
1500 PAN_BO_ACCESS_RW |
1501 PAN_BO_ACCESS_VERTEX_TILER |
1502 PAN_BO_ACCESS_FRAGMENT);
1503
1504 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * slot->stride);
1505 slot->elements = addr;
1506 }
1507
1508 /* Given a shader and buffer indices, link varying metadata together */
1509
1510 static bool
1511 is_special_varying(gl_varying_slot loc)
1512 {
1513 switch (loc) {
1514 case VARYING_SLOT_POS:
1515 case VARYING_SLOT_PSIZ:
1516 case VARYING_SLOT_PNTC:
1517 case VARYING_SLOT_FACE:
1518 return true;
1519 default:
1520 return false;
1521 }
1522 }
1523
1524 static void
1525 panfrost_emit_varying_meta(void *outptr, struct panfrost_shader_state *ss,
1526 signed general, signed gl_Position,
1527 signed gl_PointSize, signed gl_PointCoord,
1528 signed gl_FrontFacing)
1529 {
1530 struct mali_attr_meta *out = (struct mali_attr_meta *) outptr;
1531
1532 for (unsigned i = 0; i < ss->varying_count; ++i) {
1533 gl_varying_slot location = ss->varyings_loc[i];
1534 int index = -1;
1535
1536 switch (location) {
1537 case VARYING_SLOT_POS:
1538 index = gl_Position;
1539 break;
1540 case VARYING_SLOT_PSIZ:
1541 index = gl_PointSize;
1542 break;
1543 case VARYING_SLOT_PNTC:
1544 index = gl_PointCoord;
1545 break;
1546 case VARYING_SLOT_FACE:
1547 index = gl_FrontFacing;
1548 break;
1549 default:
1550 index = general;
1551 break;
1552 }
1553
1554 assert(index >= 0);
1555 out[i].index = index;
1556 }
1557 }
1558
1559 static bool
1560 has_point_coord(unsigned mask, gl_varying_slot loc)
1561 {
1562 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1563 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1564 else if (loc == VARYING_SLOT_PNTC)
1565 return (mask & (1 << 8));
1566 else
1567 return false;
1568 }
1569
1570 /* Helpers for manipulating stream out information so we can pack varyings
1571 * accordingly. Compute the src_offset for a given captured varying */
1572
1573 static struct pipe_stream_output *
1574 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1575 {
1576 for (unsigned i = 0; i < info->num_outputs; ++i) {
1577 if (info->output[i].register_index == loc)
1578 return &info->output[i];
1579 }
1580
1581 unreachable("Varying not captured");
1582 }
1583
1584 /* TODO: Integers */
1585 static enum mali_format
1586 pan_xfb_format(unsigned nr_components)
1587 {
1588 switch (nr_components) {
1589 case 1: return MALI_R32F;
1590 case 2: return MALI_RG32F;
1591 case 3: return MALI_RGB32F;
1592 case 4: return MALI_RGBA32F;
1593 default: unreachable("Invalid format");
1594 }
1595 }
1596
1597 void
1598 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1599 unsigned vertex_count,
1600 struct mali_vertex_tiler_postfix *vertex_postfix,
1601 struct mali_vertex_tiler_postfix *tiler_postfix,
1602 union midgard_primitive_size *primitive_size)
1603 {
1604 /* Load the shaders */
1605 struct panfrost_context *ctx = batch->ctx;
1606 struct panfrost_shader_state *vs, *fs;
1607 unsigned int num_gen_varyings = 0;
1608 size_t vs_size, fs_size;
1609
1610 /* Allocate the varying descriptor */
1611
1612 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1613 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1614 vs_size = sizeof(struct mali_attr_meta) * vs->varying_count;
1615 fs_size = sizeof(struct mali_attr_meta) * fs->varying_count;
1616
1617 struct panfrost_transfer trans = panfrost_allocate_transient(batch,
1618 vs_size +
1619 fs_size);
1620
1621 struct pipe_stream_output_info *so = &vs->stream_output;
1622
1623 /* Check if this varying is linked by us. This is the case for
1624 * general-purpose, non-captured varyings. If it is, link it. If it's
1625 * not, use the provided stream out information to determine the
1626 * offset, since it was already linked for us. */
1627
1628 for (unsigned i = 0; i < vs->varying_count; i++) {
1629 gl_varying_slot loc = vs->varyings_loc[i];
1630
1631 bool special = is_special_varying(loc);
1632 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1633
1634 if (captured) {
1635 struct pipe_stream_output *o = pan_get_so(so, loc);
1636
1637 unsigned dst_offset = o->dst_offset * 4; /* dwords */
1638 vs->varyings[i].src_offset = dst_offset;
1639 } else if (!special) {
1640 vs->varyings[i].src_offset = 16 * (num_gen_varyings++);
1641 }
1642 }
1643
1644 /* Conversely, we need to set src_offset for the captured varyings.
1645 * Here, the layout is defined by the stream out info, not us */
1646
1647 /* Link up with fragment varyings */
1648 bool reads_point_coord = fs->reads_point_coord;
1649
1650 for (unsigned i = 0; i < fs->varying_count; i++) {
1651 gl_varying_slot loc = fs->varyings_loc[i];
1652 unsigned src_offset;
1653 signed vs_idx = -1;
1654
1655 /* Link up */
1656 for (unsigned j = 0; j < vs->varying_count; ++j) {
1657 if (vs->varyings_loc[j] == loc) {
1658 vs_idx = j;
1659 break;
1660 }
1661 }
1662
1663 /* Either assign or reuse */
1664 if (vs_idx >= 0)
1665 src_offset = vs->varyings[vs_idx].src_offset;
1666 else
1667 src_offset = 16 * (num_gen_varyings++);
1668
1669 fs->varyings[i].src_offset = src_offset;
1670
1671 if (has_point_coord(fs->point_sprite_mask, loc))
1672 reads_point_coord = true;
1673 }
1674
1675 memcpy(trans.cpu, vs->varyings, vs_size);
1676 memcpy(trans.cpu + vs_size, fs->varyings, fs_size);
1677
1678 union mali_attr varyings[PIPE_MAX_ATTRIBS] = {0};
1679
1680 /* Figure out how many streamout buffers could be bound */
1681 unsigned so_count = ctx->streamout.num_targets;
1682 for (unsigned i = 0; i < vs->varying_count; i++) {
1683 gl_varying_slot loc = vs->varyings_loc[i];
1684
1685 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1686 if (!captured) continue;
1687
1688 struct pipe_stream_output *o = pan_get_so(so, loc);
1689 so_count = MAX2(so_count, o->output_buffer + 1);
1690 }
1691
1692 signed idx = so_count;
1693 signed general = idx++;
1694 signed gl_Position = idx++;
1695 signed gl_PointSize = vs->writes_point_size ? (idx++) : -1;
1696 signed gl_PointCoord = reads_point_coord ? (idx++) : -1;
1697 signed gl_FrontFacing = fs->reads_face ? (idx++) : -1;
1698 signed gl_FragCoord = fs->reads_frag_coord ? (idx++) : -1;
1699
1700 /* Emit the stream out buffers */
1701
1702 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1703 ctx->vertex_count);
1704
1705 for (unsigned i = 0; i < so_count; ++i) {
1706 if (i < ctx->streamout.num_targets) {
1707 panfrost_emit_streamout(batch, &varyings[i],
1708 so->stride[i],
1709 ctx->streamout.offsets[i],
1710 out_count,
1711 ctx->streamout.targets[i]);
1712 } else {
1713 /* Emit a dummy buffer */
1714 panfrost_emit_varyings(batch, &varyings[i],
1715 so->stride[i] * 4,
1716 out_count);
1717
1718 /* Clear the attribute type */
1719 varyings[i].elements &= ~0xF;
1720 }
1721 }
1722
1723 panfrost_emit_varyings(batch, &varyings[general],
1724 num_gen_varyings * 16,
1725 vertex_count);
1726
1727 mali_ptr varyings_p;
1728
1729 /* fp32 vec4 gl_Position */
1730 varyings_p = panfrost_emit_varyings(batch, &varyings[gl_Position],
1731 sizeof(float) * 4, vertex_count);
1732 tiler_postfix->position_varying = varyings_p;
1733
1734
1735 if (panfrost_writes_point_size(ctx)) {
1736 varyings_p = panfrost_emit_varyings(batch,
1737 &varyings[gl_PointSize],
1738 2, vertex_count);
1739 primitive_size->pointer = varyings_p;
1740 }
1741
1742 if (reads_point_coord)
1743 varyings[gl_PointCoord].elements = MALI_VARYING_POINT_COORD;
1744
1745 if (fs->reads_face)
1746 varyings[gl_FrontFacing].elements = MALI_VARYING_FRONT_FACING;
1747
1748 if (fs->reads_frag_coord)
1749 varyings[gl_FragCoord].elements = MALI_VARYING_FRAG_COORD;
1750
1751 struct panfrost_device *device = pan_device(ctx->base.screen);
1752 assert(!(device->quirks & IS_BIFROST) || !(reads_point_coord || fs->reads_face || fs->reads_frag_coord));
1753
1754 /* Let's go ahead and link varying meta to the buffer in question, now
1755 * that that information is available. VARYING_SLOT_POS is mapped to
1756 * gl_FragCoord for fragment shaders but gl_Position for vertex shaders
1757 * */
1758
1759 panfrost_emit_varying_meta(trans.cpu, vs, general, gl_Position,
1760 gl_PointSize, gl_PointCoord,
1761 gl_FrontFacing);
1762
1763 panfrost_emit_varying_meta(trans.cpu + vs_size, fs, general,
1764 gl_FragCoord, gl_PointSize,
1765 gl_PointCoord, gl_FrontFacing);
1766
1767 /* Replace streamout */
1768
1769 struct mali_attr_meta *ovs = (struct mali_attr_meta *)trans.cpu;
1770 struct mali_attr_meta *ofs = ovs + vs->varying_count;
1771
1772 for (unsigned i = 0; i < vs->varying_count; i++) {
1773 gl_varying_slot loc = vs->varyings_loc[i];
1774
1775 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1776 if (!captured)
1777 continue;
1778
1779 struct pipe_stream_output *o = pan_get_so(so, loc);
1780 ovs[i].index = o->output_buffer;
1781
1782 /* Set the type appropriately. TODO: Integer varyings XXX */
1783 assert(o->stream == 0);
1784 ovs[i].format = pan_xfb_format(o->num_components);
1785 ovs[i].swizzle = panfrost_get_default_swizzle(o->num_components);
1786
1787 /* Link to the fragment */
1788 signed fs_idx = -1;
1789
1790 /* Link up */
1791 for (unsigned j = 0; j < fs->varying_count; ++j) {
1792 if (fs->varyings_loc[j] == loc) {
1793 fs_idx = j;
1794 break;
1795 }
1796 }
1797
1798 if (fs_idx >= 0) {
1799 ofs[fs_idx].index = ovs[i].index;
1800 ofs[fs_idx].format = ovs[i].format;
1801 ofs[fs_idx].swizzle = ovs[i].swizzle;
1802 }
1803 }
1804
1805 /* Replace point sprite */
1806 for (unsigned i = 0; i < fs->varying_count; i++) {
1807 /* If we have a point sprite replacement, handle that here. We
1808 * have to translate location first. TODO: Flip y in shader.
1809 * We're already keying ... just time crunch .. */
1810
1811 if (has_point_coord(fs->point_sprite_mask,
1812 fs->varyings_loc[i])) {
1813 ofs[i].index = gl_PointCoord;
1814
1815 /* Swizzle out the z/w to 0/1 */
1816 ofs[i].format = MALI_RG16F;
1817 ofs[i].swizzle = panfrost_get_default_swizzle(2);
1818 }
1819 }
1820
1821 /* Fix up unaligned addresses */
1822 for (unsigned i = 0; i < so_count; ++i) {
1823 if (varyings[i].elements < MALI_RECORD_SPECIAL)
1824 continue;
1825
1826 unsigned align = (varyings[i].elements & 63);
1827
1828 /* While we're at it, the SO buffers are linear */
1829
1830 if (!align) {
1831 varyings[i].elements |= MALI_ATTR_LINEAR;
1832 continue;
1833 }
1834
1835 /* We need to adjust alignment */
1836 varyings[i].elements &= ~63;
1837 varyings[i].elements |= MALI_ATTR_LINEAR;
1838 varyings[i].size += align;
1839
1840 for (unsigned v = 0; v < vs->varying_count; ++v) {
1841 if (ovs[v].index != i)
1842 continue;
1843
1844 ovs[v].src_offset = vs->varyings[v].src_offset + align;
1845 }
1846
1847 for (unsigned f = 0; f < fs->varying_count; ++f) {
1848 if (ofs[f].index != i)
1849 continue;
1850
1851 ofs[f].src_offset = fs->varyings[f].src_offset + align;
1852 }
1853 }
1854
1855 varyings_p = panfrost_upload_transient(batch, varyings,
1856 idx * sizeof(*varyings));
1857 vertex_postfix->varyings = varyings_p;
1858 tiler_postfix->varyings = varyings_p;
1859
1860 vertex_postfix->varying_meta = trans.gpu;
1861 tiler_postfix->varying_meta = trans.gpu + vs_size;
1862 }
1863
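/* Wrap the prefix/postfix pairs in the architecture-specific payload
 * layout and submit them: a vertex job, plus a tiler job depending on it
 * (skipped entirely under rasterizer discard; wallpapering instead injects
 * the pair in reverse order with predicted indices) */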
1864 void
1865 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
1866 struct mali_vertex_tiler_prefix *vertex_prefix,
1867 struct mali_vertex_tiler_postfix *vertex_postfix,
1868 struct mali_vertex_tiler_prefix *tiler_prefix,
1869 struct mali_vertex_tiler_postfix *tiler_postfix,
1870 union midgard_primitive_size *primitive_size)
1871 {
1872 struct panfrost_context *ctx = batch->ctx;
1873 struct panfrost_device *device = pan_device(ctx->base.screen);
1874 bool wallpapering = ctx->wallpaper_batch && batch->tiler_dep;
1875 struct bifrost_payload_vertex bifrost_vertex = {0,};
1876 struct bifrost_payload_tiler bifrost_tiler = {0,};
1877 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
1878 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
1879 void *vp, *tp;
1880 size_t vp_size, tp_size;
1881
1882 if (device->quirks & IS_BIFROST) {
1883 bifrost_vertex.prefix = *vertex_prefix;
1884 bifrost_vertex.postfix = *vertex_postfix;
1885 vp = &bifrost_vertex;
1886 vp_size = sizeof(bifrost_vertex);
1887
1888 bifrost_tiler.prefix = *tiler_prefix;
1889 bifrost_tiler.tiler.primitive_size = *primitive_size;
1890 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
1891 bifrost_tiler.postfix = *tiler_postfix;
1892 tp = &bifrost_tiler;
1893 tp_size = sizeof(bifrost_tiler);
1894 } else {
1895 midgard_vertex.prefix = *vertex_prefix;
1896 midgard_vertex.postfix = *vertex_postfix;
1897 vp = &midgard_vertex;
1898 vp_size = sizeof(midgard_vertex);
1899
1900 midgard_tiler.prefix = *tiler_prefix;
1901 midgard_tiler.postfix = *tiler_postfix;
1902 midgard_tiler.primitive_size = *primitive_size;
1903 tp = &midgard_tiler;
1904 tp_size = sizeof(midgard_tiler);
1905 }
1906
1907 if (wallpapering) {
1908 /* Inject in reverse order, with "predicted" job indices.
1909 * THIS IS A HACK XXX */
1910 panfrost_new_job(batch, JOB_TYPE_TILER, false,
1911 batch->job_index + 2, tp, tp_size, true);
1912 panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
1913 vp, vp_size, true);
1914 return;
1915 }
1916
1917 /* If rasterizer discard is enabled, only submit the vertex job */
1918
1919 bool rasterizer_discard = ctx->rasterizer &&
1920 ctx->rasterizer->base.rasterizer_discard;
1921
1922 unsigned vertex = panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
1923 vp, vp_size, false);
1924
1925 if (rasterizer_discard)
1926 return;
1927
1928 panfrost_new_job(batch, JOB_TYPE_TILER, false, vertex, tp, tp_size,
1929 false);
1930 }
1931
1932 /* TODO: stop hardcoding this */
1933 mali_ptr
1934 panfrost_emit_sample_locations(struct panfrost_batch *batch)
1935 {
1936 uint16_t locations[] = {
1937 128, 128,
1938 0, 256,
1939 0, 256,
1940 0, 256,
1941 0, 256,
1942 0, 256,
1943 0, 256,
1944 0, 256,
1945 0, 256,
1946 0, 256,
1947 0, 256,
1948 0, 256,
1949 0, 256,
1950 0, 256,
1951 0, 256,
1952 0, 256,
1953 0, 256,
1954 0, 256,
1955 0, 256,
1956 0, 256,
1957 0, 256,
1958 0, 256,
1959 0, 256,
1960 0, 256,
1961 0, 256,
1962 0, 256,
1963 0, 256,
1964 0, 256,
1965 0, 256,
1966 0, 256,
1967 0, 256,
1968 0, 256,
1969 128, 128,
1970 0, 0,
1971 0, 0,
1972 0, 0,
1973 0, 0,
1974 0, 0,
1975 0, 0,
1976 0, 0,
1977 0, 0,
1978 0, 0,
1979 0, 0,
1980 0, 0,
1981 0, 0,
1982 0, 0,
1983 0, 0,
1984 0, 0,
1985 };
1986
1987 return panfrost_upload_transient(batch, locations, 96 * sizeof(uint16_t));
1988 }