1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_allocate.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 unsigned shift = panfrost_get_stack_shift(batch->stack_size);
62 struct mali_shared_memory shared = {
63 .stack_shift = shift,
64 .scratchpad = panfrost_batch_get_scratchpad(batch, shift, dev->thread_tls_alloc, dev->core_count)->gpu,
65 .shared_workgroup_count = ~0,
66 };
67 postfix->shared_memory = panfrost_upload_transient(batch, &shared, sizeof(shared));
68 }
69
70 static void
71 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
72 struct mali_vertex_tiler_postfix *postfix)
73 {
74 struct panfrost_device *dev = pan_device(ctx->base.screen);
75 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
76
77 /* If we haven't, reserve space for the framebuffer */
78
79 if (!batch->framebuffer.gpu) {
80 unsigned size = (dev->quirks & MIDGARD_SFBD) ?
81 sizeof(struct mali_single_framebuffer) :
82 sizeof(struct mali_framebuffer);
83
84 batch->framebuffer = panfrost_allocate_transient(batch, size);
85
86 /* Tag the pointer */
87 if (!(dev->quirks & MIDGARD_SFBD))
88 batch->framebuffer.gpu |= MALI_MFBD;
89 }
90
91 postfix->shared_memory = batch->framebuffer.gpu;
92 }
93
94 static void
95 panfrost_vt_update_rasterizer(struct panfrost_context *ctx,
96 struct mali_vertex_tiler_prefix *prefix,
97 struct mali_vertex_tiler_postfix *postfix)
98 {
99 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
100
101 postfix->gl_enables |= 0x7;
102 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
103 rasterizer && rasterizer->base.front_ccw);
104 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
105 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_FRONT));
106 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
107 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_BACK));
108 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
109 rasterizer && rasterizer->base.flatshade_first);
110 }
111
112 void
113 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
114 struct mali_vertex_tiler_prefix *prefix,
115 union midgard_primitive_size *primitive_size)
116 {
117 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
118
119 if (!panfrost_writes_point_size(ctx)) {
120 bool points = prefix->draw_mode == MALI_POINTS;
121 float val = 0.0f;
122
123 if (rasterizer)
124 val = points ?
125 rasterizer->base.point_size :
126 rasterizer->base.line_width;
127
128 primitive_size->constant = val;
129 }
130 }
131
132 static void
133 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
134 struct mali_vertex_tiler_postfix *postfix)
135 {
136 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
137 if (ctx->occlusion_query)
138 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
139 else
140 postfix->occlusion_counter = 0;
141 }
142
143 void
144 panfrost_vt_init(struct panfrost_context *ctx,
145 enum pipe_shader_type stage,
146 struct mali_vertex_tiler_prefix *prefix,
147 struct mali_vertex_tiler_postfix *postfix)
148 {
149 struct panfrost_device *device = pan_device(ctx->base.screen);
150
151 if (!ctx->shader[stage])
152 return;
153
154 memset(prefix, 0, sizeof(*prefix));
155 memset(postfix, 0, sizeof(*postfix));
156
157 if (device->quirks & IS_BIFROST) {
158 postfix->gl_enables = 0x2;
159 panfrost_vt_emit_shared_memory(ctx, postfix);
160 } else {
161 postfix->gl_enables = 0x6;
162 panfrost_vt_attach_framebuffer(ctx, postfix);
163 }
164
165 if (stage == PIPE_SHADER_FRAGMENT) {
166 panfrost_vt_update_occlusion_query(ctx, postfix);
167 panfrost_vt_update_rasterizer(ctx, prefix, postfix);
168 }
169 }
170
171 static unsigned
172 panfrost_translate_index_size(unsigned size)
173 {
174 switch (size) {
175 case 1:
176 return MALI_DRAW_INDEXED_UINT8;
177
178 case 2:
179 return MALI_DRAW_INDEXED_UINT16;
180
181 case 4:
182 return MALI_DRAW_INDEXED_UINT32;
183
184 default:
185 unreachable("Invalid index size");
186 }
187 }
188
189 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
190 * good for the duration of the draw (transient), though it may last longer.
191 * Also gets the bounds on the index buffer for the range accessed by the
192 * draw. We do these operations together because there are natural
193 * optimizations which require them to be together. */
194
195 static mali_ptr
196 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
197 const struct pipe_draw_info *info,
198 unsigned *min_index, unsigned *max_index)
199 {
200 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
201 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
202 off_t offset = info->start * info->index_size;
203 bool needs_indices = true;
204 mali_ptr out = 0;
205
206 if (info->max_index != ~0u) {
207 *min_index = info->min_index;
208 *max_index = info->max_index;
209 needs_indices = false;
210 }
211
212 if (!info->has_user_indices) {
213 /* Only resources can be directly mapped */
214 panfrost_batch_add_bo(batch, rsrc->bo,
215 PAN_BO_ACCESS_SHARED |
216 PAN_BO_ACCESS_READ |
217 PAN_BO_ACCESS_VERTEX_TILER);
218 out = rsrc->bo->gpu + offset;
219
220 /* Check the cache */
221 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
222 info->start,
223 info->count,
224 min_index,
225 max_index);
226 } else {
227 /* Otherwise, we need to upload to transient memory */
228 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
229 out = panfrost_upload_transient(batch, ibuf8 + offset,
230 info->count *
231 info->index_size);
232 }
233
234 if (needs_indices) {
235 /* Fallback */
236 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
237
238 if (!info->has_user_indices)
239 panfrost_minmax_cache_add(rsrc->index_cache,
240 info->start, info->count,
241 *min_index, *max_index);
242 }
243
244 return out;
245 }
246
247 void
248 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
249 const struct pipe_draw_info *info,
250 enum mali_draw_mode draw_mode,
251 struct mali_vertex_tiler_postfix *vertex_postfix,
252 struct mali_vertex_tiler_prefix *tiler_prefix,
253 struct mali_vertex_tiler_postfix *tiler_postfix,
254 unsigned *vertex_count,
255 unsigned *padded_count)
256 {
257 tiler_prefix->draw_mode = draw_mode;
258
259 unsigned draw_flags = 0;
260
261 if (panfrost_writes_point_size(ctx))
262 draw_flags |= MALI_DRAW_VARYING_SIZE;
263
264 if (info->primitive_restart)
265 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
266
267 /* These don't make much sense */
268
269 draw_flags |= 0x3000;
270
271 if (info->index_size) {
272 unsigned min_index = 0, max_index = 0;
273
274 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
275 info,
276 &min_index,
277 &max_index);
278
279 /* Use the corresponding values */
280 *vertex_count = max_index - min_index + 1;
281 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
282 tiler_prefix->offset_bias_correction = -min_index;
283 tiler_prefix->index_count = MALI_POSITIVE(info->count);
284 draw_flags |= panfrost_translate_index_size(info->index_size);
285 } else {
286 tiler_prefix->indices = 0;
287 *vertex_count = ctx->vertex_count;
288 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
289 tiler_prefix->offset_bias_correction = 0;
290 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
291 }
292
293 tiler_prefix->unknown_draw = draw_flags;
294
295 /* Encode the padded vertex count */
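/* Worked example (decomposition assumed from the shift/odd split below):
 * panfrost_padded_vertex_count() returns a value of the form
 * (2k + 1) << shift, i.e. an odd factor times a power of two. A padded
 * count of 12 = 3 << 2 therefore yields instance_shift = 2 and
 * instance_odd = k = 1, since (2 * 1 + 1) << 2 = 12. */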
296
297 if (info->instance_count > 1) {
298 *padded_count = panfrost_padded_vertex_count(*vertex_count);
299
300 unsigned shift = __builtin_ctz(ctx->padded_count);
301 unsigned k = ctx->padded_count >> (shift + 1);
302
303 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
304 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
305 } else {
306 *padded_count = *vertex_count;
307
308 /* Reset instancing state */
309 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
310 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
311 }
312 }
313
314 static void
315 panfrost_shader_meta_init(struct panfrost_context *ctx,
316 enum pipe_shader_type st,
317 struct mali_shader_meta *meta)
318 {
319 const struct panfrost_device *dev = pan_device(ctx->base.screen);
320 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
321
322 memset(meta, 0, sizeof(*meta));
323 meta->shader = (ss->bo ? ss->bo->gpu : 0) | ss->first_tag;
324 meta->attribute_count = ss->attribute_count;
325 meta->varying_count = ss->varying_count;
326 meta->texture_count = ctx->sampler_view_count[st];
327 meta->sampler_count = ctx->sampler_count[st];
328
329 if (dev->quirks & IS_BIFROST) {
330 if (st == PIPE_SHADER_VERTEX)
331 meta->bifrost1.unk1 = 0x800000;
332 else {
333 /* First clause ATEST |= 0x4000000.
334 * Less than 32 regs |= 0x200 */
335 meta->bifrost1.unk1 = 0x958020;
336 }
337
338 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
339 if (st == PIPE_SHADER_VERTEX)
340 meta->bifrost2.preload_regs = 0xC0;
341 else
342 meta->bifrost2.preload_regs = 0x1;
343 meta->bifrost2.uniform_count = MIN2(ss->uniform_count,
344 ss->uniform_cutoff);
345 } else {
346 meta->midgard1.uniform_count = MIN2(ss->uniform_count,
347 ss->uniform_cutoff);
348 meta->midgard1.work_count = ss->work_reg_count;
349 meta->midgard1.flags_hi = 0x8; /* XXX */
350 meta->midgard1.flags_lo = 0x220;
351 meta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
352 }
353 }
354
355 static unsigned
356 panfrost_translate_compare_func(enum pipe_compare_func in)
357 {
358 switch (in) {
359 case PIPE_FUNC_NEVER:
360 return MALI_FUNC_NEVER;
361
362 case PIPE_FUNC_LESS:
363 return MALI_FUNC_LESS;
364
365 case PIPE_FUNC_EQUAL:
366 return MALI_FUNC_EQUAL;
367
368 case PIPE_FUNC_LEQUAL:
369 return MALI_FUNC_LEQUAL;
370
371 case PIPE_FUNC_GREATER:
372 return MALI_FUNC_GREATER;
373
374 case PIPE_FUNC_NOTEQUAL:
375 return MALI_FUNC_NOTEQUAL;
376
377 case PIPE_FUNC_GEQUAL:
378 return MALI_FUNC_GEQUAL;
379
380 case PIPE_FUNC_ALWAYS:
381 return MALI_FUNC_ALWAYS;
382
383 default:
384 unreachable("Invalid func");
385 }
386 }
387
388 static unsigned
389 panfrost_translate_stencil_op(enum pipe_stencil_op in)
390 {
391 switch (in) {
392 case PIPE_STENCIL_OP_KEEP:
393 return MALI_STENCIL_KEEP;
394
395 case PIPE_STENCIL_OP_ZERO:
396 return MALI_STENCIL_ZERO;
397
398 case PIPE_STENCIL_OP_REPLACE:
399 return MALI_STENCIL_REPLACE;
400
401 case PIPE_STENCIL_OP_INCR:
402 return MALI_STENCIL_INCR;
403
404 case PIPE_STENCIL_OP_DECR:
405 return MALI_STENCIL_DECR;
406
407 case PIPE_STENCIL_OP_INCR_WRAP:
408 return MALI_STENCIL_INCR_WRAP;
409
410 case PIPE_STENCIL_OP_DECR_WRAP:
411 return MALI_STENCIL_DECR_WRAP;
412
413 case PIPE_STENCIL_OP_INVERT:
414 return MALI_STENCIL_INVERT;
415
416 default:
417 unreachable("Invalid stencil op");
418 }
419 }
420
421 static unsigned
422 translate_tex_wrap(enum pipe_tex_wrap w)
423 {
424 switch (w) {
425 case PIPE_TEX_WRAP_REPEAT:
426 return MALI_WRAP_REPEAT;
427
428 case PIPE_TEX_WRAP_CLAMP:
429 return MALI_WRAP_CLAMP;
430
431 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
432 return MALI_WRAP_CLAMP_TO_EDGE;
433
434 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
435 return MALI_WRAP_CLAMP_TO_BORDER;
436
437 case PIPE_TEX_WRAP_MIRROR_REPEAT:
438 return MALI_WRAP_MIRRORED_REPEAT;
439
440 case PIPE_TEX_WRAP_MIRROR_CLAMP:
441 return MALI_WRAP_MIRRORED_CLAMP;
442
443 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
444 return MALI_WRAP_MIRRORED_CLAMP_TO_EDGE;
445
446 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
447 return MALI_WRAP_MIRRORED_CLAMP_TO_BORDER;
448
449 default:
450 unreachable("Invalid wrap");
451 }
452 }
453
454 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
455 struct mali_sampler_descriptor *hw)
456 {
457 unsigned func = panfrost_translate_compare_func(cso->compare_func);
458 bool min_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
459 bool mag_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
460 bool mip_linear = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR;
461 unsigned min_filter = min_nearest ? MALI_SAMP_MIN_NEAREST : 0;
462 unsigned mag_filter = mag_nearest ? MALI_SAMP_MAG_NEAREST : 0;
463 unsigned mip_filter = mip_linear ?
464 (MALI_SAMP_MIP_LINEAR_1 | MALI_SAMP_MIP_LINEAR_2) : 0;
465 unsigned normalized = cso->normalized_coords ? MALI_SAMP_NORM_COORDS : 0;
466
467 *hw = (struct mali_sampler_descriptor) {
468 .filter_mode = min_filter | mag_filter | mip_filter |
469 normalized,
470 .wrap_s = translate_tex_wrap(cso->wrap_s),
471 .wrap_t = translate_tex_wrap(cso->wrap_t),
472 .wrap_r = translate_tex_wrap(cso->wrap_r),
473 .compare_func = panfrost_flip_compare_func(func),
474 .border_color = {
475 cso->border_color.f[0],
476 cso->border_color.f[1],
477 cso->border_color.f[2],
478 cso->border_color.f[3]
479 },
480 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
481 .max_lod = FIXED_16(cso->max_lod, false),
482 .lod_bias = FIXED_16(cso->lod_bias, true), /* can be negative */
483 .seamless_cube_map = cso->seamless_cube_map,
484 };
485
486 /* If necessary, we disable mipmapping in the sampler descriptor by
487 * clamping the LOD as tight as possible (from 0 to epsilon,
488 * essentially -- remember these are fixed point numbers, so
489 * epsilon=1/256) */
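/* Worked example (assuming FIXED_16 is 8.8 fixed point, i.e. value * 256):
 * with min_lod = 0.0, the clamp below produces min_lod = 0 and max_lod = 1
 * in fixed point, an LOD range of [0, 1/256], so only the base level is
 * ever sampled. */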
490
491 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
492 hw->max_lod = hw->min_lod + 1;
493 }
494
495 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
496 struct bifrost_sampler_descriptor *hw)
497 {
498 *hw = (struct bifrost_sampler_descriptor) {
499 .unk1 = 0x1,
500 .wrap_s = translate_tex_wrap(cso->wrap_s),
501 .wrap_t = translate_tex_wrap(cso->wrap_t),
502 .wrap_r = translate_tex_wrap(cso->wrap_r),
503 .unk8 = 0x8,
504 .min_filter = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST,
505 .norm_coords = cso->normalized_coords,
506 .mip_filter = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR,
507 .mag_filter = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR,
508 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
509 .max_lod = FIXED_16(cso->max_lod, false),
510 };
511
512 /* If necessary, we disable mipmapping in the sampler descriptor by
513 * clamping the LOD as tight as possible (from 0 to epsilon,
514 * essentially -- remember these are fixed point numbers, so
515 * epsilon=1/256) */
516
517 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
518 hw->max_lod = hw->min_lod + 1;
519 }
520
521 static void
522 panfrost_make_stencil_state(const struct pipe_stencil_state *in,
523 struct mali_stencil_test *out)
524 {
525 out->ref = 0; /* Gallium gets it from elsewhere */
526
527 out->mask = in->valuemask;
528 out->func = panfrost_translate_compare_func(in->func);
529 out->sfail = panfrost_translate_stencil_op(in->fail_op);
530 out->dpfail = panfrost_translate_stencil_op(in->zfail_op);
531 out->dppass = panfrost_translate_stencil_op(in->zpass_op);
532 }
533
534 static void
535 panfrost_frag_meta_rasterizer_update(struct panfrost_context *ctx,
536 struct mali_shader_meta *fragmeta)
537 {
538 if (!ctx->rasterizer) {
539 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, true);
540 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, false);
541 fragmeta->depth_units = 0.0f;
542 fragmeta->depth_factor = 0.0f;
543 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, false);
544 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, false);
545 return;
546 }
547
548 bool msaa = ctx->rasterizer->base.multisample;
549
550 /* TODO: Sample size */
551 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
552 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
553 fragmeta->depth_units = ctx->rasterizer->base.offset_units * 2.0f;
554 fragmeta->depth_factor = ctx->rasterizer->base.offset_scale;
555
556 /* XXX: Which bit is which? Does this maybe allow offsetting non-tri primitives? */
557
558 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A,
559 ctx->rasterizer->base.offset_tri);
560 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B,
561 ctx->rasterizer->base.offset_tri);
562 }
563
564 static void
565 panfrost_frag_meta_zsa_update(struct panfrost_context *ctx,
566 struct mali_shader_meta *fragmeta)
567 {
568 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
569 int zfunc = PIPE_FUNC_ALWAYS;
570
571 if (!zsa) {
572 struct pipe_stencil_state default_stencil = {
573 .enabled = 0,
574 .func = PIPE_FUNC_ALWAYS,
575 .fail_op = MALI_STENCIL_KEEP,
576 .zfail_op = MALI_STENCIL_KEEP,
577 .zpass_op = MALI_STENCIL_KEEP,
578 .writemask = 0xFF,
579 .valuemask = 0xFF
580 };
581
582 panfrost_make_stencil_state(&default_stencil,
583 &fragmeta->stencil_front);
584 fragmeta->stencil_mask_front = default_stencil.writemask;
585 fragmeta->stencil_back = fragmeta->stencil_front;
586 fragmeta->stencil_mask_back = default_stencil.writemask;
587 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST, false);
588 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK, false);
589 } else {
590 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
591 zsa->stencil[0].enabled);
592 panfrost_make_stencil_state(&zsa->stencil[0],
593 &fragmeta->stencil_front);
594 fragmeta->stencil_mask_front = zsa->stencil[0].writemask;
595 fragmeta->stencil_front.ref = ctx->stencil_ref.ref_value[0];
596
597 /* If back-stencil is not enabled, use the front values */
598
599 if (zsa->stencil[1].enabled) {
600 panfrost_make_stencil_state(&zsa->stencil[1],
601 &fragmeta->stencil_back);
602 fragmeta->stencil_mask_back = zsa->stencil[1].writemask;
603 fragmeta->stencil_back.ref = ctx->stencil_ref.ref_value[1];
604 } else {
605 fragmeta->stencil_back = fragmeta->stencil_front;
606 fragmeta->stencil_mask_back = fragmeta->stencil_mask_front;
607 fragmeta->stencil_back.ref = fragmeta->stencil_front.ref;
608 }
609
610 if (zsa->depth.enabled)
611 zfunc = zsa->depth.func;
612
613 /* Depth state (TODO: Refactor) */
614
615 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
616 zsa->depth.writemask);
617 }
618
619 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
620 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(zfunc));
621 }
622
623 static void
624 panfrost_frag_meta_blend_update(struct panfrost_context *ctx,
625 struct mali_shader_meta *fragmeta,
626 void *rts)
627 {
628 const struct panfrost_device *dev = pan_device(ctx->base.screen);
629
630 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
631 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
632 !ctx->blend->base.dither);
633
634 /* Get blending setup */
635 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
636
637 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
638 unsigned shader_offset = 0;
639 struct panfrost_bo *shader_bo = NULL;
640
641 for (unsigned c = 0; c < rt_count; ++c)
642 blend[c] = panfrost_get_blend_for_context(ctx, c, &shader_bo,
643 &shader_offset);
644
645 /* If there is a blend shader, work registers are shared. XXX: opt */
646
647 if (!(dev->quirks & IS_BIFROST)) {
648 for (unsigned c = 0; c < rt_count; ++c) {
649 if (blend[c].is_shader)
650 fragmeta->midgard1.work_count = 16;
651 }
652 }
653
654 /* Even on MFBD, the shader descriptor gets blend shaders. It's *also*
655 * copied to the appended blend_meta (by convention), but this is the
656 * field actually read by the hardware (or maybe both are read?).
657 * Point it at the last render target that has a blend shader. */
658
659 fragmeta->blend.shader = 0;
660
661 for (signed rt = (rt_count - 1); rt >= 0; --rt) {
662 if (!blend[rt].is_shader)
663 continue;
664
665 fragmeta->blend.shader = blend[rt].shader.gpu |
666 blend[rt].shader.first_tag;
667 break;
668 }
669
670 if (dev->quirks & MIDGARD_SFBD) {
671 /* On single render target (SFBD) platforms, the blend information
672 * lives inside the shader meta itself. We additionally need to signal
673 * CAN_DISCARD for nontrivial blend modes (so we're able to read back
674 * the destination buffer) */
675
676 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
677 blend[0].is_shader);
678
679 if (!blend[0].is_shader) {
680 fragmeta->blend.equation = *blend[0].equation.equation;
681 fragmeta->blend.constant = blend[0].equation.constant;
682 }
683
684 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
685 !blend[0].no_blending);
686 return;
687 }
688
689 /* Additional blend descriptor tacked on for jobs using MFBD */
690
691 for (unsigned i = 0; i < rt_count; ++i) {
692 if (dev->quirks & IS_BIFROST) {
693 struct bifrost_blend_rt *brts = rts;
694 struct panfrost_shader_state *fs;
695 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
696
697 brts[i].flags = 0x200;
698 if (blend[i].is_shader) {
699 /* The blend shader's address needs to be at
700 * the same top 32 bit as the fragment shader.
701 * TODO: Ensure that's always the case.
702 */
703 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
704 (fs->bo->gpu & (0xffffffffull << 32)));
705 brts[i].shader = blend[i].shader.gpu;
706 brts[i].unk2 = 0x0;
707 } else {
708 enum pipe_format format = ctx->pipe_framebuffer.cbufs[i]->format;
709 const struct util_format_description *format_desc;
710 format_desc = util_format_description(format);
711
712 brts[i].equation = *blend[i].equation.equation;
713
714 /* TODO: this is a bit more complicated */
715 brts[i].constant = blend[i].equation.constant;
716
717 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
718 brts[i].unk2 = 0x19;
719
720 brts[i].shader_type = fs->blend_types[i];
721 }
722 } else {
723 struct midgard_blend_rt *mrts = rts;
724
725 mrts[i].flags = 0x200;
726
727 bool is_srgb = (ctx->pipe_framebuffer.nr_cbufs > i) &&
728 (ctx->pipe_framebuffer.cbufs[i]) &&
729 util_format_is_srgb(ctx->pipe_framebuffer.cbufs[i]->format);
730
731 SET_BIT(mrts[i].flags, MALI_BLEND_MRT_SHADER, blend[i].is_shader);
732 SET_BIT(mrts[i].flags, MALI_BLEND_LOAD_TIB, !blend[i].no_blending);
733 SET_BIT(mrts[i].flags, MALI_BLEND_SRGB, is_srgb);
734 SET_BIT(mrts[i].flags, MALI_BLEND_NO_DITHER, !ctx->blend->base.dither);
735
736 if (blend[i].is_shader) {
737 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
738 } else {
739 mrts[i].blend.equation = *blend[i].equation.equation;
740 mrts[i].blend.constant = blend[i].equation.constant;
741 }
742 }
743 }
744 }
745
746 static void
747 panfrost_frag_shader_meta_init(struct panfrost_context *ctx,
748 struct mali_shader_meta *fragmeta,
749 void *rts)
750 {
751 const struct panfrost_device *dev = pan_device(ctx->base.screen);
752 struct panfrost_shader_state *fs;
753
754 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
755
756 fragmeta->alpha_coverage = ~MALI_ALPHA_COVERAGE(0.000000);
757 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x3010;
758 fragmeta->unknown2_4 = 0x4e0;
759
760 /* unknown2_4 has 0x10 bit set on T6XX and T720. We don't know why this
761 * is required (independent of 32-bit/64-bit descriptors), or why it's
762 * not used on later GPU revisions. Otherwise, all shader jobs fault on
763 * these earlier chips (perhaps this is a chicken bit of some kind).
764 * More investigation is needed. */
765
766 SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
767
768 if (dev->quirks & IS_BIFROST) {
769 /* TODO */
770 } else {
771 /* Depending on whether it's legal to do so in the given shader, we try
772 * to enable early-z testing (or forward-pixel kill?) */
773
774 SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
775 !fs->can_discard && !fs->writes_depth);
776
777 /* Add the writes Z/S flags if needed. */
778 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
779 SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
780
781 /* Any time texturing is used, derivatives are implicitly calculated,
782 * so we need to enable helper invocations */
783
784 SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
785 fs->helper_invocations);
786
787 /* CAN_DISCARD should be set if the fragment shader possibly contains a
788 * 'discard' instruction. This is likely related to optimizations around
789 * forward-pixel kill, as per "Mali Performance 3: Is
790 * EGL_BUFFER_PRESERVED a good thing?" by Peter Harris */
791
792 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD, fs->can_discard);
793 SET_BIT(fragmeta->midgard1.flags_lo, 0x400, fs->can_discard);
794 }
795
796 panfrost_frag_meta_rasterizer_update(ctx, fragmeta);
797 panfrost_frag_meta_zsa_update(ctx, fragmeta);
798 panfrost_frag_meta_blend_update(ctx, fragmeta, rts);
799 }
800
801 void
802 panfrost_emit_shader_meta(struct panfrost_batch *batch,
803 enum pipe_shader_type st,
804 struct mali_vertex_tiler_postfix *postfix)
805 {
806 struct panfrost_context *ctx = batch->ctx;
807 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
808
809 if (!ss) {
810 postfix->shader = 0;
811 return;
812 }
813
814 struct mali_shader_meta meta;
815
816 panfrost_shader_meta_init(ctx, st, &meta);
817
818 /* Add the shader BO to the batch. */
819 panfrost_batch_add_bo(batch, ss->bo,
820 PAN_BO_ACCESS_PRIVATE |
821 PAN_BO_ACCESS_READ |
822 panfrost_bo_access_for_stage(st));
823
824 mali_ptr shader_ptr;
825
826 if (st == PIPE_SHADER_FRAGMENT) {
827 struct panfrost_device *dev = pan_device(ctx->base.screen);
828 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
829 size_t desc_size = sizeof(meta);
830 void *rts = NULL;
831 struct panfrost_transfer xfer;
832 unsigned rt_size;
833
834 if (dev->quirks & MIDGARD_SFBD)
835 rt_size = 0;
836 else if (dev->quirks & IS_BIFROST)
837 rt_size = sizeof(struct bifrost_blend_rt);
838 else
839 rt_size = sizeof(struct midgard_blend_rt);
840
841 desc_size += rt_size * rt_count;
842
843 if (rt_size)
844 rts = rzalloc_size(ctx, rt_size * rt_count);
845
846 panfrost_frag_shader_meta_init(ctx, &meta, rts);
847
848 xfer = panfrost_allocate_transient(batch, desc_size);
849
850 memcpy(xfer.cpu, &meta, sizeof(meta));
851 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
852
853 if (rt_size)
854 ralloc_free(rts);
855
856 shader_ptr = xfer.gpu;
857 } else {
858 shader_ptr = panfrost_upload_transient(batch, &meta,
859 sizeof(meta));
860 }
861
862 postfix->shader = shader_ptr;
863 }
864
865 static void
866 panfrost_mali_viewport_init(struct panfrost_context *ctx,
867 struct mali_viewport *mvp)
868 {
869 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
870
871 /* Clip bounds are encoded as floats. The viewport itself is encoded as
872 * (somewhat) asymmetric ints. */
873
874 const struct pipe_scissor_state *ss = &ctx->scissor;
875
876 memset(mvp, 0, sizeof(*mvp));
877
878 /* By default, do no viewport clipping, i.e. clip to (-inf, inf) in
879 * each direction. Clipping to the viewport in theory should work, but
880 * in practice causes issues when we're not explicitly trying to
881 * scissor */
882
883 *mvp = (struct mali_viewport) {
884 .clip_minx = -INFINITY,
885 .clip_miny = -INFINITY,
886 .clip_maxx = INFINITY,
887 .clip_maxy = INFINITY,
888 };
889
890 /* Always scissor to the viewport by default. */
891 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
892 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
893
894 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
895 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
896
897 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
898 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
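/* Illustrative example (state values assumed, not taken from a real
 * state tracker): a standard 800x600 viewport gives translate = (400, 300)
 * and scale = (400, -300) in x/y, so vp_minx = 0, vp_maxx = 800,
 * vp_miny = 0 and vp_maxy = 600 before the scissor intersection below. */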
899
900 /* Apply the scissor test */
901
902 unsigned minx, miny, maxx, maxy;
903
904 if (ss && ctx->rasterizer && ctx->rasterizer->base.scissor) {
905 minx = MAX2(ss->minx, vp_minx);
906 miny = MAX2(ss->miny, vp_miny);
907 maxx = MIN2(ss->maxx, vp_maxx);
908 maxy = MIN2(ss->maxy, vp_maxy);
909 } else {
910 minx = vp_minx;
911 miny = vp_miny;
912 maxx = vp_maxx;
913 maxy = vp_maxy;
914 }
915
916 /* Hardware needs the min/max to be strictly ordered, so flip if we
917 * need to. The viewport transformation in the vertex shader will
918 * handle the negatives if we don't */
919
920 if (miny > maxy) {
921 unsigned temp = miny;
922 miny = maxy;
923 maxy = temp;
924 }
925
926 if (minx > maxx) {
927 unsigned temp = minx;
928 minx = maxx;
929 maxx = temp;
930 }
931
932 if (minz > maxz) {
933 float temp = minz;
934 minz = maxz;
935 maxz = temp;
936 }
937
938 /* Clamp to the framebuffer size as a last check */
939
940 minx = MIN2(ctx->pipe_framebuffer.width, minx);
941 maxx = MIN2(ctx->pipe_framebuffer.width, maxx);
942
943 miny = MIN2(ctx->pipe_framebuffer.height, miny);
944 maxy = MIN2(ctx->pipe_framebuffer.height, maxy);
945
946 /* Upload */
947
948 mvp->viewport0[0] = minx;
949 mvp->viewport1[0] = MALI_POSITIVE(maxx);
950
951 mvp->viewport0[1] = miny;
952 mvp->viewport1[1] = MALI_POSITIVE(maxy);
953
954 mvp->clip_minz = minz;
955 mvp->clip_maxz = maxz;
956 }
957
958 void
959 panfrost_emit_viewport(struct panfrost_batch *batch,
960 struct mali_vertex_tiler_postfix *tiler_postfix)
961 {
962 struct panfrost_context *ctx = batch->ctx;
963 struct mali_viewport mvp;
964
965 panfrost_mali_viewport_init(batch->ctx, &mvp);
966
967 /* Update the job, unless we're doing wallpapering (whose lack of
968 * scissor we can ignore, since if we "miss" a tile of wallpaper, it'll
969 * just... be faster :) */
970
971 if (!ctx->wallpaper_batch)
972 panfrost_batch_union_scissor(batch, mvp.viewport0[0],
973 mvp.viewport0[1],
974 mvp.viewport1[0] + 1,
975 mvp.viewport1[1] + 1);
976
977 tiler_postfix->viewport = panfrost_upload_transient(batch, &mvp,
978 sizeof(mvp));
979 }
980
981 static mali_ptr
982 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
983 enum pipe_shader_type st,
984 struct panfrost_constant_buffer *buf,
985 unsigned index)
986 {
987 struct pipe_constant_buffer *cb = &buf->cb[index];
988 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
989
990 if (rsrc) {
991 panfrost_batch_add_bo(batch, rsrc->bo,
992 PAN_BO_ACCESS_SHARED |
993 PAN_BO_ACCESS_READ |
994 panfrost_bo_access_for_stage(st));
995
996 /* Alignment guaranteed by
997 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
998 return rsrc->bo->gpu + cb->buffer_offset;
999 } else if (cb->user_buffer) {
1000 return panfrost_upload_transient(batch,
1001 cb->user_buffer +
1002 cb->buffer_offset,
1003 cb->buffer_size);
1004 } else {
1005 unreachable("No constant buffer");
1006 }
1007 }
1008
1009 struct sysval_uniform {
1010 union {
1011 float f[4];
1012 int32_t i[4];
1013 uint32_t u[4];
1014 uint64_t du[2];
1015 };
1016 };
1017
1018 static void
1019 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
1020 struct sysval_uniform *uniform)
1021 {
1022 struct panfrost_context *ctx = batch->ctx;
1023 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1024
1025 uniform->f[0] = vp->scale[0];
1026 uniform->f[1] = vp->scale[1];
1027 uniform->f[2] = vp->scale[2];
1028 }
1029
1030 static void
1031 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
1032 struct sysval_uniform *uniform)
1033 {
1034 struct panfrost_context *ctx = batch->ctx;
1035 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1036
1037 uniform->f[0] = vp->translate[0];
1038 uniform->f[1] = vp->translate[1];
1039 uniform->f[2] = vp->translate[2];
1040 }
1041
1042 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
1043 enum pipe_shader_type st,
1044 unsigned int sysvalid,
1045 struct sysval_uniform *uniform)
1046 {
1047 struct panfrost_context *ctx = batch->ctx;
1048 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
1049 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
1050 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
1051 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
1052
1053 assert(dim);
1054 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
1055
1056 if (dim > 1)
1057 uniform->i[1] = u_minify(tex->texture->height0,
1058 tex->u.tex.first_level);
1059
1060 if (dim > 2)
1061 uniform->i[2] = u_minify(tex->texture->depth0,
1062 tex->u.tex.first_level);
1063
1064 if (is_array)
1065 uniform->i[dim] = tex->texture->array_size;
1066 }
1067
1068 static void
1069 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
1070 enum pipe_shader_type st,
1071 unsigned ssbo_id,
1072 struct sysval_uniform *uniform)
1073 {
1074 struct panfrost_context *ctx = batch->ctx;
1075
1076 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
1077 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
1078
1079 /* Compute address */
1080 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
1081
1082 panfrost_batch_add_bo(batch, bo,
1083 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
1084 panfrost_bo_access_for_stage(st));
1085
1086 /* Upload address and size as sysval */
1087 uniform->du[0] = bo->gpu + sb.buffer_offset;
1088 uniform->u[2] = sb.buffer_size;
1089 }
1090
1091 static void
1092 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1093 enum pipe_shader_type st,
1094 unsigned samp_idx,
1095 struct sysval_uniform *uniform)
1096 {
1097 struct panfrost_context *ctx = batch->ctx;
1098 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1099
1100 uniform->f[0] = sampl->min_lod;
1101 uniform->f[1] = sampl->max_lod;
1102 uniform->f[2] = sampl->lod_bias;
1103
1104 /* Even without any errata, Midgard represents "no mipmapping" as
1105 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1106 * panfrost_create_sampler_state which also explains our choice of
1107 * epsilon value (again to keep behaviour consistent) */
1108
1109 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1110 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1111 }
1112
1113 static void
1114 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1115 struct sysval_uniform *uniform)
1116 {
1117 struct panfrost_context *ctx = batch->ctx;
1118
1119 uniform->u[0] = ctx->compute_grid->grid[0];
1120 uniform->u[1] = ctx->compute_grid->grid[1];
1121 uniform->u[2] = ctx->compute_grid->grid[2];
1122 }
1123
1124 static void
1125 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1126 struct panfrost_shader_state *ss,
1127 enum pipe_shader_type st)
1128 {
1129 struct sysval_uniform *uniforms = (void *)buf;
1130
1131 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1132 int sysval = ss->sysval[i];
1133
1134 switch (PAN_SYSVAL_TYPE(sysval)) {
1135 case PAN_SYSVAL_VIEWPORT_SCALE:
1136 panfrost_upload_viewport_scale_sysval(batch,
1137 &uniforms[i]);
1138 break;
1139 case PAN_SYSVAL_VIEWPORT_OFFSET:
1140 panfrost_upload_viewport_offset_sysval(batch,
1141 &uniforms[i]);
1142 break;
1143 case PAN_SYSVAL_TEXTURE_SIZE:
1144 panfrost_upload_txs_sysval(batch, st,
1145 PAN_SYSVAL_ID(sysval),
1146 &uniforms[i]);
1147 break;
1148 case PAN_SYSVAL_SSBO:
1149 panfrost_upload_ssbo_sysval(batch, st,
1150 PAN_SYSVAL_ID(sysval),
1151 &uniforms[i]);
1152 break;
1153 case PAN_SYSVAL_NUM_WORK_GROUPS:
1154 panfrost_upload_num_work_groups_sysval(batch,
1155 &uniforms[i]);
1156 break;
1157 case PAN_SYSVAL_SAMPLER:
1158 panfrost_upload_sampler_sysval(batch, st,
1159 PAN_SYSVAL_ID(sysval),
1160 &uniforms[i]);
1161 break;
1162 default:
1163 assert(0);
1164 }
1165 }
1166 }
1167
1168 static const void *
1169 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1170 unsigned index)
1171 {
1172 struct pipe_constant_buffer *cb = &buf->cb[index];
1173 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1174
1175 if (rsrc)
1176 return rsrc->bo->cpu;
1177 else if (cb->user_buffer)
1178 return cb->user_buffer;
1179 else
1180 unreachable("No constant buffer");
1181 }
1182
1183 void
1184 panfrost_emit_const_buf(struct panfrost_batch *batch,
1185 enum pipe_shader_type stage,
1186 struct mali_vertex_tiler_postfix *postfix)
1187 {
1188 struct panfrost_context *ctx = batch->ctx;
1189 struct panfrost_shader_variants *all = ctx->shader[stage];
1190
1191 if (!all)
1192 return;
1193
1194 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1195
1196 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1197
1198 /* Uniforms are implicitly UBO #0 */
1199 bool has_uniforms = buf->enabled_mask & (1 << 0);
1200
1201 /* Allocate room for the sysval and the uniforms */
1202 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1203 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1204 size_t size = sys_size + uniform_size;
1205 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1206 size);
1207
1208 /* Upload sysvals requested by the shader */
1209 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1210
1211 /* Upload uniforms */
1212 if (has_uniforms && uniform_size) {
1213 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1214 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1215 }
1216
1217 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1218 * uploaded */
1219
1220 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1221 assert(ubo_count >= 1);
1222
1223 size_t sz = sizeof(uint64_t) * ubo_count;
1224 uint64_t ubos[PAN_MAX_CONST_BUFFERS];
1225 int uniform_count = ss->uniform_count;
1226
1227 /* Upload uniforms as a UBO */
1228 ubos[0] = MALI_MAKE_UBO(2 + uniform_count, transfer.gpu);
1229
1230 /* The rest are honest-to-goodness UBOs */
1231
1232 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1233 size_t usz = buf->cb[ubo].buffer_size;
1234 bool enabled = buf->enabled_mask & (1 << ubo);
1235 bool empty = usz == 0;
1236
1237 if (!enabled || empty) {
1238 /* Stub out disabled UBOs to catch accesses */
1239 ubos[ubo] = MALI_MAKE_UBO(0, 0xDEAD0000);
1240 continue;
1241 }
1242
1243 mali_ptr gpu = panfrost_map_constant_buffer_gpu(batch, stage,
1244 buf, ubo);
1245
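/* UBO entries are counted in 16-byte (vec4-sized) fields. As a
 * hypothetical example, a 100-byte UBO is aligned up to 112 bytes,
 * i.e. 7 fields, so the descriptor below would advertise 7 entries
 * starting at the mapped GPU address. */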
1246 unsigned bytes_per_field = 16;
1247 unsigned aligned = ALIGN_POT(usz, bytes_per_field);
1248 ubos[ubo] = MALI_MAKE_UBO(aligned / bytes_per_field, gpu);
1249 }
1250
1251 mali_ptr ubufs = panfrost_upload_transient(batch, ubos, sz);
1252 postfix->uniforms = transfer.gpu;
1253 postfix->uniform_buffers = ubufs;
1254
1255 buf->dirty_mask = 0;
1256 }
1257
1258 void
1259 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1260 const struct pipe_grid_info *info,
1261 struct midgard_payload_vertex_tiler *vtp)
1262 {
1263 struct panfrost_context *ctx = batch->ctx;
1264 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1265 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1266 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1267 128));
1268 unsigned shared_size = single_size * info->grid[0] * info->grid[1] *
1269 info->grid[2] * 4;
1270 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1271 shared_size,
1272 1);
1273
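/* Illustrative sizing example (numbers assumed): with ss->shared_size =
 * 512 and an 8x8x1 grid, single_size = 512, shared_size = 512 * 8 * 8 *
 * 1 * 4 = 131072 bytes, shared_workgroup_count = 3 + 3 + 0 = 6 and
 * shared_shift = log2(512) - 1 = 8. */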
1274 struct mali_shared_memory shared = {
1275 .shared_memory = bo->gpu,
1276 .shared_workgroup_count =
1277 util_logbase2_ceil(info->grid[0]) +
1278 util_logbase2_ceil(info->grid[1]) +
1279 util_logbase2_ceil(info->grid[2]),
1280 .shared_unk1 = 0x2,
1281 .shared_shift = util_logbase2(single_size) - 1
1282 };
1283
1284 vtp->postfix.shared_memory = panfrost_upload_transient(batch, &shared,
1285 sizeof(shared));
1286 }
1287
1288 static mali_ptr
1289 panfrost_get_tex_desc(struct panfrost_batch *batch,
1290 enum pipe_shader_type st,
1291 struct panfrost_sampler_view *view)
1292 {
1293 if (!view)
1294 return (mali_ptr) 0;
1295
1296 struct pipe_sampler_view *pview = &view->base;
1297 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1298
1299 /* Add the BO to the job so it's retained until the job is done. */
1300
1301 panfrost_batch_add_bo(batch, rsrc->bo,
1302 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1303 panfrost_bo_access_for_stage(st));
1304
1305 panfrost_batch_add_bo(batch, view->midgard_bo,
1306 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1307 panfrost_bo_access_for_stage(st));
1308
1309 return view->midgard_bo->gpu;
1310 }
1311
1312 void
1313 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1314 enum pipe_shader_type stage,
1315 struct mali_vertex_tiler_postfix *postfix)
1316 {
1317 struct panfrost_context *ctx = batch->ctx;
1318 struct panfrost_device *device = pan_device(ctx->base.screen);
1319
1320 if (!ctx->sampler_view_count[stage])
1321 return;
1322
1323 if (device->quirks & IS_BIFROST) {
1324 struct bifrost_texture_descriptor *descriptors;
1325
1326 descriptors = malloc(sizeof(struct bifrost_texture_descriptor) *
1327 ctx->sampler_view_count[stage]);
1328
1329 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1330 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1331 struct pipe_sampler_view *pview = &view->base;
1332 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1333
1334 /* Add the BOs to the job so they are retained until the job is done. */
1335
1336 panfrost_batch_add_bo(batch, rsrc->bo,
1337 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1338 panfrost_bo_access_for_stage(stage));
1339
1340 panfrost_batch_add_bo(batch, view->bifrost_bo,
1341 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1342 panfrost_bo_access_for_stage(stage));
1343
1344 memcpy(&descriptors[i], view->bifrost_descriptor, sizeof(*view->bifrost_descriptor));
1345 }
1346
1347 postfix->textures = panfrost_upload_transient(batch,
1348 descriptors,
1349 sizeof(struct bifrost_texture_descriptor) *
1350 ctx->sampler_view_count[stage]);
1351
1352 free(descriptors);
1353 } else {
1354 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1355
1356 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i)
1357 trampolines[i] = panfrost_get_tex_desc(batch, stage,
1358 ctx->sampler_views[stage][i]);
1359
1360 postfix->textures = panfrost_upload_transient(batch,
1361 trampolines,
1362 sizeof(uint64_t) *
1363 ctx->sampler_view_count[stage]);
1364 }
1365 }
1366
1367 void
1368 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1369 enum pipe_shader_type stage,
1370 struct mali_vertex_tiler_postfix *postfix)
1371 {
1372 struct panfrost_context *ctx = batch->ctx;
1373 struct panfrost_device *device = pan_device(ctx->base.screen);
1374
1375 if (!ctx->sampler_count[stage])
1376 return;
1377
1378 if (device->quirks & IS_BIFROST) {
1379 size_t desc_size = sizeof(struct bifrost_sampler_descriptor);
1380 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1381 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1382 transfer_size);
1383 struct bifrost_sampler_descriptor *desc = (struct bifrost_sampler_descriptor *)transfer.cpu;
1384
1385 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1386 desc[i] = ctx->samplers[stage][i]->bifrost_hw;
1387
1388 postfix->sampler_descriptor = transfer.gpu;
1389 } else {
1390 size_t desc_size = sizeof(struct mali_sampler_descriptor);
1391 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1392 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1393 transfer_size);
1394 struct mali_sampler_descriptor *desc = (struct mali_sampler_descriptor *)transfer.cpu;
1395
1396 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1397 desc[i] = ctx->samplers[stage][i]->midgard_hw;
1398
1399 postfix->sampler_descriptor = transfer.gpu;
1400 }
1401 }
1402
1403 void
1404 panfrost_emit_vertex_attr_meta(struct panfrost_batch *batch,
1405 struct mali_vertex_tiler_postfix *vertex_postfix)
1406 {
1407 struct panfrost_context *ctx = batch->ctx;
1408
1409 if (!ctx->vertex)
1410 return;
1411
1412 struct panfrost_vertex_state *so = ctx->vertex;
1413
1414 panfrost_vertex_state_upd_attr_offs(ctx, vertex_postfix);
1415 vertex_postfix->attribute_meta = panfrost_upload_transient(batch, so->hw,
1416 sizeof(*so->hw) *
1417 PAN_MAX_ATTRIBUTE);
1418 }
1419
1420 void
1421 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1422 struct mali_vertex_tiler_postfix *vertex_postfix)
1423 {
1424 struct panfrost_context *ctx = batch->ctx;
1425 struct panfrost_vertex_state *so = ctx->vertex;
1426
1427 /* Staged mali_attr, and an index into them. i != k in general, depending
1428 * on the vertex buffer mask and instancing. Twice as much room is
1429 * allocated, for a worst case of NPOT_DIVIDEs, which take up an extra slot */
1430 union mali_attr attrs[PIPE_MAX_ATTRIBS * 2];
1431 unsigned k = 0;
1432
1433 for (unsigned i = 0; i < so->num_elements; ++i) {
1434 /* We map a mali_attr to be 1:1 with the mali_attr_meta, which
1435 * means duplicating some vertex buffers (who cares? aside from
1436 * maybe some caching implications but I somehow doubt that
1437 * matters) */
1438
1439 struct pipe_vertex_element *elem = &so->pipe[i];
1440 unsigned vbi = elem->vertex_buffer_index;
1441
1442 /* The exception to the 1:1 mapping is that we can have multiple
1443 * entries (NPOT divisors), so we fix up the index anyway */
1444
1445 so->hw[i].index = k;
1446
1447 if (!(ctx->vb_mask & (1 << vbi)))
1448 continue;
1449
1450 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1451 struct panfrost_resource *rsrc;
1452
1453 rsrc = pan_resource(buf->buffer.resource);
1454 if (!rsrc)
1455 continue;
1456
1457 /* Align to 64 bytes by masking off the lower bits. This
1458 * will be adjusted back when we fixup the src_offset in
1459 * mali_attr_meta */
1460
1461 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1462 mali_ptr addr = raw_addr & ~63;
1463 unsigned chopped_addr = raw_addr - addr;
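/* e.g. a (hypothetical) raw_addr of 0x1030 gives addr = 0x1000 and
 * chopped_addr = 0x30; the chopped bytes are added back to the size
 * below, and the src_offset fixup in mali_attr_meta accounts for the
 * shifted base. */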
1464
1465 /* Add a dependency of the batch on the vertex buffer */
1466 panfrost_batch_add_bo(batch, rsrc->bo,
1467 PAN_BO_ACCESS_SHARED |
1468 PAN_BO_ACCESS_READ |
1469 PAN_BO_ACCESS_VERTEX_TILER);
1470
1471 /* Set common fields */
1472 attrs[k].elements = addr;
1473 attrs[k].stride = buf->stride;
1474
1475 /* Since we advanced the base pointer, we shrink the buffer
1476 * size */
1477 attrs[k].size = rsrc->base.width0 - buf->buffer_offset;
1478
1479 /* We need to add the extra size we masked off (for
1480 * correctness) so the data doesn't get clamped away */
1481 attrs[k].size += chopped_addr;
1482
1483 /* For non-instancing make sure we initialize */
1484 attrs[k].shift = attrs[k].extra_flags = 0;
1485
1486 /* Instancing uses a dramatically different code path than
1487 * linear, so dispatch for the actual emission now that the
1488 * common code is finished */
1489
1490 unsigned divisor = elem->instance_divisor;
1491
1492 if (divisor && ctx->instance_count == 1) {
1493 /* Silly corner case where there's a divisor(=1) but
1494 * there's no legitimate instancing. So we want *every*
1495 * attribute to be the same. So set stride to zero so
1496 * we don't go anywhere. */
1497
1498 attrs[k].size = attrs[k].stride + chopped_addr;
1499 attrs[k].stride = 0;
1500 attrs[k++].elements |= MALI_ATTR_LINEAR;
1501 } else if (ctx->instance_count <= 1) {
1502 /* Normal, non-instanced attributes */
1503 attrs[k++].elements |= MALI_ATTR_LINEAR;
1504 } else {
1505 unsigned instance_shift = vertex_postfix->instance_shift;
1506 unsigned instance_odd = vertex_postfix->instance_odd;
1507
1508 k += panfrost_vertex_instanced(ctx->padded_count,
1509 instance_shift,
1510 instance_odd,
1511 divisor, &attrs[k]);
1512 }
1513 }
1514
1515 /* Add special gl_VertexID/gl_InstanceID buffers */
1516
1517 panfrost_vertex_id(ctx->padded_count, &attrs[k]);
1518 so->hw[PAN_VERTEX_ID].index = k++;
1519 panfrost_instance_id(ctx->padded_count, &attrs[k]);
1520 so->hw[PAN_INSTANCE_ID].index = k++;
1521
1522 /* Upload whatever we emitted and go */
1523
1524 vertex_postfix->attributes = panfrost_upload_transient(batch, attrs,
1525 k * sizeof(*attrs));
1526 }
1527
1528 static mali_ptr
1529 panfrost_emit_varyings(struct panfrost_batch *batch, union mali_attr *slot,
1530 unsigned stride, unsigned count)
1531 {
1532 /* Fill out the descriptor */
1533 slot->stride = stride;
1534 slot->size = stride * count;
1535 slot->shift = slot->extra_flags = 0;
1536
1537 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1538 slot->size);
1539
1540 slot->elements = transfer.gpu | MALI_ATTR_LINEAR;
1541
1542 return transfer.gpu;
1543 }
1544
1545 static void
1546 panfrost_emit_streamout(struct panfrost_batch *batch, union mali_attr *slot,
1547 unsigned stride, unsigned offset, unsigned count,
1548 struct pipe_stream_output_target *target)
1549 {
1550 /* Fill out the descriptor */
1551 slot->stride = stride * 4;
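/* pipe_stream_output_info strides are expressed in dwords, so multiply
 * by 4 to get a stride in bytes */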
1552 slot->shift = slot->extra_flags = 0;
1553
1554 unsigned max_size = target->buffer_size;
1555 unsigned expected_size = slot->stride * count;
1556
1557 slot->size = MIN2(max_size, expected_size);
1558
1559 /* Grab the BO and bind it to the batch */
1560 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1561
1562 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1563 * the perspective of the TILER and FRAGMENT.
1564 */
1565 panfrost_batch_add_bo(batch, bo,
1566 PAN_BO_ACCESS_SHARED |
1567 PAN_BO_ACCESS_RW |
1568 PAN_BO_ACCESS_VERTEX_TILER |
1569 PAN_BO_ACCESS_FRAGMENT);
1570
1571 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * slot->stride);
1572 slot->elements = addr;
1573 }
1574
1575 /* Given a shader and buffer indices, link varying metadata together */
1576
1577 static bool
1578 is_special_varying(gl_varying_slot loc)
1579 {
1580 switch (loc) {
1581 case VARYING_SLOT_POS:
1582 case VARYING_SLOT_PSIZ:
1583 case VARYING_SLOT_PNTC:
1584 case VARYING_SLOT_FACE:
1585 return true;
1586 default:
1587 return false;
1588 }
1589 }
1590
1591 static void
1592 panfrost_emit_varying_meta(void *outptr, struct panfrost_shader_state *ss,
1593 signed general, signed gl_Position,
1594 signed gl_PointSize, signed gl_PointCoord,
1595 signed gl_FrontFacing)
1596 {
1597 struct mali_attr_meta *out = (struct mali_attr_meta *) outptr;
1598
1599 for (unsigned i = 0; i < ss->varying_count; ++i) {
1600 gl_varying_slot location = ss->varyings_loc[i];
1601 int index = -1;
1602
1603 switch (location) {
1604 case VARYING_SLOT_POS:
1605 index = gl_Position;
1606 break;
1607 case VARYING_SLOT_PSIZ:
1608 index = gl_PointSize;
1609 break;
1610 case VARYING_SLOT_PNTC:
1611 index = gl_PointCoord;
1612 break;
1613 case VARYING_SLOT_FACE:
1614 index = gl_FrontFacing;
1615 break;
1616 default:
1617 index = general;
1618 break;
1619 }
1620
1621 assert(index >= 0);
1622 out[i].index = index;
1623 }
1624 }
1625
1626 static bool
1627 has_point_coord(unsigned mask, gl_varying_slot loc)
1628 {
1629 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1630 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1631 else if (loc == VARYING_SLOT_PNTC)
1632 return (mask & (1 << 8));
1633 else
1634 return false;
1635 }
1636
1637 /* Helpers for manipulating stream out information so we can pack varyings
1638 * accordingly. Compute the src_offset for a given captured varying */
1639
1640 static struct pipe_stream_output *
1641 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1642 {
1643 for (unsigned i = 0; i < info->num_outputs; ++i) {
1644 if (info->output[i].register_index == loc)
1645 return &info->output[i];
1646 }
1647
1648 unreachable("Varying not captured");
1649 }
1650
1651 /* TODO: Integers */
1652 static enum mali_format
1653 pan_xfb_format(unsigned nr_components)
1654 {
1655 switch (nr_components) {
1656 case 1: return MALI_R32F;
1657 case 2: return MALI_RG32F;
1658 case 3: return MALI_RGB32F;
1659 case 4: return MALI_RGBA32F;
1660 default: unreachable("Invalid format");
1661 }
1662 }
1663
1664 void
1665 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1666 unsigned vertex_count,
1667 struct mali_vertex_tiler_postfix *vertex_postfix,
1668 struct mali_vertex_tiler_postfix *tiler_postfix,
1669 union midgard_primitive_size *primitive_size)
1670 {
1671 /* Load the shaders */
1672 struct panfrost_context *ctx = batch->ctx;
1673 struct panfrost_shader_state *vs, *fs;
1674 unsigned int num_gen_varyings = 0;
1675 size_t vs_size, fs_size;
1676
1677 /* Allocate the varying descriptor */
1678
1679 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1680 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1681 vs_size = sizeof(struct mali_attr_meta) * vs->varying_count;
1682 fs_size = sizeof(struct mali_attr_meta) * fs->varying_count;
1683
1684 struct panfrost_transfer trans = panfrost_allocate_transient(batch,
1685 vs_size +
1686 fs_size);
1687
1688 struct pipe_stream_output_info *so = &vs->stream_output;
1689
1690 /* Check if this varying is linked by us. This is the case for
1691 * general-purpose, non-captured varyings. If it is, link it. If it's
1692 * not, use the provided stream out information to determine the
1693 * offset, since it was already linked for us. */
1694
1695 for (unsigned i = 0; i < vs->varying_count; i++) {
1696 gl_varying_slot loc = vs->varyings_loc[i];
1697
1698 bool special = is_special_varying(loc);
1699 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1700
1701 if (captured) {
1702 struct pipe_stream_output *o = pan_get_so(so, loc);
1703
1704 unsigned dst_offset = o->dst_offset * 4; /* dwords */
1705 vs->varyings[i].src_offset = dst_offset;
1706 } else if (!special) {
1707 vs->varyings[i].src_offset = 16 * (num_gen_varyings++);
1708 }
1709 }
1710
1711 /* Conversely, we need to set src_offset for the captured varyings.
1712 * Here, the layout is defined by the stream out info, not us */
1713
1714 /* Link up with fragment varyings */
1715 bool reads_point_coord = fs->reads_point_coord;
1716
1717 for (unsigned i = 0; i < fs->varying_count; i++) {
1718 gl_varying_slot loc = fs->varyings_loc[i];
1719 unsigned src_offset;
1720 signed vs_idx = -1;
1721
1722 /* Link up */
1723 for (unsigned j = 0; j < vs->varying_count; ++j) {
1724 if (vs->varyings_loc[j] == loc) {
1725 vs_idx = j;
1726 break;
1727 }
1728 }
1729
1730 /* Either assign or reuse */
1731 if (vs_idx >= 0)
1732 src_offset = vs->varyings[vs_idx].src_offset;
1733 else
1734 src_offset = 16 * (num_gen_varyings++);
1735
1736 fs->varyings[i].src_offset = src_offset;
1737
1738 if (has_point_coord(fs->point_sprite_mask, loc))
1739 reads_point_coord = true;
1740 }
1741
1742 memcpy(trans.cpu, vs->varyings, vs_size);
1743 memcpy(trans.cpu + vs_size, fs->varyings, fs_size);
1744
1745 union mali_attr varyings[PIPE_MAX_ATTRIBS] = {0};
1746
1747 /* Figure out how many streamout buffers could be bound */
1748 unsigned so_count = ctx->streamout.num_targets;
1749 for (unsigned i = 0; i < vs->varying_count; i++) {
1750 gl_varying_slot loc = vs->varyings_loc[i];
1751
1752 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1753 if (!captured) continue;
1754
1755 struct pipe_stream_output *o = pan_get_so(so, loc);
1756 so_count = MAX2(so_count, o->output_buffer + 1);
1757 }
1758
1759 signed idx = so_count;
1760 signed general = idx++;
1761 signed gl_Position = idx++;
1762 signed gl_PointSize = vs->writes_point_size ? (idx++) : -1;
1763 signed gl_PointCoord = reads_point_coord ? (idx++) : -1;
1764 signed gl_FrontFacing = fs->reads_face ? (idx++) : -1;
1765 signed gl_FragCoord = fs->reads_frag_coord ? (idx++) : -1;
1766
1767 /* Emit the stream out buffers */
1768
1769 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1770 ctx->vertex_count);
1771
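        /* Each buffer gets out_count vertices' worth of space, whether it has
         * a bound target (real streamout) or not (dummy buffer) */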
1772 for (unsigned i = 0; i < so_count; ++i) {
1773 if (i < ctx->streamout.num_targets) {
1774 panfrost_emit_streamout(batch, &varyings[i],
1775 so->stride[i],
1776 ctx->streamout.offsets[i],
1777 out_count,
1778 ctx->streamout.targets[i]);
1779 } else {
1780 /* Emit a dummy buffer */
1781 panfrost_emit_varyings(batch, &varyings[i],
1782 so->stride[i] * 4,
1783 out_count);
1784
1785 /* Clear the attribute type */
1786 varyings[i].elements &= ~0xF;
1787 }
1788 }
1789
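        /* General varyings are packed one 16-byte (vec4-sized) slot per
         * varying per vertex, matching the src_offsets assigned above */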
1790 panfrost_emit_varyings(batch, &varyings[general],
1791 num_gen_varyings * 16,
1792 vertex_count);
1793
1794 mali_ptr varyings_p;
1795
1796 /* fp32 vec4 gl_Position */
1797 varyings_p = panfrost_emit_varyings(batch, &varyings[gl_Position],
1798 sizeof(float) * 4, vertex_count);
1799 tiler_postfix->position_varying = varyings_p;
1800
1801
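        /* gl_PointSize is emitted as 2 bytes per vertex (presumably a
         * half-float point size) */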
1802 if (panfrost_writes_point_size(ctx)) {
1803 varyings_p = panfrost_emit_varyings(batch,
1804 &varyings[gl_PointSize],
1805 2, vertex_count);
1806 primitive_size->pointer = varyings_p;
1807 }
1808
1809 if (reads_point_coord)
1810 varyings[gl_PointCoord].elements = MALI_VARYING_POINT_COORD;
1811
1812 if (fs->reads_face)
1813 varyings[gl_FrontFacing].elements = MALI_VARYING_FRONT_FACING;
1814
1815 if (fs->reads_frag_coord)
1816 varyings[gl_FragCoord].elements = MALI_VARYING_FRAG_COORD;
1817
1818 struct panfrost_device *device = pan_device(ctx->base.screen);
1819 assert(!(device->quirks & IS_BIFROST) || !(reads_point_coord));
1820
1821         /* Let's go ahead and link varying meta to the buffer in question, now
1822          * that that information is available. VARYING_SLOT_POS is mapped to
1823          * gl_FragCoord for fragment shaders but gl_Position for vertex
1824          * shaders */
1825
1826 panfrost_emit_varying_meta(trans.cpu, vs, general, gl_Position,
1827 gl_PointSize, gl_PointCoord,
1828 gl_FrontFacing);
1829
1830 panfrost_emit_varying_meta(trans.cpu + vs_size, fs, general,
1831 gl_FragCoord, gl_PointSize,
1832 gl_PointCoord, gl_FrontFacing);
1833
1834         /* Replace the records for captured (streamout) varyings: repoint them
              * at their transform feedback buffer with a matching format and swizzle */
1835
1836 struct mali_attr_meta *ovs = (struct mali_attr_meta *)trans.cpu;
1837 struct mali_attr_meta *ofs = ovs + vs->varying_count;
1838
1839 for (unsigned i = 0; i < vs->varying_count; i++) {
1840 gl_varying_slot loc = vs->varyings_loc[i];
1841
1842 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1843 if (!captured)
1844 continue;
1845
1846 struct pipe_stream_output *o = pan_get_so(so, loc);
1847 ovs[i].index = o->output_buffer;
1848
1849 /* Set the type appropriately. TODO: Integer varyings XXX */
1850 assert(o->stream == 0);
1851 ovs[i].format = pan_xfb_format(o->num_components);
1852
1853 if (device->quirks & HAS_SWIZZLES)
1854 ovs[i].swizzle = panfrost_get_default_swizzle(o->num_components);
1855 else
1856 ovs[i].swizzle = panfrost_bifrost_swizzle(o->num_components);
1857
1858 /* Link to the fragment */
1859 signed fs_idx = -1;
1860
1861 /* Link up */
1862 for (unsigned j = 0; j < fs->varying_count; ++j) {
1863 if (fs->varyings_loc[j] == loc) {
1864 fs_idx = j;
1865 break;
1866 }
1867 }
1868
1869 if (fs_idx >= 0) {
1870 ofs[fs_idx].index = ovs[i].index;
1871 ofs[fs_idx].format = ovs[i].format;
1872 ofs[fs_idx].swizzle = ovs[i].swizzle;
1873 }
1874 }
1875
1876 /* Replace point sprite */
1877 for (unsigned i = 0; i < fs->varying_count; i++) {
1878                 /* If we have a point sprite replacement, handle it here. We
1879                  * have to translate the location first. TODO: flip Y in the
1880                  * shader instead; we're already keying, just a time crunch */
1881
1882 if (has_point_coord(fs->point_sprite_mask,
1883 fs->varyings_loc[i])) {
1884 ofs[i].index = gl_PointCoord;
1885
1886 /* Swizzle out the z/w to 0/1 */
1887 ofs[i].format = MALI_RG16F;
1888 ofs[i].swizzle = panfrost_get_default_swizzle(2);
1889 }
1890 }
1891
1892         /* Fix up unaligned addresses: the low bits of a record double as its
              * mode, so round the base address down to 64 bytes and push the
              * remainder into the src_offset of every varying record that
              * references the buffer */
1893 for (unsigned i = 0; i < so_count; ++i) {
1894 if (varyings[i].elements < MALI_RECORD_SPECIAL)
1895 continue;
1896
1897 unsigned align = (varyings[i].elements & 63);
1898
1899 /* While we're at it, the SO buffers are linear */
1900
1901 if (!align) {
1902 varyings[i].elements |= MALI_ATTR_LINEAR;
1903 continue;
1904 }
1905
1906 /* We need to adjust alignment */
1907 varyings[i].elements &= ~63;
1908 varyings[i].elements |= MALI_ATTR_LINEAR;
1909 varyings[i].size += align;
1910
1911 for (unsigned v = 0; v < vs->varying_count; ++v) {
1912 if (ovs[v].index != i)
1913 continue;
1914
1915 ovs[v].src_offset = vs->varyings[v].src_offset + align;
1916 }
1917
1918 for (unsigned f = 0; f < fs->varying_count; ++f) {
1919 if (ofs[f].index != i)
1920 continue;
1921
1922 ofs[f].src_offset = fs->varyings[f].src_offset + align;
1923 }
1924 }
1925
1926 varyings_p = panfrost_upload_transient(batch, varyings,
1927 idx * sizeof(*varyings));
1928 vertex_postfix->varyings = varyings_p;
1929 tiler_postfix->varyings = varyings_p;
1930
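        /* Both jobs see the same buffer records; the meta pointers select the
         * vertex or fragment half of the descriptor written above */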
1931 vertex_postfix->varying_meta = trans.gpu;
1932 tiler_postfix->varying_meta = trans.gpu + vs_size;
1933 }
1934
1935 void
1936 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
1937 struct mali_vertex_tiler_prefix *vertex_prefix,
1938 struct mali_vertex_tiler_postfix *vertex_postfix,
1939 struct mali_vertex_tiler_prefix *tiler_prefix,
1940 struct mali_vertex_tiler_postfix *tiler_postfix,
1941 union midgard_primitive_size *primitive_size)
1942 {
1943 struct panfrost_context *ctx = batch->ctx;
1944 struct panfrost_device *device = pan_device(ctx->base.screen);
1945 bool wallpapering = ctx->wallpaper_batch && batch->tiler_dep;
1946 struct bifrost_payload_vertex bifrost_vertex = {0,};
1947 struct bifrost_payload_tiler bifrost_tiler = {0,};
1948 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
1949 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
1950 void *vp, *tp;
1951 size_t vp_size, tp_size;
1952
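        /* Build the vertex and tiler payloads in whichever layout this GPU
         * family expects (Bifrost or Midgard), then submit them below */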
1953 if (device->quirks & IS_BIFROST) {
1954 bifrost_vertex.prefix = *vertex_prefix;
1955 bifrost_vertex.postfix = *vertex_postfix;
1956 vp = &bifrost_vertex;
1957 vp_size = sizeof(bifrost_vertex);
1958
1959 bifrost_tiler.prefix = *tiler_prefix;
1960 bifrost_tiler.tiler.primitive_size = *primitive_size;
1961 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
1962 bifrost_tiler.postfix = *tiler_postfix;
1963 tp = &bifrost_tiler;
1964 tp_size = sizeof(bifrost_tiler);
1965 } else {
1966 midgard_vertex.prefix = *vertex_prefix;
1967 midgard_vertex.postfix = *vertex_postfix;
1968 vp = &midgard_vertex;
1969 vp_size = sizeof(midgard_vertex);
1970
1971 midgard_tiler.prefix = *tiler_prefix;
1972 midgard_tiler.postfix = *tiler_postfix;
1973 midgard_tiler.primitive_size = *primitive_size;
1974 tp = &midgard_tiler;
1975 tp_size = sizeof(midgard_tiler);
1976 }
1977
1978 if (wallpapering) {
1979 /* Inject in reverse order, with "predicted" job indices.
1980 * THIS IS A HACK XXX */
1981 panfrost_new_job(batch, JOB_TYPE_TILER, false,
1982 batch->job_index + 2, tp, tp_size, true);
1983 panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
1984 vp, vp_size, true);
1985 return;
1986 }
1987
1988         /* If rasterizer discard is enabled, only submit the vertex job */
1989
1990 bool rasterizer_discard = ctx->rasterizer &&
1991 ctx->rasterizer->base.rasterizer_discard;
1992
1993 unsigned vertex = panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
1994 vp, vp_size, false);
1995
1996 if (rasterizer_discard)
1997 return;
1998
1999 panfrost_new_job(batch, JOB_TYPE_TILER, false, vertex, tp, tp_size,
2000 false);
2001 }
2002
2003 /* TODO: stop hardcoding this */
2004 mali_ptr
2005 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2006 {
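        /* 48 (x, y) pairs of 16-bit sample positions, 96 uint16_t values in
         * total; (128, 128) presumably encodes the pixel centre. Still
         * hardcoded, per the TODO above. */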
2007 uint16_t locations[] = {
2008 128, 128,
2009 0, 256,
2010 0, 256,
2011 0, 256,
2012 0, 256,
2013 0, 256,
2014 0, 256,
2015 0, 256,
2016 0, 256,
2017 0, 256,
2018 0, 256,
2019 0, 256,
2020 0, 256,
2021 0, 256,
2022 0, 256,
2023 0, 256,
2024 0, 256,
2025 0, 256,
2026 0, 256,
2027 0, 256,
2028 0, 256,
2029 0, 256,
2030 0, 256,
2031 0, 256,
2032 0, 256,
2033 0, 256,
2034 0, 256,
2035 0, 256,
2036 0, 256,
2037 0, 256,
2038 0, 256,
2039 0, 256,
2040 128, 128,
2041 0, 0,
2042 0, 0,
2043 0, 0,
2044 0, 0,
2045 0, 0,
2046 0, 0,
2047 0, 0,
2048 0, 0,
2049 0, 0,
2050 0, 0,
2051 0, 0,
2052 0, 0,
2053 0, 0,
2054 0, 0,
2055 0, 0,
2056 };
2057
2058 return panfrost_upload_transient(batch, locations, 96 * sizeof(uint16_t));
2059 }