panfrost: Update sampler views when the texture bo changes
[mesa.git] / src / gallium / drivers / panfrost / pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_allocate.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
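/* Bifrost vertex/tiler jobs point shared_memory at a mali_shared_memory
 * descriptor instead of a framebuffer descriptor. Best-effort reading of the
 * code below (not authoritative): stack_shift encodes the per-thread stack
 * size derived from the batch's worst-case stack_size, scratchpad is the
 * thread-local storage BO sized from thread_tls_alloc and the core count, and
 * shared_workgroup_count stays ~0 since graphics jobs have no workgroups. */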
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 unsigned shift = panfrost_get_stack_shift(batch->stack_size);
62 struct mali_shared_memory shared = {
63 .stack_shift = shift,
64 .scratchpad = panfrost_batch_get_scratchpad(batch, shift, dev->thread_tls_alloc, dev->core_count)->gpu,
65 .shared_workgroup_count = ~0,
66 };
67 postfix->shared_memory = panfrost_upload_transient(batch, &shared, sizeof(shared));
68 }
69
70 static void
71 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
72 struct mali_vertex_tiler_postfix *postfix)
73 {
74 struct panfrost_device *dev = pan_device(ctx->base.screen);
75 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
76
77 /* If we haven't already, reserve space for the framebuffer */
78
79 if (!batch->framebuffer.gpu) {
80 unsigned size = (dev->quirks & MIDGARD_SFBD) ?
81 sizeof(struct mali_single_framebuffer) :
82 sizeof(struct mali_framebuffer);
83
84 batch->framebuffer = panfrost_allocate_transient(batch, size);
85
86 /* Tag the pointer */
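/* The low bits of the descriptor pointer seem to act as a type tag: OR-ing
 * in MALI_MFBD marks this as the multi-target framebuffer layout rather
 * than the legacy single-target (SFBD) one. Best-effort reading; only the
 * OR itself is certain from this code. */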
87 if (!(dev->quirks & MIDGARD_SFBD))
88 batch->framebuffer.gpu |= MALI_MFBD;
89 }
90
91 postfix->shared_memory = batch->framebuffer.gpu;
92 }
93
94 static void
95 panfrost_vt_update_rasterizer(struct panfrost_context *ctx,
96 struct mali_vertex_tiler_prefix *prefix,
97 struct mali_vertex_tiler_postfix *postfix)
98 {
99 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
100
101 postfix->gl_enables |= 0x7;
102 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
103 rasterizer && rasterizer->base.front_ccw);
104 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
105 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_FRONT));
106 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
107 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_BACK));
108 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
109 rasterizer && rasterizer->base.flatshade_first);
110 }
111
112 void
113 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
114 struct mali_vertex_tiler_prefix *prefix,
115 union midgard_primitive_size *primitive_size)
116 {
117 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
118
119 if (!panfrost_writes_point_size(ctx)) {
120 bool points = prefix->draw_mode == MALI_POINTS;
121 float val = 0.0f;
122
123 if (rasterizer)
124 val = points ?
125 rasterizer->base.point_size :
126 rasterizer->base.line_width;
127
128 primitive_size->constant = val;
129 }
130 }
131
132 static void
133 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
134 struct mali_vertex_tiler_postfix *postfix)
135 {
136 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
137 if (ctx->occlusion_query)
138 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
139 else
140 postfix->occlusion_counter = 0;
141 }
142
143 void
144 panfrost_vt_init(struct panfrost_context *ctx,
145 enum pipe_shader_type stage,
146 struct mali_vertex_tiler_prefix *prefix,
147 struct mali_vertex_tiler_postfix *postfix)
148 {
149 struct panfrost_device *device = pan_device(ctx->base.screen);
150
151 if (!ctx->shader[stage])
152 return;
153
154 memset(prefix, 0, sizeof(*prefix));
155 memset(postfix, 0, sizeof(*postfix));
156
157 if (device->quirks & IS_BIFROST) {
158 postfix->gl_enables = 0x2;
159 panfrost_vt_emit_shared_memory(ctx, postfix);
160 } else {
161 postfix->gl_enables = 0x6;
162 panfrost_vt_attach_framebuffer(ctx, postfix);
163 }
164
165 if (stage == PIPE_SHADER_FRAGMENT) {
166 panfrost_vt_update_occlusion_query(ctx, postfix);
167 panfrost_vt_update_rasterizer(ctx, prefix, postfix);
168 }
169 }
170
171 static unsigned
172 panfrost_translate_index_size(unsigned size)
173 {
174 switch (size) {
175 case 1:
176 return MALI_DRAW_INDEXED_UINT8;
177
178 case 2:
179 return MALI_DRAW_INDEXED_UINT16;
180
181 case 4:
182 return MALI_DRAW_INDEXED_UINT32;
183
184 default:
185 unreachable("Invalid index size");
186 }
187 }
188
189 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
190 * good for the duration of the draw (transient), could last longer. Also gets
191 * the bounds on the index buffer for the range accessed by the draw. We do
192 * these operations together because there are natural optimizations which
193 * require them to be together. */
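/* The bounds matter because panfrost_vt_set_draw_info() derives the vertex
 * count as (max_index - min_index + 1) and programs offset_bias_correction =
 * -min_index, so a tight range shrinks the vertex job and keeps index
 * arithmetic in range. The minmax cache below avoids rescanning the index
 * buffer on repeated draws over the same range. */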
194
195 static mali_ptr
196 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
197 const struct pipe_draw_info *info,
198 unsigned *min_index, unsigned *max_index)
199 {
200 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
201 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
202 off_t offset = info->start * info->index_size;
203 bool needs_indices = true;
204 mali_ptr out = 0;
205
206 if (info->max_index != ~0u) {
207 *min_index = info->min_index;
208 *max_index = info->max_index;
209 needs_indices = false;
210 }
211
212 if (!info->has_user_indices) {
213 /* Only resources can be directly mapped */
214 panfrost_batch_add_bo(batch, rsrc->bo,
215 PAN_BO_ACCESS_SHARED |
216 PAN_BO_ACCESS_READ |
217 PAN_BO_ACCESS_VERTEX_TILER);
218 out = rsrc->bo->gpu + offset;
219
220 /* Check the cache */
221 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
222 info->start,
223 info->count,
224 min_index,
225 max_index);
226 } else {
227 /* Otherwise, we need to upload to transient memory */
228 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
229 out = panfrost_upload_transient(batch, ibuf8 + offset,
230 info->count *
231 info->index_size);
232 }
233
234 if (needs_indices) {
235 /* Fallback */
236 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
237
238 if (!info->has_user_indices)
239 panfrost_minmax_cache_add(rsrc->index_cache,
240 info->start, info->count,
241 *min_index, *max_index);
242 }
243
244 return out;
245 }
246
247 void
248 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
249 const struct pipe_draw_info *info,
250 enum mali_draw_mode draw_mode,
251 struct mali_vertex_tiler_postfix *vertex_postfix,
252 struct mali_vertex_tiler_prefix *tiler_prefix,
253 struct mali_vertex_tiler_postfix *tiler_postfix,
254 unsigned *vertex_count,
255 unsigned *padded_count)
256 {
257 tiler_prefix->draw_mode = draw_mode;
258
259 unsigned draw_flags = 0;
260
261 if (panfrost_writes_point_size(ctx))
262 draw_flags |= MALI_DRAW_VARYING_SIZE;
263
264 if (info->primitive_restart)
265 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
266
267 /* These don't make much sense */
268
269 draw_flags |= 0x3000;
270
271 if (info->index_size) {
272 unsigned min_index = 0, max_index = 0;
273
274 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
275 info,
276 &min_index,
277 &max_index);
278
279 /* Use the corresponding values */
280 *vertex_count = max_index - min_index + 1;
281 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
282 tiler_prefix->offset_bias_correction = -min_index;
283 tiler_prefix->index_count = MALI_POSITIVE(info->count);
284 draw_flags |= panfrost_translate_index_size(info->index_size);
285 } else {
286 tiler_prefix->indices = 0;
287 *vertex_count = ctx->vertex_count;
288 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
289 tiler_prefix->offset_bias_correction = 0;
290 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
291 }
292
293 tiler_prefix->unknown_draw = draw_flags;
294
295 /* Encode the padded vertex count */
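/* The padded count is handed to the hardware factored as
 * (2 * instance_odd + 1) << instance_shift, which is exactly what the
 * ctz/shift arithmetic below recovers. Worked example (assuming a padded
 * count of 24, a purely illustrative value): 24 = 3 << 3 = (2*1 + 1) << 3,
 * so instance_shift = 3 and instance_odd = 1. */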
296
297 if (info->instance_count > 1) {
298 *padded_count = panfrost_padded_vertex_count(*vertex_count);
299
300 unsigned shift = __builtin_ctz(ctx->padded_count);
301 unsigned k = ctx->padded_count >> (shift + 1);
302
303 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
304 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
305 } else {
306 *padded_count = *vertex_count;
307
308 /* Reset instancing state */
309 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
310 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
311 }
312 }
313
314 static void
315 panfrost_shader_meta_init(struct panfrost_context *ctx,
316 enum pipe_shader_type st,
317 struct mali_shader_meta *meta)
318 {
319 const struct panfrost_device *dev = pan_device(ctx->base.screen);
320 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
321
322 memset(meta, 0, sizeof(*meta));
323 meta->shader = (ss->bo ? ss->bo->gpu : 0) | ss->first_tag;
324 meta->attribute_count = ss->attribute_count;
325 meta->varying_count = ss->varying_count;
326 meta->texture_count = ctx->sampler_view_count[st];
327 meta->sampler_count = ctx->sampler_count[st];
328
329 if (dev->quirks & IS_BIFROST) {
330 if (st == PIPE_SHADER_VERTEX)
331 meta->bifrost1.unk1 = 0x800000;
332 else {
333 /* First clause ATEST |= 0x4000000.
334 * Less than 32 regs |= 0x200 */
335 meta->bifrost1.unk1 = 0x950020;
336 }
337
338 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
339 if (st == PIPE_SHADER_VERTEX)
340 meta->bifrost2.preload_regs = 0xC0;
341 else {
342 meta->bifrost2.preload_regs = 0x1;
343 SET_BIT(meta->bifrost2.preload_regs, 0x10, ss->reads_frag_coord);
344 }
345
346 meta->bifrost2.uniform_count = MIN2(ss->uniform_count,
347 ss->uniform_cutoff);
348 } else {
349 meta->midgard1.uniform_count = MIN2(ss->uniform_count,
350 ss->uniform_cutoff);
351 meta->midgard1.work_count = ss->work_reg_count;
352
353 /* TODO: This is not conformant on ES3 */
354 meta->midgard1.flags_hi = MALI_SUPPRESS_INF_NAN;
355
356 meta->midgard1.flags_lo = 0x20;
357 meta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
358
359 SET_BIT(meta->midgard1.flags_hi, MALI_WRITES_GLOBAL, ss->writes_global);
360 }
361 }
362
363 static unsigned
364 panfrost_translate_compare_func(enum pipe_compare_func in)
365 {
366 switch (in) {
367 case PIPE_FUNC_NEVER:
368 return MALI_FUNC_NEVER;
369
370 case PIPE_FUNC_LESS:
371 return MALI_FUNC_LESS;
372
373 case PIPE_FUNC_EQUAL:
374 return MALI_FUNC_EQUAL;
375
376 case PIPE_FUNC_LEQUAL:
377 return MALI_FUNC_LEQUAL;
378
379 case PIPE_FUNC_GREATER:
380 return MALI_FUNC_GREATER;
381
382 case PIPE_FUNC_NOTEQUAL:
383 return MALI_FUNC_NOTEQUAL;
384
385 case PIPE_FUNC_GEQUAL:
386 return MALI_FUNC_GEQUAL;
387
388 case PIPE_FUNC_ALWAYS:
389 return MALI_FUNC_ALWAYS;
390
391 default:
392 unreachable("Invalid func");
393 }
394 }
395
396 static unsigned
397 panfrost_translate_stencil_op(enum pipe_stencil_op in)
398 {
399 switch (in) {
400 case PIPE_STENCIL_OP_KEEP:
401 return MALI_STENCIL_KEEP;
402
403 case PIPE_STENCIL_OP_ZERO:
404 return MALI_STENCIL_ZERO;
405
406 case PIPE_STENCIL_OP_REPLACE:
407 return MALI_STENCIL_REPLACE;
408
409 case PIPE_STENCIL_OP_INCR:
410 return MALI_STENCIL_INCR;
411
412 case PIPE_STENCIL_OP_DECR:
413 return MALI_STENCIL_DECR;
414
415 case PIPE_STENCIL_OP_INCR_WRAP:
416 return MALI_STENCIL_INCR_WRAP;
417
418 case PIPE_STENCIL_OP_DECR_WRAP:
419 return MALI_STENCIL_DECR_WRAP;
420
421 case PIPE_STENCIL_OP_INVERT:
422 return MALI_STENCIL_INVERT;
423
424 default:
425 unreachable("Invalid stencil op");
426 }
427 }
428
429 static unsigned
430 translate_tex_wrap(enum pipe_tex_wrap w)
431 {
432 switch (w) {
433 case PIPE_TEX_WRAP_REPEAT:
434 return MALI_WRAP_REPEAT;
435
436 case PIPE_TEX_WRAP_CLAMP:
437 return MALI_WRAP_CLAMP;
438
439 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
440 return MALI_WRAP_CLAMP_TO_EDGE;
441
442 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
443 return MALI_WRAP_CLAMP_TO_BORDER;
444
445 case PIPE_TEX_WRAP_MIRROR_REPEAT:
446 return MALI_WRAP_MIRRORED_REPEAT;
447
448 case PIPE_TEX_WRAP_MIRROR_CLAMP:
449 return MALI_WRAP_MIRRORED_CLAMP;
450
451 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
452 return MALI_WRAP_MIRRORED_CLAMP_TO_EDGE;
453
454 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
455 return MALI_WRAP_MIRRORED_CLAMP_TO_BORDER;
456
457 default:
458 unreachable("Invalid wrap");
459 }
460 }
461
462 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
463 struct mali_sampler_descriptor *hw)
464 {
465 unsigned func = panfrost_translate_compare_func(cso->compare_func);
466 bool min_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
467 bool mag_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
468 bool mip_linear = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR;
469 unsigned min_filter = min_nearest ? MALI_SAMP_MIN_NEAREST : 0;
470 unsigned mag_filter = mag_nearest ? MALI_SAMP_MAG_NEAREST : 0;
471 unsigned mip_filter = mip_linear ?
472 (MALI_SAMP_MIP_LINEAR_1 | MALI_SAMP_MIP_LINEAR_2) : 0;
473 unsigned normalized = cso->normalized_coords ? MALI_SAMP_NORM_COORDS : 0;
474
475 *hw = (struct mali_sampler_descriptor) {
476 .filter_mode = min_filter | mag_filter | mip_filter |
477 normalized,
478 .wrap_s = translate_tex_wrap(cso->wrap_s),
479 .wrap_t = translate_tex_wrap(cso->wrap_t),
480 .wrap_r = translate_tex_wrap(cso->wrap_r),
481 .compare_func = panfrost_flip_compare_func(func),
482 .border_color = {
483 cso->border_color.f[0],
484 cso->border_color.f[1],
485 cso->border_color.f[2],
486 cso->border_color.f[3]
487 },
488 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
489 .max_lod = FIXED_16(cso->max_lod, false),
490 .lod_bias = FIXED_16(cso->lod_bias, true), /* can be negative */
491 .seamless_cube_map = cso->seamless_cube_map,
492 };
493
494 /* If necessary, we disable mipmapping in the sampler descriptor by
495 * clamping the LOD as tight as possible (from 0 to epsilon,
496 * essentially -- remember these are fixed point numbers, so
497 * epsilon=1/256) */
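/* Worked example (assuming FIXED_16 yields 8 fractional bits, as the
 * epsilon = 1/256 note implies): min_lod = 0.0 encodes as 0x0000, so
 * max_lod becomes 0x0001 = 1/256, pinning sampling to the base level. */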
498
499 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
500 hw->max_lod = hw->min_lod + 1;
501 }
502
503 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
504 struct bifrost_sampler_descriptor *hw)
505 {
506 *hw = (struct bifrost_sampler_descriptor) {
507 .unk1 = 0x1,
508 .wrap_s = translate_tex_wrap(cso->wrap_s),
509 .wrap_t = translate_tex_wrap(cso->wrap_t),
510 .wrap_r = translate_tex_wrap(cso->wrap_r),
511 .unk8 = 0x8,
512 .min_filter = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST,
513 .norm_coords = cso->normalized_coords,
514 .mip_filter = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR,
515 .mag_filter = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR,
516 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
517 .max_lod = FIXED_16(cso->max_lod, false),
518 };
519
520 /* If necessary, we disable mipmapping in the sampler descriptor by
521 * clamping the LOD as tight as possible (from 0 to epsilon,
522 * essentially -- remember these are fixed point numbers, so
523 * epsilon=1/256) */
524
525 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
526 hw->max_lod = hw->min_lod + 1;
527 }
528
529 static void
530 panfrost_make_stencil_state(const struct pipe_stencil_state *in,
531 struct mali_stencil_test *out)
532 {
533 out->ref = 0; /* Gallium gets it from elsewhere */
534
535 out->mask = in->valuemask;
536 out->func = panfrost_translate_compare_func(in->func);
537 out->sfail = panfrost_translate_stencil_op(in->fail_op);
538 out->dpfail = panfrost_translate_stencil_op(in->zfail_op);
539 out->dppass = panfrost_translate_stencil_op(in->zpass_op);
540 }
541
542 static void
543 panfrost_frag_meta_rasterizer_update(struct panfrost_context *ctx,
544 struct mali_shader_meta *fragmeta)
545 {
546 if (!ctx->rasterizer) {
547 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, true);
548 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, false);
549 fragmeta->depth_units = 0.0f;
550 fragmeta->depth_factor = 0.0f;
551 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, false);
552 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, false);
553 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_NEAR, true);
554 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_FAR, true);
555 return;
556 }
557
558 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
559
560 bool msaa = rast->multisample;
561
562 /* TODO: Sample size */
563 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
564 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
565 fragmeta->depth_units = rast->offset_units * 2.0f;
566 fragmeta->depth_factor = rast->offset_scale;
567
568 /* XXX: Which bit is which? Does this maybe allow offsetting non-tris? */
569
570 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, rast->offset_tri);
571 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, rast->offset_tri);
572
573 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_NEAR, rast->depth_clip_near);
574 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_FAR, rast->depth_clip_far);
575 }
576
577 static void
578 panfrost_frag_meta_zsa_update(struct panfrost_context *ctx,
579 struct mali_shader_meta *fragmeta)
580 {
581 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
582 int zfunc = PIPE_FUNC_ALWAYS;
583
584 if (!zsa) {
585 struct pipe_stencil_state default_stencil = {
586 .enabled = 0,
587 .func = PIPE_FUNC_ALWAYS,
588 .fail_op = MALI_STENCIL_KEEP,
589 .zfail_op = MALI_STENCIL_KEEP,
590 .zpass_op = MALI_STENCIL_KEEP,
591 .writemask = 0xFF,
592 .valuemask = 0xFF
593 };
594
595 panfrost_make_stencil_state(&default_stencil,
596 &fragmeta->stencil_front);
597 fragmeta->stencil_mask_front = default_stencil.writemask;
598 fragmeta->stencil_back = fragmeta->stencil_front;
599 fragmeta->stencil_mask_back = default_stencil.writemask;
600 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST, false);
601 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK, false);
602 } else {
603 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
604 zsa->stencil[0].enabled);
605 panfrost_make_stencil_state(&zsa->stencil[0],
606 &fragmeta->stencil_front);
607 fragmeta->stencil_mask_front = zsa->stencil[0].writemask;
608 fragmeta->stencil_front.ref = ctx->stencil_ref.ref_value[0];
609
610 /* If back-stencil is not enabled, use the front values */
611
612 if (zsa->stencil[1].enabled) {
613 panfrost_make_stencil_state(&zsa->stencil[1],
614 &fragmeta->stencil_back);
615 fragmeta->stencil_mask_back = zsa->stencil[1].writemask;
616 fragmeta->stencil_back.ref = ctx->stencil_ref.ref_value[1];
617 } else {
618 fragmeta->stencil_back = fragmeta->stencil_front;
619 fragmeta->stencil_mask_back = fragmeta->stencil_mask_front;
620 fragmeta->stencil_back.ref = fragmeta->stencil_front.ref;
621 }
622
623 if (zsa->depth.enabled)
624 zfunc = zsa->depth.func;
625
626 /* Depth state (TODO: Refactor) */
627
628 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
629 zsa->depth.writemask);
630 }
631
632 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
633 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(zfunc));
634 }
635
636 static bool
637 panfrost_fs_required(
638 struct panfrost_shader_state *fs,
639 struct panfrost_blend_final *blend,
640 unsigned rt_count)
641 {
642 /* If we generally have side effects */
643 if (fs->fs_sidefx)
644 return true;
645
646 /* If colour is written we need to execute */
647 for (unsigned i = 0; i < rt_count; ++i) {
648 if (!blend[i].no_colour)
649 return true;
650 }
651
652 /* If depth is written and not implied we need to execute.
653 * TODO: Predicate on Z/S writes being enabled */
654 return (fs->writes_depth || fs->writes_stencil);
655 }
656
657 static void
658 panfrost_frag_meta_blend_update(struct panfrost_context *ctx,
659 struct mali_shader_meta *fragmeta,
660 void *rts)
661 {
662 const struct panfrost_device *dev = pan_device(ctx->base.screen);
663 struct panfrost_shader_state *fs;
664 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
665
666 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
667 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
668 !ctx->blend->base.dither);
669
670 /* Get blending setup */
671 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
672
673 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
674 unsigned shader_offset = 0;
675 struct panfrost_bo *shader_bo = NULL;
676
677 for (unsigned c = 0; c < rt_count; ++c)
678 blend[c] = panfrost_get_blend_for_context(ctx, c, &shader_bo,
679 &shader_offset);
680
681 /* Disable shader execution if we can */
682 if (dev->quirks & MIDGARD_SHADERLESS
683 && !panfrost_fs_required(fs, blend, rt_count)) {
684 fragmeta->shader = 0;
685 fragmeta->attribute_count = 0;
686 fragmeta->varying_count = 0;
687 fragmeta->texture_count = 0;
688 fragmeta->sampler_count = 0;
689
690 /* This feature is not known to work on Bifrost */
691 fragmeta->midgard1.work_count = 1;
692 fragmeta->midgard1.uniform_count = 0;
693 fragmeta->midgard1.uniform_buffer_count = 0;
694 }
695
696 /* If there is a blend shader, work registers are shared. We impose 8
697 * work registers as a limit for blend shaders. Should be lower XXX */
698
699 if (!(dev->quirks & IS_BIFROST)) {
700 for (unsigned c = 0; c < rt_count; ++c) {
701 if (blend[c].is_shader) {
702 fragmeta->midgard1.work_count =
703 MAX2(fragmeta->midgard1.work_count, 8);
704 }
705 }
706 }
707
708 /* Even on MFBD, the shader descriptor gets blend shaders. It's *also*
709 * copied to the blend_meta appended (by convention), but this is the
710 * field actually read by the hardware (or maybe both are read?).
711 * Specify the last RTi with a blend shader. */
712
713 fragmeta->blend.shader = 0;
714
715 for (signed rt = (rt_count - 1); rt >= 0; --rt) {
716 if (!blend[rt].is_shader)
717 continue;
718
719 fragmeta->blend.shader = blend[rt].shader.gpu |
720 blend[rt].shader.first_tag;
721 break;
722 }
723
724 if (dev->quirks & MIDGARD_SFBD) {
725 /* On single render target (SFBD) platforms, the blend
726 * information is inside the shader meta itself. We additionally
727 * need to signal CAN_DISCARD for nontrivial blend modes (so
728 * we're able to read back the destination buffer) */
729
730 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
731 blend[0].is_shader);
732
733 if (!blend[0].is_shader) {
734 fragmeta->blend.equation = *blend[0].equation.equation;
735 fragmeta->blend.constant = blend[0].equation.constant;
736 }
737
738 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
739 !blend[0].no_blending || fs->can_discard);
740 return;
741 }
742
743 if (dev->quirks & IS_BIFROST) {
744 bool no_blend = true;
745
746 for (unsigned i = 0; i < rt_count; ++i)
747 no_blend &= (blend[i].no_blending | blend[i].no_colour);
748
749 SET_BIT(fragmeta->bifrost1.unk1, MALI_BIFROST_EARLY_Z,
750 !fs->can_discard && !fs->writes_depth && no_blend);
751 }
752
753 /* Additional blend descriptor tacked on for jobs using MFBD */
754
755 for (unsigned i = 0; i < rt_count; ++i) {
756 unsigned flags = 0;
757
758 if (ctx->pipe_framebuffer.nr_cbufs > i && !blend[i].no_colour) {
759 flags = 0x200;
760
761 bool is_srgb = (ctx->pipe_framebuffer.nr_cbufs > i) &&
762 (ctx->pipe_framebuffer.cbufs[i]) &&
763 util_format_is_srgb(ctx->pipe_framebuffer.cbufs[i]->format);
764
765 SET_BIT(flags, MALI_BLEND_MRT_SHADER, blend[i].is_shader);
766 SET_BIT(flags, MALI_BLEND_LOAD_TIB, !blend[i].no_blending);
767 SET_BIT(flags, MALI_BLEND_SRGB, is_srgb);
768 SET_BIT(flags, MALI_BLEND_NO_DITHER, !ctx->blend->base.dither);
769 }
770
771 if (dev->quirks & IS_BIFROST) {
772 struct bifrost_blend_rt *brts = rts;
773
774 brts[i].flags = flags;
775
776 if (blend[i].is_shader) {
777 /* The blend shader's address needs to be at
778 * the same top 32 bits as the fragment shader.
779 * TODO: Ensure that's always the case.
780 */
781 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
782 (fs->bo->gpu & (0xffffffffull << 32)));
783 brts[i].shader = blend[i].shader.gpu;
784 brts[i].unk2 = 0x0;
785 } else if (ctx->pipe_framebuffer.nr_cbufs > i) {
786 enum pipe_format format = ctx->pipe_framebuffer.cbufs[i]->format;
787 const struct util_format_description *format_desc;
788 format_desc = util_format_description(format);
789
790 brts[i].equation = *blend[i].equation.equation;
791
792 /* TODO: this is a bit more complicated */
793 brts[i].constant = blend[i].equation.constant;
794
795 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
796
797 /* 0x19 disables blending and forces REPLACE
798 * mode (equivalent to rgb_mode = alpha_mode =
799 * x122, colour mask = 0xF). 0x1a allows
800 * blending. */
801 brts[i].unk2 = blend[i].no_blending ? 0x19 : 0x1a;
802
803 brts[i].shader_type = fs->blend_types[i];
804 } else {
805 /* Dummy attachment for depth-only */
806 brts[i].unk2 = 0x3;
807 brts[i].shader_type = fs->blend_types[i];
808 }
809 } else {
810 struct midgard_blend_rt *mrts = rts;
811 mrts[i].flags = flags;
812
813 if (blend[i].is_shader) {
814 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
815 } else {
816 mrts[i].blend.equation = *blend[i].equation.equation;
817 mrts[i].blend.constant = blend[i].equation.constant;
818 }
819 }
820 }
821 }
822
823 static void
824 panfrost_frag_shader_meta_init(struct panfrost_context *ctx,
825 struct mali_shader_meta *fragmeta,
826 void *rts)
827 {
828 const struct panfrost_device *dev = pan_device(ctx->base.screen);
829 struct panfrost_shader_state *fs;
830
831 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
832
833 fragmeta->alpha_coverage = ~MALI_ALPHA_COVERAGE(0.000000);
834 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x10;
835 fragmeta->unknown2_4 = 0x4e0;
836
837 /* unknown2_4 has the 0x10 bit set on T6XX and T720. We don't know why this
838 * is required (independent of 32-bit/64-bit descriptors), or why it's
839 * not used on later GPU revisions. Otherwise, all shader jobs fault on
840 * these earlier chips (perhaps this is a chicken bit of some kind).
841 * More investigation is needed. */
842
843 SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
844
845 if (dev->quirks & IS_BIFROST) {
846 /* TODO */
847 } else {
848 /* Depending on whether it's legal in the given shader, we try to
849 * enable early-z testing. TODO: respect e-z force */
850
851 SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
852 !fs->can_discard && !fs->writes_global &&
853 !fs->writes_depth && !fs->writes_stencil);
854
855 /* Add the writes Z/S flags if needed. */
856 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
857 SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
858
859 /* Any time texturing is used, derivatives are implicitly calculated,
860 * so we need to enable helper invocations */
861
862 SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
863 fs->helper_invocations);
864
865 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
866
867 bool depth_enabled = fs->writes_depth ||
868 (zsa && zsa->depth.enabled && zsa->depth.func != PIPE_FUNC_ALWAYS);
869
870 SET_BIT(fragmeta->midgard1.flags_lo, 0x400, !depth_enabled && fs->can_discard);
871 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_ZS, depth_enabled && fs->can_discard);
872 }
873
874 panfrost_frag_meta_rasterizer_update(ctx, fragmeta);
875 panfrost_frag_meta_zsa_update(ctx, fragmeta);
876 panfrost_frag_meta_blend_update(ctx, fragmeta, rts);
877 }
878
879 void
880 panfrost_emit_shader_meta(struct panfrost_batch *batch,
881 enum pipe_shader_type st,
882 struct mali_vertex_tiler_postfix *postfix)
883 {
884 struct panfrost_context *ctx = batch->ctx;
885 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
886
887 if (!ss) {
888 postfix->shader = 0;
889 return;
890 }
891
892 struct mali_shader_meta meta;
893
894 panfrost_shader_meta_init(ctx, st, &meta);
895
896 /* Add the shader BO to the batch. */
897 panfrost_batch_add_bo(batch, ss->bo,
898 PAN_BO_ACCESS_PRIVATE |
899 PAN_BO_ACCESS_READ |
900 panfrost_bo_access_for_stage(st));
901
902 mali_ptr shader_ptr;
903
904 if (st == PIPE_SHADER_FRAGMENT) {
905 struct panfrost_device *dev = pan_device(ctx->base.screen);
906 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
907 size_t desc_size = sizeof(meta);
908 void *rts = NULL;
909 struct panfrost_transfer xfer;
910 unsigned rt_size;
911
912 if (dev->quirks & MIDGARD_SFBD)
913 rt_size = 0;
914 else if (dev->quirks & IS_BIFROST)
915 rt_size = sizeof(struct bifrost_blend_rt);
916 else
917 rt_size = sizeof(struct midgard_blend_rt);
918
919 desc_size += rt_size * rt_count;
920
921 if (rt_size)
922 rts = rzalloc_size(ctx, rt_size * rt_count);
923
924 panfrost_frag_shader_meta_init(ctx, &meta, rts);
925
926 xfer = panfrost_allocate_transient(batch, desc_size);
927
928 memcpy(xfer.cpu, &meta, sizeof(meta));
929 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
930
931 if (rt_size)
932 ralloc_free(rts);
933
934 shader_ptr = xfer.gpu;
935 } else {
936 shader_ptr = panfrost_upload_transient(batch, &meta,
937 sizeof(meta));
938 }
939
940 postfix->shader = shader_ptr;
941 }
942
943 static void
944 panfrost_mali_viewport_init(struct panfrost_context *ctx,
945 struct mali_viewport *mvp)
946 {
947 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
948
949 /* Clip bounds are encoded as floats. The viewport itself is encoded as
950 * (somewhat) asymmetric ints. */
951
952 const struct pipe_scissor_state *ss = &ctx->scissor;
953
954 memset(mvp, 0, sizeof(*mvp));
955
956 /* By default, do no viewport clipping, i.e. clip to (-inf, inf) in
957 * each direction. Clipping to the viewport in theory should work, but
958 * in practice causes issues when we're not explicitly trying to
959 * scissor */
960
961 *mvp = (struct mali_viewport) {
962 .clip_minx = -INFINITY,
963 .clip_miny = -INFINITY,
964 .clip_maxx = INFINITY,
965 .clip_maxy = INFINITY,
966 };
967
968 /* Always scissor to the viewport by default. */
969 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
970 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
971
972 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
973 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
974
975 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
976 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
977
978 /* Apply the scissor test */
979
980 unsigned minx, miny, maxx, maxy;
981
982 if (ss && ctx->rasterizer && ctx->rasterizer->base.scissor) {
983 minx = MAX2(ss->minx, vp_minx);
984 miny = MAX2(ss->miny, vp_miny);
985 maxx = MIN2(ss->maxx, vp_maxx);
986 maxy = MIN2(ss->maxy, vp_maxy);
987 } else {
988 minx = vp_minx;
989 miny = vp_miny;
990 maxx = vp_maxx;
991 maxy = vp_maxy;
992 }
993
994 /* Hardware needs the min/max to be strictly ordered, so flip if we
995 * need to. The viewport transformation in the vertex shader will
996 * handle the negatives if we don't */
997
998 if (miny > maxy) {
999 unsigned temp = miny;
1000 miny = maxy;
1001 maxy = temp;
1002 }
1003
1004 if (minx > maxx) {
1005 unsigned temp = minx;
1006 minx = maxx;
1007 maxx = temp;
1008 }
1009
1010 if (minz > maxz) {
1011 float temp = minz;
1012 minz = maxz;
1013 maxz = temp;
1014 }
1015
1016 /* Clamp to the framebuffer size as a last check */
1017
1018 minx = MIN2(ctx->pipe_framebuffer.width, minx);
1019 maxx = MIN2(ctx->pipe_framebuffer.width, maxx);
1020
1021 miny = MIN2(ctx->pipe_framebuffer.height, miny);
1022 maxy = MIN2(ctx->pipe_framebuffer.height, maxy);
1023
1024 /* Upload */
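/* viewport1 appears to hold the inclusive maximum, stored off-by-one via
 * MALI_POSITIVE; panfrost_emit_viewport() adds the 1 back when unioning
 * the batch scissor with these values. */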
1025
1026 mvp->viewport0[0] = minx;
1027 mvp->viewport1[0] = MALI_POSITIVE(maxx);
1028
1029 mvp->viewport0[1] = miny;
1030 mvp->viewport1[1] = MALI_POSITIVE(maxy);
1031
1032 bool clip_near = true;
1033 bool clip_far = true;
1034
1035 if (ctx->rasterizer) {
1036 clip_near = ctx->rasterizer->base.depth_clip_near;
1037 clip_far = ctx->rasterizer->base.depth_clip_far;
1038 }
1039
1040 mvp->clip_minz = clip_near ? minz : -INFINITY;
1041 mvp->clip_maxz = clip_far ? maxz : INFINITY;
1042 }
1043
1044 void
1045 panfrost_emit_viewport(struct panfrost_batch *batch,
1046 struct mali_vertex_tiler_postfix *tiler_postfix)
1047 {
1048 struct panfrost_context *ctx = batch->ctx;
1049 struct mali_viewport mvp;
1050
1051 panfrost_mali_viewport_init(batch->ctx, &mvp);
1052
1053 /* Update the job, unless we're doing wallpapering (whose lack of
1054 * scissor we can ignore, since if we "miss" a tile of wallpaper, it'll
1055 * just... be faster :) */
1056
1057 if (!ctx->wallpaper_batch)
1058 panfrost_batch_union_scissor(batch, mvp.viewport0[0],
1059 mvp.viewport0[1],
1060 mvp.viewport1[0] + 1,
1061 mvp.viewport1[1] + 1);
1062
1063 tiler_postfix->viewport = panfrost_upload_transient(batch, &mvp,
1064 sizeof(mvp));
1065 }
1066
1067 static mali_ptr
1068 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
1069 enum pipe_shader_type st,
1070 struct panfrost_constant_buffer *buf,
1071 unsigned index)
1072 {
1073 struct pipe_constant_buffer *cb = &buf->cb[index];
1074 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1075
1076 if (rsrc) {
1077 panfrost_batch_add_bo(batch, rsrc->bo,
1078 PAN_BO_ACCESS_SHARED |
1079 PAN_BO_ACCESS_READ |
1080 panfrost_bo_access_for_stage(st));
1081
1082 /* Alignment guaranteed by
1083 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
1084 return rsrc->bo->gpu + cb->buffer_offset;
1085 } else if (cb->user_buffer) {
1086 return panfrost_upload_transient(batch,
1087 cb->user_buffer +
1088 cb->buffer_offset,
1089 cb->buffer_size);
1090 } else {
1091 unreachable("No constant buffer");
1092 }
1093 }
1094
1095 struct sysval_uniform {
1096 union {
1097 float f[4];
1098 int32_t i[4];
1099 uint32_t u[4];
1100 uint64_t du[2];
1101 };
1102 };
1103
1104 static void
1105 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
1106 struct sysval_uniform *uniform)
1107 {
1108 struct panfrost_context *ctx = batch->ctx;
1109 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1110
1111 uniform->f[0] = vp->scale[0];
1112 uniform->f[1] = vp->scale[1];
1113 uniform->f[2] = vp->scale[2];
1114 }
1115
1116 static void
1117 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
1118 struct sysval_uniform *uniform)
1119 {
1120 struct panfrost_context *ctx = batch->ctx;
1121 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1122
1123 uniform->f[0] = vp->translate[0];
1124 uniform->f[1] = vp->translate[1];
1125 uniform->f[2] = vp->translate[2];
1126 }
1127
1128 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
1129 enum pipe_shader_type st,
1130 unsigned int sysvalid,
1131 struct sysval_uniform *uniform)
1132 {
1133 struct panfrost_context *ctx = batch->ctx;
1134 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
1135 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
1136 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
1137 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
1138
1139 assert(dim);
1140 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
1141
1142 if (dim > 1)
1143 uniform->i[1] = u_minify(tex->texture->height0,
1144 tex->u.tex.first_level);
1145
1146 if (dim > 2)
1147 uniform->i[2] = u_minify(tex->texture->depth0,
1148 tex->u.tex.first_level);
1149
1150 if (is_array)
1151 uniform->i[dim] = tex->texture->array_size;
1152 }
1153
1154 static void
1155 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
1156 enum pipe_shader_type st,
1157 unsigned ssbo_id,
1158 struct sysval_uniform *uniform)
1159 {
1160 struct panfrost_context *ctx = batch->ctx;
1161
1162 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
1163 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
1164
1165 /* Compute address */
1166 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
1167
1168 panfrost_batch_add_bo(batch, bo,
1169 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
1170 panfrost_bo_access_for_stage(st));
1171
1172 /* Upload address and size as sysval */
1173 uniform->du[0] = bo->gpu + sb.buffer_offset;
1174 uniform->u[2] = sb.buffer_size;
1175 }
1176
1177 static void
1178 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1179 enum pipe_shader_type st,
1180 unsigned samp_idx,
1181 struct sysval_uniform *uniform)
1182 {
1183 struct panfrost_context *ctx = batch->ctx;
1184 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1185
1186 uniform->f[0] = sampl->min_lod;
1187 uniform->f[1] = sampl->max_lod;
1188 uniform->f[2] = sampl->lod_bias;
1189
1190 /* Even without any errata, Midgard represents "no mipmapping" as
1191 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1192 * panfrost_create_sampler_state which also explains our choice of
1193 * epsilon value (again to keep behaviour consistent) */
1194
1195 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1196 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1197 }
1198
1199 static void
1200 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1201 struct sysval_uniform *uniform)
1202 {
1203 struct panfrost_context *ctx = batch->ctx;
1204
1205 uniform->u[0] = ctx->compute_grid->grid[0];
1206 uniform->u[1] = ctx->compute_grid->grid[1];
1207 uniform->u[2] = ctx->compute_grid->grid[2];
1208 }
1209
1210 static void
1211 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1212 struct panfrost_shader_state *ss,
1213 enum pipe_shader_type st)
1214 {
1215 struct sysval_uniform *uniforms = (void *)buf;
1216
1217 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1218 int sysval = ss->sysval[i];
1219
1220 switch (PAN_SYSVAL_TYPE(sysval)) {
1221 case PAN_SYSVAL_VIEWPORT_SCALE:
1222 panfrost_upload_viewport_scale_sysval(batch,
1223 &uniforms[i]);
1224 break;
1225 case PAN_SYSVAL_VIEWPORT_OFFSET:
1226 panfrost_upload_viewport_offset_sysval(batch,
1227 &uniforms[i]);
1228 break;
1229 case PAN_SYSVAL_TEXTURE_SIZE:
1230 panfrost_upload_txs_sysval(batch, st,
1231 PAN_SYSVAL_ID(sysval),
1232 &uniforms[i]);
1233 break;
1234 case PAN_SYSVAL_SSBO:
1235 panfrost_upload_ssbo_sysval(batch, st,
1236 PAN_SYSVAL_ID(sysval),
1237 &uniforms[i]);
1238 break;
1239 case PAN_SYSVAL_NUM_WORK_GROUPS:
1240 panfrost_upload_num_work_groups_sysval(batch,
1241 &uniforms[i]);
1242 break;
1243 case PAN_SYSVAL_SAMPLER:
1244 panfrost_upload_sampler_sysval(batch, st,
1245 PAN_SYSVAL_ID(sysval),
1246 &uniforms[i]);
1247 break;
1248 default:
1249 assert(0);
1250 }
1251 }
1252 }
1253
1254 static const void *
1255 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1256 unsigned index)
1257 {
1258 struct pipe_constant_buffer *cb = &buf->cb[index];
1259 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1260
1261 if (rsrc)
1262 return rsrc->bo->cpu;
1263 else if (cb->user_buffer)
1264 return cb->user_buffer;
1265 else
1266 unreachable("No constant buffer");
1267 }
1268
1269 void
1270 panfrost_emit_const_buf(struct panfrost_batch *batch,
1271 enum pipe_shader_type stage,
1272 struct mali_vertex_tiler_postfix *postfix)
1273 {
1274 struct panfrost_context *ctx = batch->ctx;
1275 struct panfrost_shader_variants *all = ctx->shader[stage];
1276
1277 if (!all)
1278 return;
1279
1280 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1281
1282 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1283
1284 /* Uniforms are implicitly UBO #0 */
1285 bool has_uniforms = buf->enabled_mask & (1 << 0);
1286
1287 /* Allocate room for the sysval and the uniforms */
1288 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1289 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1290 size_t size = sys_size + uniform_size;
1291 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1292 size);
1293
1294 /* Upload sysvals requested by the shader */
1295 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1296
1297 /* Upload uniforms */
1298 if (has_uniforms && uniform_size) {
1299 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1300 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1301 }
1302
1303 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1304 * uploaded */
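/* Each entry packs a size (counted in 16-byte fields) together with the GPU
 * address via MALI_MAKE_UBO, which is why raw sizes are aligned up to 16
 * bytes below. UBO #0 is special: it covers the transfer we just filled,
 * i.e. the sysvals followed by the user uniforms. */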
1305
1306 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1307 assert(ubo_count >= 1);
1308
1309 size_t sz = sizeof(uint64_t) * ubo_count;
1310 uint64_t ubos[PAN_MAX_CONST_BUFFERS];
1311 int uniform_count = ss->uniform_count;
1312
1313 /* Upload uniforms as a UBO */
1314 ubos[0] = MALI_MAKE_UBO(2 + uniform_count, transfer.gpu);
1315
1316 /* The rest are honest-to-goodness UBOs */
1317
1318 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1319 size_t usz = buf->cb[ubo].buffer_size;
1320 bool enabled = buf->enabled_mask & (1 << ubo);
1321 bool empty = usz == 0;
1322
1323 if (!enabled || empty) {
1324 /* Stub out disabled UBOs to catch accesses */
1325 ubos[ubo] = MALI_MAKE_UBO(0, 0xDEAD0000);
1326 continue;
1327 }
1328
1329 mali_ptr gpu = panfrost_map_constant_buffer_gpu(batch, stage,
1330 buf, ubo);
1331
1332 unsigned bytes_per_field = 16;
1333 unsigned aligned = ALIGN_POT(usz, bytes_per_field);
1334 ubos[ubo] = MALI_MAKE_UBO(aligned / bytes_per_field, gpu);
1335 }
1336
1337 mali_ptr ubufs = panfrost_upload_transient(batch, ubos, sz);
1338 postfix->uniforms = transfer.gpu;
1339 postfix->uniform_buffers = ubufs;
1340
1341 buf->dirty_mask = 0;
1342 }
1343
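/* Emits the shared memory descriptor for a compute job. Rough reading of the
 * sizing below: each workgroup's shared segment is rounded up to a power of
 * two (at least 128 bytes), scaled by the number of workgroups in the grid
 * (times 4, presumably headroom for workgroups in flight), and
 * shared_workgroup_count/shared_shift encode the corresponding log2 sizes. */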
1344 void
1345 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1346 const struct pipe_grid_info *info,
1347 struct midgard_payload_vertex_tiler *vtp)
1348 {
1349 struct panfrost_context *ctx = batch->ctx;
1350 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1351 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1352 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1353 128));
1354 unsigned shared_size = single_size * info->grid[0] * info->grid[1] *
1355 info->grid[2] * 4;
1356 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1357 shared_size,
1358 1);
1359
1360 struct mali_shared_memory shared = {
1361 .shared_memory = bo->gpu,
1362 .shared_workgroup_count =
1363 util_logbase2_ceil(info->grid[0]) +
1364 util_logbase2_ceil(info->grid[1]) +
1365 util_logbase2_ceil(info->grid[2]),
1366 .shared_unk1 = 0x2,
1367 .shared_shift = util_logbase2(single_size) - 1
1368 };
1369
1370 vtp->postfix.shared_memory = panfrost_upload_transient(batch, &shared,
1371 sizeof(shared));
1372 }
1373
1374 static mali_ptr
1375 panfrost_get_tex_desc(struct panfrost_batch *batch,
1376 enum pipe_shader_type st,
1377 struct panfrost_sampler_view *view)
1378 {
1379 if (!view)
1380 return (mali_ptr) 0;
1381
1382 struct pipe_sampler_view *pview = &view->base;
1383 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1384
1385 /* Add the BO to the job so it's retained until the job is done. */
1386
1387 panfrost_batch_add_bo(batch, rsrc->bo,
1388 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1389 panfrost_bo_access_for_stage(st));
1390
1391 panfrost_batch_add_bo(batch, view->bo,
1392 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1393 panfrost_bo_access_for_stage(st));
1394
1395 return view->bo->gpu;
1396 }
1397
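/* Texture descriptors are cached per sampler view, but the underlying
 * resource can get a new backing BO or change layout (e.g. linear vs.
 * tiled/AFBC) behind our back. Rebuild the descriptor BO whenever the cached
 * texture_bo/layout no longer match the resource, so we never emit a stale
 * GPU address. */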
1398 static void
1399 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
1400 struct pipe_context *pctx)
1401 {
1402 struct panfrost_resource *rsrc = pan_resource(view->base.texture);
1403 if (view->texture_bo != rsrc->bo->gpu ||
1404 view->layout != rsrc->layout) {
1405 panfrost_bo_unreference(view->bo);
1406 panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
1407 }
1408 }
1409
1410 void
1411 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1412 enum pipe_shader_type stage,
1413 struct mali_vertex_tiler_postfix *postfix)
1414 {
1415 struct panfrost_context *ctx = batch->ctx;
1416 struct panfrost_device *device = pan_device(ctx->base.screen);
1417
1418 if (!ctx->sampler_view_count[stage])
1419 return;
1420
1421 if (device->quirks & IS_BIFROST) {
1422 struct bifrost_texture_descriptor *descriptors;
1423
1424 descriptors = malloc(sizeof(struct bifrost_texture_descriptor) *
1425 ctx->sampler_view_count[stage]);
1426
1427 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1428 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1429 struct pipe_sampler_view *pview = &view->base;
1430 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1431 panfrost_update_sampler_view(view, &ctx->base);
1432
1433 /* Add the BOs to the job so they are retained until the job is done. */
1434
1435 panfrost_batch_add_bo(batch, rsrc->bo,
1436 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1437 panfrost_bo_access_for_stage(stage));
1438
1439 panfrost_batch_add_bo(batch, view->bo,
1440 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1441 panfrost_bo_access_for_stage(stage));
1442
1443 memcpy(&descriptors[i], view->bifrost_descriptor, sizeof(*view->bifrost_descriptor));
1444 }
1445
1446 postfix->textures = panfrost_upload_transient(batch,
1447 descriptors,
1448 sizeof(struct bifrost_texture_descriptor) *
1449 ctx->sampler_view_count[stage]);
1450
1451 free(descriptors);
1452 } else {
1453 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1454
1455 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1456 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1457
1458 panfrost_update_sampler_view(view, &ctx->base);
1459
1460 trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1461 }
1462
1463 postfix->textures = panfrost_upload_transient(batch,
1464 trampolines,
1465 sizeof(uint64_t) *
1466 ctx->sampler_view_count[stage]);
1467 }
1468 }
1469
1470 void
1471 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1472 enum pipe_shader_type stage,
1473 struct mali_vertex_tiler_postfix *postfix)
1474 {
1475 struct panfrost_context *ctx = batch->ctx;
1476 struct panfrost_device *device = pan_device(ctx->base.screen);
1477
1478 if (!ctx->sampler_count[stage])
1479 return;
1480
1481 if (device->quirks & IS_BIFROST) {
1482 size_t desc_size = sizeof(struct bifrost_sampler_descriptor);
1483 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1484 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1485 transfer_size);
1486 struct bifrost_sampler_descriptor *desc = (struct bifrost_sampler_descriptor *)transfer.cpu;
1487
1488 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1489 desc[i] = ctx->samplers[stage][i]->bifrost_hw;
1490
1491 postfix->sampler_descriptor = transfer.gpu;
1492 } else {
1493 size_t desc_size = sizeof(struct mali_sampler_descriptor);
1494 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1495 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1496 transfer_size);
1497 struct mali_sampler_descriptor *desc = (struct mali_sampler_descriptor *)transfer.cpu;
1498
1499 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1500 desc[i] = ctx->samplers[stage][i]->midgard_hw;
1501
1502 postfix->sampler_descriptor = transfer.gpu;
1503 }
1504 }
1505
1506 void
1507 panfrost_emit_vertex_attr_meta(struct panfrost_batch *batch,
1508 struct mali_vertex_tiler_postfix *vertex_postfix)
1509 {
1510 struct panfrost_context *ctx = batch->ctx;
1511
1512 if (!ctx->vertex)
1513 return;
1514
1515 struct panfrost_vertex_state *so = ctx->vertex;
1516
1517 panfrost_vertex_state_upd_attr_offs(ctx, vertex_postfix);
1518 vertex_postfix->attribute_meta = panfrost_upload_transient(batch, so->hw,
1519 sizeof(*so->hw) *
1520 PAN_MAX_ATTRIBUTE);
1521 }
1522
1523 void
1524 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1525 struct mali_vertex_tiler_postfix *vertex_postfix)
1526 {
1527 struct panfrost_context *ctx = batch->ctx;
1528 struct panfrost_vertex_state *so = ctx->vertex;
1529
1530 /* Staged mali_attr, and index into them. i =/= k, depending on the
1531 * vertex buffer mask and instancing. Twice as much room is allocated,
1532 * for a worst case of NPOT_DIVIDEs, which take up an extra slot */
1533 union mali_attr attrs[PIPE_MAX_ATTRIBS * 2];
1534 unsigned k = 0;
1535
1536 for (unsigned i = 0; i < so->num_elements; ++i) {
1537 /* We map a mali_attr to be 1:1 with the mali_attr_meta, which
1538 * means duplicating some vertex buffers (who cares? aside from
1539 * maybe some caching implications but I somehow doubt that
1540 * matters) */
1541
1542 struct pipe_vertex_element *elem = &so->pipe[i];
1543 unsigned vbi = elem->vertex_buffer_index;
1544
1545 /* The exception to 1:1 mapping is that we can have multiple
1546 * entries (NPOT divisors), so we fix up anyway */
1547
1548 so->hw[i].index = k;
1549
1550 if (!(ctx->vb_mask & (1 << vbi)))
1551 continue;
1552
1553 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1554 struct panfrost_resource *rsrc;
1555
1556 rsrc = pan_resource(buf->buffer.resource);
1557 if (!rsrc)
1558 continue;
1559
1560 /* Align to 64 bytes by masking off the lower bits. This
1561 * will be adjusted back when we fixup the src_offset in
1562 * mali_attr_meta */
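/* Worked example with a hypothetical address: raw_addr = 0x10000046
 * gives addr = 0x10000040 and chopped_addr = 6; those 6 bytes are added
 * back to the size below and re-applied as src_offset so no vertex data
 * gets clipped off the end. */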
1563
1564 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1565 mali_ptr addr = raw_addr & ~63;
1566 unsigned chopped_addr = raw_addr - addr;
1567
1568 /* Add a dependency of the batch on the vertex buffer */
1569 panfrost_batch_add_bo(batch, rsrc->bo,
1570 PAN_BO_ACCESS_SHARED |
1571 PAN_BO_ACCESS_READ |
1572 PAN_BO_ACCESS_VERTEX_TILER);
1573
1574 /* Set common fields */
1575 attrs[k].elements = addr;
1576 attrs[k].stride = buf->stride;
1577
1578 /* Since we advanced the base pointer, we shrink the buffer
1579 * size */
1580 attrs[k].size = rsrc->base.width0 - buf->buffer_offset;
1581
1582 /* We need to add the extra size we masked off (for
1583 * correctness) so the data doesn't get clamped away */
1584 attrs[k].size += chopped_addr;
1585
1586 /* For non-instancing make sure we initialize */
1587 attrs[k].shift = attrs[k].extra_flags = 0;
1588
1589 /* Instancing uses a dramatically different code path than
1590 * linear, so dispatch for the actual emission now that the
1591 * common code is finished */
1592
1593 unsigned divisor = elem->instance_divisor;
1594
1595 if (divisor && ctx->instance_count == 1) {
1596 /* Silly corner case where there's a divisor(=1) but
1597 * there's no legitimate instancing. So we want *every*
1598 * attribute to be the same. So set stride to zero so
1599 * we don't go anywhere. */
1600
1601 attrs[k].size = attrs[k].stride + chopped_addr;
1602 attrs[k].stride = 0;
1603 attrs[k++].elements |= MALI_ATTR_LINEAR;
1604 } else if (ctx->instance_count <= 1) {
1605 /* Normal, non-instanced attributes */
1606 attrs[k++].elements |= MALI_ATTR_LINEAR;
1607 } else {
1608 unsigned instance_shift = vertex_postfix->instance_shift;
1609 unsigned instance_odd = vertex_postfix->instance_odd;
1610
1611 k += panfrost_vertex_instanced(ctx->padded_count,
1612 instance_shift,
1613 instance_odd,
1614 divisor, &attrs[k]);
1615 }
1616 }
1617
1618 /* Add special gl_VertexID/gl_InstanceID buffers */
1619
1620 panfrost_vertex_id(ctx->padded_count, &attrs[k]);
1621 so->hw[PAN_VERTEX_ID].index = k++;
1622 panfrost_instance_id(ctx->padded_count, &attrs[k]);
1623 so->hw[PAN_INSTANCE_ID].index = k++;
1624
1625 /* Upload whatever we emitted and go */
1626
1627 vertex_postfix->attributes = panfrost_upload_transient(batch, attrs,
1628 k * sizeof(*attrs));
1629 }
1630
1631 static mali_ptr
1632 panfrost_emit_varyings(struct panfrost_batch *batch, union mali_attr *slot,
1633 unsigned stride, unsigned count)
1634 {
1635 /* Fill out the descriptor */
1636 slot->stride = stride;
1637 slot->size = stride * count;
1638 slot->shift = slot->extra_flags = 0;
1639
1640 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1641 slot->size);
1642
1643 slot->elements = transfer.gpu | MALI_ATTR_LINEAR;
1644
1645 return transfer.gpu;
1646 }
1647
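/* Streamout targets can be bound at arbitrary byte offsets, but the varying
 * buffer records below want a 64-byte-aligned base (the address is masked
 * with ~63). This returns the sub-64-byte remainder, which the varying
 * record's src_offset has to absorb; panfrost_emit_streamout() compensates
 * by growing the record size by the same amount. */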
1648 static unsigned
1649 panfrost_streamout_offset(unsigned stride, unsigned offset,
1650 struct pipe_stream_output_target *target)
1651 {
1652 return (target->buffer_offset + (offset * stride * 4)) & 63;
1653 }
1654
1655 static void
1656 panfrost_emit_streamout(struct panfrost_batch *batch, union mali_attr *slot,
1657 unsigned stride, unsigned offset, unsigned count,
1658 struct pipe_stream_output_target *target)
1659 {
1660 /* Fill out the descriptor */
1661 slot->stride = stride * 4;
1662 slot->shift = slot->extra_flags = 0;
1663
1664 unsigned max_size = target->buffer_size;
1665 unsigned expected_size = slot->stride * count;
1666
1667 /* Grab the BO and bind it to the batch */
1668 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1669
1670 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1671 * the perspective of the TILER and FRAGMENT.
1672 */
1673 panfrost_batch_add_bo(batch, bo,
1674 PAN_BO_ACCESS_SHARED |
1675 PAN_BO_ACCESS_RW |
1676 PAN_BO_ACCESS_VERTEX_TILER |
1677 PAN_BO_ACCESS_FRAGMENT);
1678
1679 /* We will have an offset applied to get alignment */
1680 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * slot->stride);
1681 slot->elements = (addr & ~63) | MALI_ATTR_LINEAR;
1682 slot->size = MIN2(max_size, expected_size) + (addr & 63);
1683 }
1684
1685 static bool
1686 has_point_coord(unsigned mask, gl_varying_slot loc)
1687 {
1688 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1689 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1690 else if (loc == VARYING_SLOT_PNTC)
1691 return (mask & (1 << 8));
1692 else
1693 return false;
1694 }
1695
1696 /* Helpers for manipulating stream out information so we can pack varyings
1697 * accordingly. Compute the src_offset for a given captured varying */
1698
1699 static struct pipe_stream_output *
1700 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1701 {
1702 for (unsigned i = 0; i < info->num_outputs; ++i) {
1703 if (info->output[i].register_index == loc)
1704 return &info->output[i];
1705 }
1706
1707 unreachable("Varying not captured");
1708 }
1709
1710 static unsigned
1711 pan_varying_size(enum mali_format fmt)
1712 {
1713 unsigned type = MALI_EXTRACT_TYPE(fmt);
1714 unsigned chan = MALI_EXTRACT_CHANNELS(fmt);
1715 unsigned bits = MALI_EXTRACT_BITS(fmt);
1716 unsigned bpc = 0;
1717
1718 if (bits == MALI_CHANNEL_FLOAT) {
1719 /* No doubles */
1720 bool fp16 = (type == MALI_FORMAT_SINT);
1721 assert(fp16 || (type == MALI_FORMAT_UNORM));
1722
1723 bpc = fp16 ? 2 : 4;
1724 } else {
1725 assert(type >= MALI_FORMAT_SNORM && type <= MALI_FORMAT_SINT);
1726
1727 /* See the enums */
1728 bits = 1 << bits;
1729 assert(bits >= 8);
1730 bpc = bits / 8;
1731 }
1732
1733 return bpc * chan;
1734 }
1735
1736 /* Indices for named (non-XFB) varyings that are present. These are packed
1737 * tightly so they correspond to a bitfield present (P) indexed by (1 <<
1738 * PAN_VARY_*). This has the nice property that you can lookup the buffer index
1739 * of a given special field given a shift S by:
1740 *
1741 * idx = popcount(P & ((1 << S) - 1))
1742 *
1743 * That is... look at all of the varyings that come earlier and count them; that
1744 * count is the index of the new one. Likewise, the total number of special
1745 * buffers required is simply popcount(P)
1746 */
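/* Worked example: with general, position and point size present,
 * P = 0b0111, so PAN_VARY_PSIZ (S = 2) lands in buffer
 * popcount(0b0111 & 0b0011) = 2 and pan_xfb_base(P) = popcount(0b0111) = 3,
 * i.e. any XFB buffers would start at index 3. */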
1747
1748 enum pan_special_varying {
1749 PAN_VARY_GENERAL = 0,
1750 PAN_VARY_POSITION = 1,
1751 PAN_VARY_PSIZ = 2,
1752 PAN_VARY_PNTCOORD = 3,
1753 PAN_VARY_FACE = 4,
1754 PAN_VARY_FRAGCOORD = 5,
1755
1756 /* Keep last */
1757 PAN_VARY_MAX,
1758 };
1759
1760 /* Given a varying, figure out which index it corresponds to */
1761
1762 static inline unsigned
1763 pan_varying_index(unsigned present, enum pan_special_varying v)
1764 {
1765 unsigned mask = (1 << v) - 1;
1766 return util_bitcount(present & mask);
1767 }
1768
1769 /* Get the base offset for XFB buffers, which by convention come after
1770 * everything else. Wrapper function for semantic reasons; by construction this
1771 * is just popcount. */
1772
1773 static inline unsigned
1774 pan_xfb_base(unsigned present)
1775 {
1776 return util_bitcount(present);
1777 }
1778
1779 /* Computes the present mask for varyings so we can start emitting varying records */
1780
1781 static inline unsigned
1782 pan_varying_present(
1783 struct panfrost_shader_state *vs,
1784 struct panfrost_shader_state *fs,
1785 unsigned quirks)
1786 {
1787 /* At the moment we always emit general and position buffers. Not
1788 * strictly necessary but usually harmless */
1789
1790 unsigned present = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION);
1791
1792 /* Enable special buffers by the shader info */
1793
1794 if (vs->writes_point_size)
1795 present |= (1 << PAN_VARY_PSIZ);
1796
1797 if (fs->reads_point_coord)
1798 present |= (1 << PAN_VARY_PNTCOORD);
1799
1800 if (fs->reads_face)
1801 present |= (1 << PAN_VARY_FACE);
1802
1803 if (fs->reads_frag_coord && !(quirks & IS_BIFROST))
1804 present |= (1 << PAN_VARY_FRAGCOORD);
1805
1806 /* Also, if we have a point sprite, we need a point coord buffer */
1807
1808 for (unsigned i = 0; i < fs->varying_count; i++) {
1809 gl_varying_slot loc = fs->varyings_loc[i];
1810
1811 if (has_point_coord(fs->point_sprite_mask, loc))
1812 present |= (1 << PAN_VARY_PNTCOORD);
1813 }
1814
1815 return present;
1816 }
1817
1818 /* Emitters for varying records */
1819
1820 static struct mali_attr_meta
1821 pan_emit_vary(unsigned present, enum pan_special_varying buf,
1822 unsigned quirks, enum mali_format format,
1823 unsigned offset)
1824 {
1825 unsigned nr_channels = MALI_EXTRACT_CHANNELS(format);
1826
1827 struct mali_attr_meta meta = {
1828 .index = pan_varying_index(present, buf),
1829 .unknown1 = quirks & IS_BIFROST ? 0x0 : 0x2,
1830 .swizzle = quirks & HAS_SWIZZLES ?
1831 panfrost_get_default_swizzle(nr_channels) :
1832 panfrost_bifrost_swizzle(nr_channels),
1833 .format = format,
1834 .src_offset = offset
1835 };
1836
1837 return meta;
1838 }
1839
1840 /* General varying with no counterpart in the linked stage, emitted as a discard record */
1841
1842 static struct mali_attr_meta
1843 pan_emit_vary_only(unsigned present, unsigned quirks)
1844 {
1845 return pan_emit_vary(present, 0, quirks, MALI_VARYING_DISCARD, 0);
1846 }
1847
1848 /* Special records */
1849
1850 static const enum mali_format pan_varying_formats[PAN_VARY_MAX] = {
1851 [PAN_VARY_POSITION] = MALI_VARYING_POS,
1852 [PAN_VARY_PSIZ] = MALI_R16F,
1853 [PAN_VARY_PNTCOORD] = MALI_R16F,
1854 [PAN_VARY_FACE] = MALI_R32I,
1855 [PAN_VARY_FRAGCOORD] = MALI_RGBA32F
1856 };
1857
1858 static struct mali_attr_meta
1859 pan_emit_vary_special(unsigned present, enum pan_special_varying buf,
1860 unsigned quirks)
1861 {
1862 assert(buf < PAN_VARY_MAX);
1863 return pan_emit_vary(present, buf, quirks, pan_varying_formats[buf], 0);
1864 }
1865
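/* XFB captures are written at highp (32 bits per channel): keep the varying's
 * base type, force 32-bit channels, and take the channel count from the
 * stream output record being captured. */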
1866 static enum mali_format
1867 pan_xfb_format(enum mali_format format, unsigned nr)
1868 {
1869 if (MALI_EXTRACT_BITS(format) == MALI_CHANNEL_FLOAT)
1870 return MALI_R32F | MALI_NR_CHANNELS(nr);
1871 else
1872 return MALI_EXTRACT_TYPE(format) | MALI_NR_CHANNELS(nr) | MALI_CHANNEL_32;
1873 }
1874
1875 /* Transform feedback records. Note struct pipe_stream_output is (if packed as
1876 * a bitfield) 32-bit, smaller than a 64-bit pointer, so may as well pass by
1877 * value. */
1878
1879 static struct mali_attr_meta
1880 pan_emit_vary_xfb(unsigned present,
1881 unsigned max_xfb,
1882 unsigned *streamout_offsets,
1883 unsigned quirks,
1884 enum mali_format format,
1885 struct pipe_stream_output o)
1886 {
1887         /* Construct a record for it */
1888 struct mali_attr_meta meta = {
1889 /* XFB buffers come after everything else */
1890 .index = pan_xfb_base(present) + o.output_buffer,
1891
1892 /* As usual unknown bit */
1893 .unknown1 = quirks & IS_BIFROST ? 0x0 : 0x2,
1894
1895 /* Override swizzle with number of channels */
1896 .swizzle = quirks & HAS_SWIZZLES ?
1897 panfrost_get_default_swizzle(o.num_components) :
1898 panfrost_bifrost_swizzle(o.num_components),
1899
1900 /* Override number of channels and precision to highp */
1901 .format = pan_xfb_format(format, o.num_components),
1902
1903 /* Apply given offsets together */
1904 .src_offset = (o.dst_offset * 4) /* dwords */
1905 + streamout_offsets[o.output_buffer]
1906 };
1907
1908 return meta;
1909 }
1910
1911 /* Determine if we should capture a varying for XFB. This requires actually
1912  * having a buffer for it. If we don't capture it, we'll fall back to a general
1913 * varying path (linked or unlinked, possibly discarding the write) */
1914
1915 static bool
1916 panfrost_xfb_captured(struct panfrost_shader_state *xfb,
1917 unsigned loc, unsigned max_xfb)
1918 {
1919 if (!(xfb->so_mask & (1ll << loc)))
1920 return false;
1921
1922 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1923 return o->output_buffer < max_xfb;
1924 }
1925
1926 /* Higher-level wrapper around all of the above, classifying a varying into one
1927 * of the above types */
1928
1929 static struct mali_attr_meta
1930 panfrost_emit_varying(
1931 struct panfrost_shader_state *stage,
1932 struct panfrost_shader_state *other,
1933 struct panfrost_shader_state *xfb,
1934 unsigned present,
1935 unsigned max_xfb,
1936 unsigned *streamout_offsets,
1937 unsigned quirks,
1938 unsigned *gen_offsets,
1939 enum mali_format *gen_formats,
1940 unsigned *gen_stride,
1941 unsigned idx,
1942 bool should_alloc,
1943 bool is_fragment)
1944 {
1945 gl_varying_slot loc = stage->varyings_loc[idx];
1946 enum mali_format format = stage->varyings[idx];
1947
1948 /* Override format to match linkage */
1949 if (!should_alloc && gen_formats[idx])
1950 format = gen_formats[idx];
1951
1952 if (has_point_coord(stage->point_sprite_mask, loc)) {
1953 return pan_emit_vary_special(present, PAN_VARY_PNTCOORD, quirks);
1954 } else if (panfrost_xfb_captured(xfb, loc, max_xfb)) {
1955 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1956 return pan_emit_vary_xfb(present, max_xfb, streamout_offsets, quirks, format, *o);
1957 } else if (loc == VARYING_SLOT_POS) {
1958 if (is_fragment)
1959 return pan_emit_vary_special(present, PAN_VARY_FRAGCOORD, quirks);
1960 else
1961 return pan_emit_vary_special(present, PAN_VARY_POSITION, quirks);
1962 } else if (loc == VARYING_SLOT_PSIZ) {
1963 return pan_emit_vary_special(present, PAN_VARY_PSIZ, quirks);
1964 } else if (loc == VARYING_SLOT_PNTC) {
1965 return pan_emit_vary_special(present, PAN_VARY_PNTCOORD, quirks);
1966 } else if (loc == VARYING_SLOT_FACE) {
1967 return pan_emit_vary_special(present, PAN_VARY_FACE, quirks);
1968 }
1969
1970 /* We've exhausted special cases, so it's otherwise a general varying. Check if we're linked */
1971 signed other_idx = -1;
1972
1973 for (unsigned j = 0; j < other->varying_count; ++j) {
1974 if (other->varyings_loc[j] == loc) {
1975 other_idx = j;
1976 break;
1977 }
1978 }
1979
1980 if (other_idx < 0)
1981 return pan_emit_vary_only(present, quirks);
1982
1983 unsigned offset = gen_offsets[other_idx];
1984
1985 if (should_alloc) {
1986                 /* We're linked, so allocate space via a watermark allocation */
1987 enum mali_format alt = other->varyings[other_idx];
1988
1989 /* Do interpolation at minimum precision */
1990 unsigned size_main = pan_varying_size(format);
1991 unsigned size_alt = pan_varying_size(alt);
1992 unsigned size = MIN2(size_main, size_alt);
1993
1994 /* If a varying is marked for XFB but not actually captured, we
1995 * should match the format to the format that would otherwise
1996 * be used for XFB, since dEQP checks for invariance here. It's
1997 * unclear if this is required by the spec. */
1998
1999 if (xfb->so_mask & (1ull << loc)) {
2000 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
2001 format = pan_xfb_format(format, o->num_components);
2002 size = pan_varying_size(format);
2003 } else if (size == size_alt) {
2004 format = alt;
2005 }
2006
2007 gen_offsets[idx] = *gen_stride;
2008 gen_formats[other_idx] = format;
2009 offset = *gen_stride;
2010 *gen_stride += size;
2011 }
2012
2013 return pan_emit_vary(present, PAN_VARY_GENERAL,
2014 quirks, format, offset);
2015 }
2016
2017 static void
2018 pan_emit_special_input(union mali_attr *varyings,
2019 unsigned present,
2020 enum pan_special_varying v,
2021 mali_ptr addr)
2022 {
2023 if (present & (1 << v)) {
2024 /* Ensure we write exactly once for performance and with fields
2025 * zeroed appropriately to avoid flakes */
2026
2027 union mali_attr s = {
2028 .elements = addr
2029 };
2030
2031 varyings[pan_varying_index(present, v)] = s;
2032 }
2033 }
2034
2035 void
2036 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
2037 unsigned vertex_count,
2038 struct mali_vertex_tiler_postfix *vertex_postfix,
2039 struct mali_vertex_tiler_postfix *tiler_postfix,
2040 union midgard_primitive_size *primitive_size)
2041 {
2042 /* Load the shaders */
2043 struct panfrost_context *ctx = batch->ctx;
2044 struct panfrost_device *dev = pan_device(ctx->base.screen);
2045 struct panfrost_shader_state *vs, *fs;
2046 size_t vs_size, fs_size;
2047
2048 /* Allocate the varying descriptor */
2049
2050 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
2051 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
2052 vs_size = sizeof(struct mali_attr_meta) * vs->varying_count;
2053 fs_size = sizeof(struct mali_attr_meta) * fs->varying_count;
2054
2055 struct panfrost_transfer trans = panfrost_allocate_transient(batch,
2056 vs_size +
2057 fs_size);
2058
2059 struct pipe_stream_output_info *so = &vs->stream_output;
2060 unsigned present = pan_varying_present(vs, fs, dev->quirks);
2061
2062 /* Check if this varying is linked by us. This is the case for
2063 * general-purpose, non-captured varyings. If it is, link it. If it's
2064 * not, use the provided stream out information to determine the
2065 * offset, since it was already linked for us. */
2066
2067 unsigned gen_offsets[32];
2068 enum mali_format gen_formats[32];
2069 memset(gen_offsets, 0, sizeof(gen_offsets));
2070 memset(gen_formats, 0, sizeof(gen_formats));
2071
2072 unsigned gen_stride = 0;
2073 assert(vs->varying_count < ARRAY_SIZE(gen_offsets));
2074 assert(fs->varying_count < ARRAY_SIZE(gen_offsets));
2075
2076 unsigned streamout_offsets[32];
2077
2078 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
2079 streamout_offsets[i] = panfrost_streamout_offset(
2080 so->stride[i],
2081 ctx->streamout.offsets[i],
2082 ctx->streamout.targets[i]);
2083 }
2084
2085 struct mali_attr_meta *ovs = (struct mali_attr_meta *)trans.cpu;
2086 struct mali_attr_meta *ofs = ovs + vs->varying_count;
2087
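        /* Emit the vertex-side records first, allocating general varying space
         * (should_alloc = true) as we link; the fragment-side records then
         * reuse the offsets and formats recorded during that pass. */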
2088 for (unsigned i = 0; i < vs->varying_count; i++) {
2089 ovs[i] = panfrost_emit_varying(vs, fs, vs, present,
2090 ctx->streamout.num_targets, streamout_offsets,
2091 dev->quirks,
2092 gen_offsets, gen_formats, &gen_stride, i, true, false);
2093 }
2094
2095 for (unsigned i = 0; i < fs->varying_count; i++) {
2096 ofs[i] = panfrost_emit_varying(fs, vs, vs, present,
2097 ctx->streamout.num_targets, streamout_offsets,
2098 dev->quirks,
2099 gen_offsets, gen_formats, &gen_stride, i, false, true);
2100 }
2101
2102 unsigned xfb_base = pan_xfb_base(present);
2103 struct panfrost_transfer T = panfrost_allocate_transient(batch,
2104 sizeof(union mali_attr) * (xfb_base + ctx->streamout.num_targets));
2105 union mali_attr *varyings = (union mali_attr *) T.cpu;
2106
2107 /* Emit the stream out buffers */
2108
2109 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
2110 ctx->vertex_count);
2111
2112 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
2113 panfrost_emit_streamout(batch, &varyings[xfb_base + i],
2114 so->stride[i],
2115 ctx->streamout.offsets[i],
2116 out_count,
2117 ctx->streamout.targets[i]);
2118 }
2119
2120 panfrost_emit_varyings(batch,
2121 &varyings[pan_varying_index(present, PAN_VARY_GENERAL)],
2122 gen_stride, vertex_count);
2123
2124 /* fp32 vec4 gl_Position */
2125 tiler_postfix->position_varying = panfrost_emit_varyings(batch,
2126 &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
2127 sizeof(float) * 4, vertex_count);
2128
2129 if (present & (1 << PAN_VARY_PSIZ)) {
2130 primitive_size->pointer = panfrost_emit_varyings(batch,
2131 &varyings[pan_varying_index(present, PAN_VARY_PSIZ)],
2132 2, vertex_count);
2133 }
2134
2135 pan_emit_special_input(varyings, present, PAN_VARY_PNTCOORD, MALI_VARYING_POINT_COORD);
2136 pan_emit_special_input(varyings, present, PAN_VARY_FACE, MALI_VARYING_FRONT_FACING);
2137 pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD, MALI_VARYING_FRAG_COORD);
2138
2139 vertex_postfix->varyings = T.gpu;
2140 tiler_postfix->varyings = T.gpu;
2141
2142 vertex_postfix->varying_meta = trans.gpu;
2143 tiler_postfix->varying_meta = trans.gpu + vs_size;
2144 }
2145
2146 void
2147 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
2148 struct mali_vertex_tiler_prefix *vertex_prefix,
2149 struct mali_vertex_tiler_postfix *vertex_postfix,
2150 struct mali_vertex_tiler_prefix *tiler_prefix,
2151 struct mali_vertex_tiler_postfix *tiler_postfix,
2152 union midgard_primitive_size *primitive_size)
2153 {
2154 struct panfrost_context *ctx = batch->ctx;
2155 struct panfrost_device *device = pan_device(ctx->base.screen);
2156 bool wallpapering = ctx->wallpaper_batch && batch->tiler_dep;
2157 struct bifrost_payload_vertex bifrost_vertex = {0,};
2158 struct bifrost_payload_tiler bifrost_tiler = {0,};
2159 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
2160 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
2161 void *vp, *tp;
2162 size_t vp_size, tp_size;
2163
2164 if (device->quirks & IS_BIFROST) {
2165 bifrost_vertex.prefix = *vertex_prefix;
2166 bifrost_vertex.postfix = *vertex_postfix;
2167 vp = &bifrost_vertex;
2168 vp_size = sizeof(bifrost_vertex);
2169
2170 bifrost_tiler.prefix = *tiler_prefix;
2171 bifrost_tiler.tiler.primitive_size = *primitive_size;
2172 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
2173 bifrost_tiler.postfix = *tiler_postfix;
2174 tp = &bifrost_tiler;
2175 tp_size = sizeof(bifrost_tiler);
2176 } else {
2177 midgard_vertex.prefix = *vertex_prefix;
2178 midgard_vertex.postfix = *vertex_postfix;
2179 vp = &midgard_vertex;
2180 vp_size = sizeof(midgard_vertex);
2181
2182 midgard_tiler.prefix = *tiler_prefix;
2183 midgard_tiler.postfix = *tiler_postfix;
2184 midgard_tiler.primitive_size = *primitive_size;
2185 tp = &midgard_tiler;
2186 tp_size = sizeof(midgard_tiler);
2187 }
2188
2189 if (wallpapering) {
2190 /* Inject in reverse order, with "predicted" job indices.
2191 * THIS IS A HACK XXX */
2192 panfrost_new_job(batch, JOB_TYPE_TILER, false,
2193 batch->job_index + 2, tp, tp_size, true);
2194 panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
2195 vp, vp_size, true);
2196 return;
2197 }
2198
2199         /* If rasterizer discard is enabled, only submit the vertex job */
2200
2201 bool rasterizer_discard = ctx->rasterizer &&
2202 ctx->rasterizer->base.rasterizer_discard;
2203
2204 unsigned vertex = panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
2205 vp, vp_size, false);
2206
2207 if (rasterizer_discard)
2208 return;
2209
2210 panfrost_new_job(batch, JOB_TYPE_TILER, false, vertex, tp, tp_size,
2211 false);
2212 }
2213
2214 /* TODO: stop hardcoding this */
2215 mali_ptr
2216 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2217 {
2218 uint16_t locations[] = {
2219 128, 128,
2220 0, 256,
2221 0, 256,
2222 0, 256,
2223 0, 256,
2224 0, 256,
2225 0, 256,
2226 0, 256,
2227 0, 256,
2228 0, 256,
2229 0, 256,
2230 0, 256,
2231 0, 256,
2232 0, 256,
2233 0, 256,
2234 0, 256,
2235 0, 256,
2236 0, 256,
2237 0, 256,
2238 0, 256,
2239 0, 256,
2240 0, 256,
2241 0, 256,
2242 0, 256,
2243 0, 256,
2244 0, 256,
2245 0, 256,
2246 0, 256,
2247 0, 256,
2248 0, 256,
2249 0, 256,
2250 0, 256,
2251 128, 128,
2252 0, 0,
2253 0, 0,
2254 0, 0,
2255 0, 0,
2256 0, 0,
2257 0, 0,
2258 0, 0,
2259 0, 0,
2260 0, 0,
2261 0, 0,
2262 0, 0,
2263 0, 0,
2264 0, 0,
2265 0, 0,
2266 0, 0,
2267 };
2268
2269 return panfrost_upload_transient(batch, locations, 96 * sizeof(uint16_t));
2270 }