1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_pool.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 unsigned shift = panfrost_get_stack_shift(batch->stack_size);
62 struct mali_shared_memory shared = {
63 .stack_shift = shift,
64 .scratchpad = panfrost_batch_get_scratchpad(batch, shift, dev->thread_tls_alloc, dev->core_count)->gpu,
65 .shared_workgroup_count = ~0,
66 };
67 postfix->shared_memory = panfrost_pool_upload(&batch->pool, &shared, sizeof(shared));
68 }
69
70 static void
71 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
72 struct mali_vertex_tiler_postfix *postfix)
73 {
74 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
75 postfix->shared_memory = panfrost_batch_reserve_framebuffer(batch);
76 }
77
78 static void
79 panfrost_vt_update_rasterizer(struct panfrost_rasterizer *rasterizer,
80 struct mali_vertex_tiler_prefix *prefix,
81 struct mali_vertex_tiler_postfix *postfix)
82 {
83 postfix->gl_enables |= 0x7;
84 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
85 rasterizer->base.front_ccw);
86 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
87 (rasterizer->base.cull_face & PIPE_FACE_FRONT));
88 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
89 (rasterizer->base.cull_face & PIPE_FACE_BACK));
90 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
91 rasterizer->base.flatshade_first);
92 }
93
94 void
95 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
96 struct mali_vertex_tiler_prefix *prefix,
97 union midgard_primitive_size *primitive_size)
98 {
99 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
100
101 if (!panfrost_writes_point_size(ctx)) {
102 float val = (prefix->draw_mode == MALI_DRAW_MODE_POINTS) ?
103 rasterizer->base.point_size :
104 rasterizer->base.line_width;
105
106 primitive_size->constant = val;
107 }
108 }
109
110 static void
111 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
112 struct mali_vertex_tiler_postfix *postfix)
113 {
114 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
115 if (ctx->occlusion_query) {
116 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
117 panfrost_batch_add_bo(ctx->batch, ctx->occlusion_query->bo,
118 PAN_BO_ACCESS_SHARED |
119 PAN_BO_ACCESS_RW |
120 PAN_BO_ACCESS_FRAGMENT);
121 } else {
122 postfix->occlusion_counter = 0;
123 }
124 }
125
126 void
127 panfrost_vt_init(struct panfrost_context *ctx,
128 enum pipe_shader_type stage,
129 struct mali_vertex_tiler_prefix *prefix,
130 struct mali_vertex_tiler_postfix *postfix)
131 {
132 struct panfrost_device *device = pan_device(ctx->base.screen);
133
134 if (!ctx->shader[stage])
135 return;
136
137 memset(prefix, 0, sizeof(*prefix));
138 memset(postfix, 0, sizeof(*postfix));
139
140 if (device->quirks & IS_BIFROST) {
141 postfix->gl_enables = 0x2;
142 panfrost_vt_emit_shared_memory(ctx, postfix);
143 } else {
144 postfix->gl_enables = 0x6;
145 panfrost_vt_attach_framebuffer(ctx, postfix);
146 }
147
148 if (stage == PIPE_SHADER_FRAGMENT) {
149 panfrost_vt_update_occlusion_query(ctx, postfix);
150 panfrost_vt_update_rasterizer(ctx->rasterizer, prefix, postfix);
151 }
152 }
153
154 static unsigned
155 panfrost_translate_index_size(unsigned size)
156 {
157 switch (size) {
158 case 1:
159 return MALI_DRAW_INDEXED_UINT8;
160
161 case 2:
162 return MALI_DRAW_INDEXED_UINT16;
163
164 case 4:
165 return MALI_DRAW_INDEXED_UINT32;
166
167 default:
168 unreachable("Invalid index size");
169 }
170 }
171
172 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
173 * good for the duration of the draw (transient), though it could last longer. Also get
174 * the bounds on the index buffer for the range accessed by the draw. We do
175 * these operations together because there are natural optimizations which
176 * require them to be together. */
177
178 static mali_ptr
179 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
180 const struct pipe_draw_info *info,
181 unsigned *min_index, unsigned *max_index)
182 {
183 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
184 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
185 off_t offset = info->start * info->index_size;
186 bool needs_indices = true;
187 mali_ptr out = 0;
188
189 if (info->max_index != ~0u) {
190 *min_index = info->min_index;
191 *max_index = info->max_index;
192 needs_indices = false;
193 }
194
195 if (!info->has_user_indices) {
196 /* Only resources can be directly mapped */
197 panfrost_batch_add_bo(batch, rsrc->bo,
198 PAN_BO_ACCESS_SHARED |
199 PAN_BO_ACCESS_READ |
200 PAN_BO_ACCESS_VERTEX_TILER);
201 out = rsrc->bo->gpu + offset;
202
203 /* Check the cache */
204 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
205 info->start,
206 info->count,
207 min_index,
208 max_index);
209 } else {
210 /* Otherwise, we need to upload to transient memory */
211 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
212 out = panfrost_pool_upload(&batch->pool, ibuf8 + offset,
213 info->count *
214 info->index_size);
215 }
216
217 if (needs_indices) {
218 /* Fallback */
219 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
220
221 if (!info->has_user_indices)
222 panfrost_minmax_cache_add(rsrc->index_cache,
223 info->start, info->count,
224 *min_index, *max_index);
225 }
226
227 return out;
228 }
229
230 void
231 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
232 const struct pipe_draw_info *info,
233 enum mali_draw_mode draw_mode,
234 struct mali_vertex_tiler_postfix *vertex_postfix,
235 struct mali_vertex_tiler_prefix *tiler_prefix,
236 struct mali_vertex_tiler_postfix *tiler_postfix,
237 unsigned *vertex_count,
238 unsigned *padded_count)
239 {
240 tiler_prefix->draw_mode = draw_mode;
241
242 unsigned draw_flags = 0;
243
244 if (panfrost_writes_point_size(ctx))
245 draw_flags |= MALI_DRAW_VARYING_SIZE;
246
247 if (info->primitive_restart)
248 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
249
250 /* These don't make much sense */
251
252 draw_flags |= 0x3000;
253
254 if (info->index_size) {
255 unsigned min_index = 0, max_index = 0;
256
257 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
258 info,
259 &min_index,
260 &max_index);
261
262 /* Use the corresponding values */
263 *vertex_count = max_index - min_index + 1;
264 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
265 tiler_prefix->offset_bias_correction = -min_index;
266 tiler_prefix->index_count = MALI_POSITIVE(info->count);
267 draw_flags |= panfrost_translate_index_size(info->index_size);
268 } else {
269 tiler_prefix->indices = 0;
270 *vertex_count = ctx->vertex_count;
271 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
272 tiler_prefix->offset_bias_correction = 0;
273 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
274 }
275
276 tiler_prefix->unknown_draw = draw_flags;
277
278 /* Encode the padded vertex count */
279
280 if (info->instance_count > 1) {
281 *padded_count = panfrost_padded_vertex_count(*vertex_count);
282
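/* The instancing fields encode the padded count decomposed as
 * (2 * odd + 1) << shift; e.g. a padded count of 24 gives shift = 3
 * and odd = 1, since (2 * 1 + 1) << 3 == 24. */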
283 unsigned shift = __builtin_ctz(ctx->padded_count);
284 unsigned k = ctx->padded_count >> (shift + 1);
285
286 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
287 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
288 } else {
289 *padded_count = *vertex_count;
290
291 /* Reset instancing state */
292 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
293 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
294 }
295 }
296
297 static void
298 panfrost_shader_meta_init(struct panfrost_context *ctx,
299 enum pipe_shader_type st,
300 struct mali_shader_meta *meta)
301 {
302 const struct panfrost_device *dev = pan_device(ctx->base.screen);
303 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
304
305 memset(meta, 0, sizeof(*meta));
306 meta->shader = (ss->bo ? ss->bo->gpu : 0) | ss->first_tag;
307 meta->attribute_count = ss->attribute_count;
308 meta->varying_count = ss->varying_count;
309 meta->texture_count = ctx->sampler_view_count[st];
310 meta->sampler_count = ctx->sampler_count[st];
311
312 if (dev->quirks & IS_BIFROST) {
313 if (st == PIPE_SHADER_VERTEX)
314 meta->bifrost1.unk1 = 0x800000;
315 else {
316 /* First clause ATEST |= 0x4000000.
317 * Less than 32 regs |= 0x200 */
318 meta->bifrost1.unk1 = 0x950020;
319 }
320
321 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
322 if (st == PIPE_SHADER_VERTEX)
323 meta->bifrost2.preload_regs = 0xC0;
324 else {
325 meta->bifrost2.preload_regs = 0x1;
326 SET_BIT(meta->bifrost2.preload_regs, 0x10, ss->reads_frag_coord);
327 }
328
329 meta->bifrost2.uniform_count = MIN2(ss->uniform_count,
330 ss->uniform_cutoff);
331 } else {
332 meta->midgard1.uniform_count = MIN2(ss->uniform_count,
333 ss->uniform_cutoff);
334 meta->midgard1.work_count = ss->work_reg_count;
335
336 /* TODO: This is not conformant on ES3 */
337 meta->midgard1.flags_hi = MALI_SUPPRESS_INF_NAN;
338
339 meta->midgard1.flags_lo = 0x20;
340 meta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
341
342 SET_BIT(meta->midgard1.flags_hi, MALI_WRITES_GLOBAL, ss->writes_global);
343 }
344 }
345
346 static unsigned
347 translate_tex_wrap(enum pipe_tex_wrap w)
348 {
349 switch (w) {
350 case PIPE_TEX_WRAP_REPEAT: return MALI_WRAP_MODE_REPEAT;
351 case PIPE_TEX_WRAP_CLAMP: return MALI_WRAP_MODE_CLAMP;
352 case PIPE_TEX_WRAP_CLAMP_TO_EDGE: return MALI_WRAP_MODE_CLAMP_TO_EDGE;
353 case PIPE_TEX_WRAP_CLAMP_TO_BORDER: return MALI_WRAP_MODE_CLAMP_TO_BORDER;
354 case PIPE_TEX_WRAP_MIRROR_REPEAT: return MALI_WRAP_MODE_MIRRORED_REPEAT;
355 case PIPE_TEX_WRAP_MIRROR_CLAMP: return MALI_WRAP_MODE_MIRRORED_CLAMP;
356 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE;
357 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER;
358 default: unreachable("Invalid wrap");
359 }
360 }
361
362 /* The hardware compares in the wrong order, so we have to flip before
363 * encoding. Yes, really. */
364
365 static enum mali_func
366 panfrost_sampler_compare_func(const struct pipe_sampler_state *cso)
367 {
368 if (!cso->compare_mode)
369 return MALI_FUNC_NEVER;
370
371 enum mali_func f = panfrost_translate_compare_func(cso->compare_func);
372 return panfrost_flip_compare_func(f);
373 }
374
375 static enum mali_mipmap_mode
376 pan_pipe_to_mipmode(enum pipe_tex_mipfilter f)
377 {
378 switch (f) {
379 case PIPE_TEX_MIPFILTER_NEAREST: return MALI_MIPMAP_MODE_NEAREST;
380 case PIPE_TEX_MIPFILTER_LINEAR: return MALI_MIPMAP_MODE_TRILINEAR;
381 case PIPE_TEX_MIPFILTER_NONE: return MALI_MIPMAP_MODE_NONE;
382 default: unreachable("Invalid");
383 }
384 }
385
386 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
387 struct mali_midgard_sampler_packed *hw)
388 {
389 pan_pack(hw, MIDGARD_SAMPLER, cfg) {
390 cfg.magnify_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
391 cfg.minify_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
392 cfg.mipmap_mode = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR) ?
393 MALI_MIPMAP_MODE_TRILINEAR : MALI_MIPMAP_MODE_NEAREST;
394 cfg.normalized_coordinates = cso->normalized_coords;
395
396 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
397
398 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
399
400 /* If necessary, we disable mipmapping in the sampler descriptor by
401 * clamping the LOD as tight as possible (from 0 to epsilon,
402 * essentially -- remember these are fixed point numbers, so
403 * epsilon=1/256) */
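/* For example, with min_lod = 0 this yields minimum_lod = 0 and
 * maximum_lod = 1, i.e. an LOD range of [0, 1/256] assuming the 8.8
 * fixed-point encoding implied by epsilon = 1/256 above. */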
404
405 cfg.maximum_lod = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) ?
406 cfg.minimum_lod + 1 :
407 FIXED_16(cso->max_lod, false);
408
409 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
410 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
411 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
412
413 cfg.compare_function = panfrost_sampler_compare_func(cso);
414 cfg.seamless_cube_map = cso->seamless_cube_map;
415
416 cfg.border_color_r = cso->border_color.f[0];
417 cfg.border_color_g = cso->border_color.f[1];
418 cfg.border_color_b = cso->border_color.f[2];
419 cfg.border_color_a = cso->border_color.f[3];
420 }
421 }
422
423 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
424 struct mali_bifrost_sampler_packed *hw)
425 {
426 pan_pack(hw, BIFROST_SAMPLER, cfg) {
427 cfg.magnify_linear = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR;
428 cfg.minify_linear = cso->min_img_filter == PIPE_TEX_FILTER_LINEAR;
429 cfg.mipmap_mode = pan_pipe_to_mipmode(cso->min_mip_filter);
430 cfg.normalized_coordinates = cso->normalized_coords;
431
432 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
433 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
434 cfg.maximum_lod = FIXED_16(cso->max_lod, false);
435
436 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
437 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
438 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
439
440 cfg.compare_function = panfrost_sampler_compare_func(cso);
441 cfg.seamless_cube_map = cso->seamless_cube_map;
442 }
443 }
444
445 static void
446 panfrost_frag_meta_rasterizer_update(struct panfrost_context *ctx,
447 struct mali_shader_meta *fragmeta)
448 {
449 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
450
451 bool msaa = rast->multisample;
452
453 /* TODO: Sample size */
454 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
455 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
456
457 struct panfrost_shader_state *fs;
458 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
459
460 /* EXT_shader_framebuffer_fetch requires the shader to be run
461 * per-sample when outputs are read. */
462 bool per_sample = ctx->min_samples > 1 || fs->outputs_read;
463 SET_BIT(fragmeta->unknown2_3, MALI_PER_SAMPLE, msaa && per_sample);
464
465 fragmeta->depth_units = rast->offset_units * 2.0f;
466 fragmeta->depth_factor = rast->offset_scale;
467
468 /* XXX: Which bit is which? Does this maybe allow offsetting non-triangles? */
469
470 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, rast->offset_tri);
471 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, rast->offset_tri);
472
473 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_NEAR, rast->depth_clip_near);
474 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_FAR, rast->depth_clip_far);
475 }
476
477 static void
478 panfrost_frag_meta_zsa_update(struct panfrost_context *ctx,
479 struct mali_shader_meta *fragmeta)
480 {
481 const struct panfrost_zsa_state *so = ctx->depth_stencil;
482 int zfunc = PIPE_FUNC_ALWAYS;
483
484 if (!so) {
485 /* If stenciling is disabled, the state is irrelevant */
486 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST, false);
487 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK, false);
488 } else {
489 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
490 so->base.stencil[0].enabled);
491
492 fragmeta->stencil_mask_front = so->stencil_mask_front;
493 fragmeta->stencil_mask_back = so->stencil_mask_back;
494
495 /* Bottom bits for stencil ref, exactly one word */
496 fragmeta->stencil_front.opaque[0] = so->stencil_front.opaque[0] | ctx->stencil_ref.ref_value[0];
497
498 /* If back-stencil is not enabled, use the front values */
499
500 if (so->base.stencil[1].enabled)
501 fragmeta->stencil_back.opaque[0] = so->stencil_back.opaque[0] | ctx->stencil_ref.ref_value[1];
502 else
503 fragmeta->stencil_back = fragmeta->stencil_front;
504
505 if (so->base.depth.enabled)
506 zfunc = so->base.depth.func;
507
508 /* Depth state (TODO: Refactor) */
509
510 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
511 so->base.depth.writemask);
512 }
513
514 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
515 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(zfunc));
516 }
517
518 static bool
519 panfrost_fs_required(
520 struct panfrost_shader_state *fs,
521 struct panfrost_blend_final *blend,
522 unsigned rt_count)
523 {
524 /* If we generally have side effects */
525 if (fs->fs_sidefx)
526 return true;
527
528 /* If colour is written we need to execute */
529 for (unsigned i = 0; i < rt_count; ++i) {
530 if (!blend[i].no_colour)
531 return true;
532 }
533
534 /* If depth is written and not implied we need to execute.
535 * TODO: Predicate on Z/S writes being enabled */
536 return (fs->writes_depth || fs->writes_stencil);
537 }
538
539 static void
540 panfrost_frag_meta_blend_update(struct panfrost_context *ctx,
541 struct mali_shader_meta *fragmeta,
542 void *rts)
543 {
544 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
545 const struct panfrost_device *dev = pan_device(ctx->base.screen);
546 struct panfrost_shader_state *fs;
547 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
548
549 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
550 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
551 !ctx->blend->base.dither);
552
553 SET_BIT(fragmeta->unknown2_4, MALI_ALPHA_TO_COVERAGE,
554 ctx->blend->base.alpha_to_coverage);
555
556 /* Get blending setup */
557 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
558
559 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
560 unsigned shader_offset = 0;
561 struct panfrost_bo *shader_bo = NULL;
562
563 for (unsigned c = 0; c < rt_count; ++c)
564 blend[c] = panfrost_get_blend_for_context(ctx, c, &shader_bo,
565 &shader_offset);
566
567 /* Disable shader execution if we can */
568 if (dev->quirks & MIDGARD_SHADERLESS
569 && !panfrost_fs_required(fs, blend, rt_count)) {
570 fragmeta->shader = 0;
571 fragmeta->attribute_count = 0;
572 fragmeta->varying_count = 0;
573 fragmeta->texture_count = 0;
574 fragmeta->sampler_count = 0;
575
576 /* This feature is not known to work on Bifrost */
577 fragmeta->midgard1.work_count = 1;
578 fragmeta->midgard1.uniform_count = 0;
579 fragmeta->midgard1.uniform_buffer_count = 0;
580 }
581
582 /* If there is a blend shader, work registers are shared. We impose 8
583 * work registers as a limit for blend shaders. Should be lower XXX */
584
585 if (!(dev->quirks & IS_BIFROST)) {
586 for (unsigned c = 0; c < rt_count; ++c) {
587 if (blend[c].is_shader) {
588 fragmeta->midgard1.work_count =
589 MAX2(fragmeta->midgard1.work_count, 8);
590 }
591 }
592 }
593
594 /* Even on MFBD, the shader descriptor gets blend shaders. It's *also*
595 * copied to the blend_meta appended (by convention), but this is the
596 * field actually read by the hardware. (Or maybe both are read...?).
597 * Specify the last RTi with a blend shader. */
598
599 fragmeta->blend.shader = 0;
600
601 for (signed rt = (rt_count - 1); rt >= 0; --rt) {
602 if (!blend[rt].is_shader)
603 continue;
604
605 fragmeta->blend.shader = blend[rt].shader.gpu |
606 blend[rt].shader.first_tag;
607 break;
608 }
609
610 if (dev->quirks & MIDGARD_SFBD) {
611 /* On platforms with only a single render target (SFBD), the blend
612 * information is inside the shader meta itself. We additionally
613 * need to signal CAN_DISCARD for nontrivial blend modes (so
614 * we're able to read back the destination buffer) */
615
616 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
617 blend[0].is_shader);
618
619 if (!blend[0].is_shader) {
620 fragmeta->blend.equation = *blend[0].equation.equation;
621 fragmeta->blend.constant = blend[0].equation.constant;
622 }
623
624 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
625 !blend[0].no_blending || fs->can_discard);
626
627 batch->draws |= PIPE_CLEAR_COLOR0;
628 return;
629 }
630
631 if (dev->quirks & IS_BIFROST) {
632 bool no_blend = true;
633
634 for (unsigned i = 0; i < rt_count; ++i)
635 no_blend &= (blend[i].no_blending | blend[i].no_colour);
636
637 SET_BIT(fragmeta->bifrost1.unk1, MALI_BIFROST_EARLY_Z,
638 !fs->can_discard && !fs->writes_depth && no_blend);
639 }
640
641 /* Additional blend descriptor tacked on for jobs using MFBD */
642
643 for (unsigned i = 0; i < rt_count; ++i) {
644 unsigned flags = 0;
645
646 if (ctx->pipe_framebuffer.nr_cbufs > i && !blend[i].no_colour) {
647 flags = 0x200;
648 batch->draws |= (PIPE_CLEAR_COLOR0 << i);
649
650 bool is_srgb = (ctx->pipe_framebuffer.nr_cbufs > i) &&
651 (ctx->pipe_framebuffer.cbufs[i]) &&
652 util_format_is_srgb(ctx->pipe_framebuffer.cbufs[i]->format);
653
654 SET_BIT(flags, MALI_BLEND_MRT_SHADER, blend[i].is_shader);
655 SET_BIT(flags, MALI_BLEND_LOAD_TIB, !blend[i].no_blending);
656 SET_BIT(flags, MALI_BLEND_SRGB, is_srgb);
657 SET_BIT(flags, MALI_BLEND_NO_DITHER, !ctx->blend->base.dither);
658 }
659
660 if (dev->quirks & IS_BIFROST) {
661 struct bifrost_blend_rt *brts = rts;
662
663 brts[i].flags = flags;
664
665 if (blend[i].is_shader) {
666 /* The blend shader's address needs to be at
667 * the same top 32 bit as the fragment shader.
668 * TODO: Ensure that's always the case.
669 */
670 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
671 (fs->bo->gpu & (0xffffffffull << 32)));
672 brts[i].shader = blend[i].shader.gpu;
673 brts[i].unk2 = 0x0;
674 } else if (ctx->pipe_framebuffer.nr_cbufs > i) {
675 enum pipe_format format = ctx->pipe_framebuffer.cbufs[i]->format;
676 const struct util_format_description *format_desc;
677 format_desc = util_format_description(format);
678
679 brts[i].equation = *blend[i].equation.equation;
680
681 /* TODO: this is a bit more complicated */
682 brts[i].constant = blend[i].equation.constant;
683
684 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
685
686 /* 0x19 disables blending and forces REPLACE
687 * mode (equivalent to rgb_mode = alpha_mode =
688 * 0x122, colour mask = 0xF). 0x1a allows
689 * blending. */
690 brts[i].unk2 = blend[i].no_blending ? 0x19 : 0x1a;
691
692 brts[i].shader_type = fs->blend_types[i];
693 } else {
694 /* Dummy attachment for depth-only */
695 brts[i].unk2 = 0x3;
696 brts[i].shader_type = fs->blend_types[i];
697 }
698 } else {
699 struct midgard_blend_rt *mrts = rts;
700 mrts[i].flags = flags;
701
702 if (blend[i].is_shader) {
703 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
704 } else {
705 mrts[i].blend.equation = *blend[i].equation.equation;
706 mrts[i].blend.constant = blend[i].equation.constant;
707 }
708 }
709 }
710 }
711
712 static void
713 panfrost_frag_shader_meta_init(struct panfrost_context *ctx,
714 struct mali_shader_meta *fragmeta,
715 void *rts)
716 {
717 const struct panfrost_device *dev = pan_device(ctx->base.screen);
718 struct panfrost_shader_state *fs;
719
720 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
721
722 bool msaa = ctx->rasterizer->base.multisample;
723 fragmeta->coverage_mask = msaa ? ctx->sample_mask : ~0;
724
725 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x10;
726 fragmeta->unknown2_4 = 0x4e0;
727
728 /* unknown2_4 has 0x10 bit set on T6XX and T720. We don't know why this
729 * is required (independent of 32-bit/64-bit descriptors), or why it's
730 * not used on later GPU revisions. Otherwise, all shader jobs fault on
731 * these earlier chips (perhaps this is a chicken bit of some kind).
732 * More investigation is needed. */
733
734 SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
735
736 if (dev->quirks & IS_BIFROST) {
737 /* TODO */
738 } else {
739 /* Depending on whether it's legal in the given shader, we try to
740 * enable early-z testing. TODO: respect e-z force */
741
742 SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
743 !fs->can_discard && !fs->writes_global &&
744 !fs->writes_depth && !fs->writes_stencil &&
745 !ctx->blend->base.alpha_to_coverage);
746
747 /* Add the writes Z/S flags if needed. */
748 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
749 SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
750
751 /* Any time texturing is used, derivatives are implicitly calculated,
752 * so we need to enable helper invocations */
753
754 SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
755 fs->helper_invocations);
756
757 /* If discard is enabled, which bit we set to convey this
758 * depends on if depth/stencil is used for the draw or not.
759 * Just one of depth OR stencil is enough to trigger this. */
760
761 const struct pipe_depth_stencil_alpha_state *zsa = &ctx->depth_stencil->base;
762 bool zs_enabled = fs->writes_depth || fs->writes_stencil;
763
764 if (zsa) {
765 zs_enabled |= (zsa->depth.enabled && zsa->depth.func != PIPE_FUNC_ALWAYS);
766 zs_enabled |= zsa->stencil[0].enabled;
767 }
768
769 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_TILEBUFFER,
770 fs->outputs_read || (!zs_enabled && fs->can_discard));
771 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_ZS, zs_enabled && fs->can_discard);
772 }
773
774 panfrost_frag_meta_rasterizer_update(ctx, fragmeta);
775 panfrost_frag_meta_zsa_update(ctx, fragmeta);
776 panfrost_frag_meta_blend_update(ctx, fragmeta, rts);
777 }
778
779 void
780 panfrost_emit_shader_meta(struct panfrost_batch *batch,
781 enum pipe_shader_type st,
782 struct mali_vertex_tiler_postfix *postfix)
783 {
784 struct panfrost_context *ctx = batch->ctx;
785 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
786
787 if (!ss) {
788 postfix->shader = 0;
789 return;
790 }
791
792 struct mali_shader_meta meta;
793
794 panfrost_shader_meta_init(ctx, st, &meta);
795
796 /* Add the shader BO to the batch. */
797 panfrost_batch_add_bo(batch, ss->bo,
798 PAN_BO_ACCESS_PRIVATE |
799 PAN_BO_ACCESS_READ |
800 panfrost_bo_access_for_stage(st));
801
802 mali_ptr shader_ptr;
803
804 if (st == PIPE_SHADER_FRAGMENT) {
805 struct panfrost_device *dev = pan_device(ctx->base.screen);
806 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
807 size_t desc_size = sizeof(meta);
808 void *rts = NULL;
809 struct panfrost_transfer xfer;
810 unsigned rt_size;
811
812 if (dev->quirks & MIDGARD_SFBD)
813 rt_size = 0;
814 else if (dev->quirks & IS_BIFROST)
815 rt_size = sizeof(struct bifrost_blend_rt);
816 else
817 rt_size = sizeof(struct midgard_blend_rt);
818
819 desc_size += rt_size * rt_count;
820
821 if (rt_size)
822 rts = rzalloc_size(ctx, rt_size * rt_count);
823
824 panfrost_frag_shader_meta_init(ctx, &meta, rts);
825
826 xfer = panfrost_pool_alloc(&batch->pool, desc_size);
827
828 memcpy(xfer.cpu, &meta, sizeof(meta));
829 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
830
831 if (rt_size)
832 ralloc_free(rts);
833
834 shader_ptr = xfer.gpu;
835 } else {
836 shader_ptr = panfrost_pool_upload(&batch->pool, &meta,
837 sizeof(meta));
838 }
839
840 postfix->shader = shader_ptr;
841 }
842
843 void
844 panfrost_emit_viewport(struct panfrost_batch *batch,
845 struct mali_vertex_tiler_postfix *tiler_postfix)
846 {
847 struct panfrost_context *ctx = batch->ctx;
848 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
849 const struct pipe_scissor_state *ss = &ctx->scissor;
850 const struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
851 const struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer;
852
853 /* Derive min/max from translate/scale. Note since |x| >= 0 by
854 * definition, we have that -|x| <= |x| hence translate - |scale| <=
855 * translate + |scale|, so the ordering is correct here. */
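/* For example, a full 800x600 viewport has scale = (400, +/-300, ...) and
 * translate = (400, 300, ...), giving vp_minx = 0, vp_maxx = 800,
 * vp_miny = 0 and vp_maxy = 600. */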
856 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
857 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
858 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
859 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
860 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
861 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
862
863 /* Scissor to the intersection of viewport and to the scissor, clamped
864 * to the framebuffer */
865
866 unsigned minx = MIN2(fb->width, vp_minx);
867 unsigned maxx = MIN2(fb->width, vp_maxx);
868 unsigned miny = MIN2(fb->height, vp_miny);
869 unsigned maxy = MIN2(fb->height, vp_maxy);
870
871 if (ss && rast->scissor) {
872 minx = MAX2(ss->minx, minx);
873 miny = MAX2(ss->miny, miny);
874 maxx = MIN2(ss->maxx, maxx);
875 maxy = MIN2(ss->maxy, maxy);
876 }
877
878 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool, MALI_VIEWPORT_LENGTH);
879
880 pan_pack(T.cpu, VIEWPORT, cfg) {
881 cfg.scissor_minimum_x = minx;
882 cfg.scissor_minimum_y = miny;
883 cfg.scissor_maximum_x = maxx - 1;
884 cfg.scissor_maximum_y = maxy - 1;
885
886 cfg.minimum_z = rast->depth_clip_near ? minz : -INFINITY;
887 cfg.maximum_z = rast->depth_clip_far ? maxz : INFINITY;
888 }
889
890 tiler_postfix->viewport = T.gpu;
891 panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy);
892 }
893
894 static mali_ptr
895 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
896 enum pipe_shader_type st,
897 struct panfrost_constant_buffer *buf,
898 unsigned index)
899 {
900 struct pipe_constant_buffer *cb = &buf->cb[index];
901 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
902
903 if (rsrc) {
904 panfrost_batch_add_bo(batch, rsrc->bo,
905 PAN_BO_ACCESS_SHARED |
906 PAN_BO_ACCESS_READ |
907 panfrost_bo_access_for_stage(st));
908
909 /* Alignment guaranteed by
910 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
911 return rsrc->bo->gpu + cb->buffer_offset;
912 } else if (cb->user_buffer) {
913 return panfrost_pool_upload(&batch->pool,
914 cb->user_buffer +
915 cb->buffer_offset,
916 cb->buffer_size);
917 } else {
918 unreachable("No constant buffer");
919 }
920 }
921
922 struct sysval_uniform {
923 union {
924 float f[4];
925 int32_t i[4];
926 uint32_t u[4];
927 uint64_t du[2];
928 };
929 };
930
931 static void
932 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
933 struct sysval_uniform *uniform)
934 {
935 struct panfrost_context *ctx = batch->ctx;
936 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
937
938 uniform->f[0] = vp->scale[0];
939 uniform->f[1] = vp->scale[1];
940 uniform->f[2] = vp->scale[2];
941 }
942
943 static void
944 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
945 struct sysval_uniform *uniform)
946 {
947 struct panfrost_context *ctx = batch->ctx;
948 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
949
950 uniform->f[0] = vp->translate[0];
951 uniform->f[1] = vp->translate[1];
952 uniform->f[2] = vp->translate[2];
953 }
954
955 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
956 enum pipe_shader_type st,
957 unsigned int sysvalid,
958 struct sysval_uniform *uniform)
959 {
960 struct panfrost_context *ctx = batch->ctx;
961 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
962 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
963 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
964 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
965
966 assert(dim);
967 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
968
969 if (dim > 1)
970 uniform->i[1] = u_minify(tex->texture->height0,
971 tex->u.tex.first_level);
972
973 if (dim > 2)
974 uniform->i[2] = u_minify(tex->texture->depth0,
975 tex->u.tex.first_level);
976
977 if (is_array)
978 uniform->i[dim] = tex->texture->array_size;
979 }
980
981 static void
982 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
983 enum pipe_shader_type st,
984 unsigned ssbo_id,
985 struct sysval_uniform *uniform)
986 {
987 struct panfrost_context *ctx = batch->ctx;
988
989 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
990 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
991
992 /* Compute address */
993 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
994
995 panfrost_batch_add_bo(batch, bo,
996 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
997 panfrost_bo_access_for_stage(st));
998
999 /* Upload address and size as sysval */
1000 uniform->du[0] = bo->gpu + sb.buffer_offset;
1001 uniform->u[2] = sb.buffer_size;
1002 }
1003
1004 static void
1005 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1006 enum pipe_shader_type st,
1007 unsigned samp_idx,
1008 struct sysval_uniform *uniform)
1009 {
1010 struct panfrost_context *ctx = batch->ctx;
1011 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1012
1013 uniform->f[0] = sampl->min_lod;
1014 uniform->f[1] = sampl->max_lod;
1015 uniform->f[2] = sampl->lod_bias;
1016
1017 /* Even without any errata, Midgard represents "no mipmapping" as
1018 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1019 * panfrost_create_sampler_state which also explains our choice of
1020 * epsilon value (again to keep behaviour consistent) */
1021
1022 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1023 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1024 }
1025
1026 static void
1027 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1028 struct sysval_uniform *uniform)
1029 {
1030 struct panfrost_context *ctx = batch->ctx;
1031
1032 uniform->u[0] = ctx->compute_grid->grid[0];
1033 uniform->u[1] = ctx->compute_grid->grid[1];
1034 uniform->u[2] = ctx->compute_grid->grid[2];
1035 }
1036
1037 static void
1038 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1039 struct panfrost_shader_state *ss,
1040 enum pipe_shader_type st)
1041 {
1042 struct sysval_uniform *uniforms = (void *)buf;
1043
1044 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1045 int sysval = ss->sysval[i];
1046
1047 switch (PAN_SYSVAL_TYPE(sysval)) {
1048 case PAN_SYSVAL_VIEWPORT_SCALE:
1049 panfrost_upload_viewport_scale_sysval(batch,
1050 &uniforms[i]);
1051 break;
1052 case PAN_SYSVAL_VIEWPORT_OFFSET:
1053 panfrost_upload_viewport_offset_sysval(batch,
1054 &uniforms[i]);
1055 break;
1056 case PAN_SYSVAL_TEXTURE_SIZE:
1057 panfrost_upload_txs_sysval(batch, st,
1058 PAN_SYSVAL_ID(sysval),
1059 &uniforms[i]);
1060 break;
1061 case PAN_SYSVAL_SSBO:
1062 panfrost_upload_ssbo_sysval(batch, st,
1063 PAN_SYSVAL_ID(sysval),
1064 &uniforms[i]);
1065 break;
1066 case PAN_SYSVAL_NUM_WORK_GROUPS:
1067 panfrost_upload_num_work_groups_sysval(batch,
1068 &uniforms[i]);
1069 break;
1070 case PAN_SYSVAL_SAMPLER:
1071 panfrost_upload_sampler_sysval(batch, st,
1072 PAN_SYSVAL_ID(sysval),
1073 &uniforms[i]);
1074 break;
1075 default:
1076 assert(0);
1077 }
1078 }
1079 }
1080
1081 static const void *
1082 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1083 unsigned index)
1084 {
1085 struct pipe_constant_buffer *cb = &buf->cb[index];
1086 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1087
1088 if (rsrc)
1089 return rsrc->bo->cpu;
1090 else if (cb->user_buffer)
1091 return cb->user_buffer;
1092 else
1093 unreachable("No constant buffer");
1094 }
1095
1096 void
1097 panfrost_emit_const_buf(struct panfrost_batch *batch,
1098 enum pipe_shader_type stage,
1099 struct mali_vertex_tiler_postfix *postfix)
1100 {
1101 struct panfrost_context *ctx = batch->ctx;
1102 struct panfrost_shader_variants *all = ctx->shader[stage];
1103
1104 if (!all)
1105 return;
1106
1107 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1108
1109 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1110
1111 /* Uniforms are implicitly UBO #0 */
1112 bool has_uniforms = buf->enabled_mask & (1 << 0);
1113
1114 /* Allocate room for the sysval and the uniforms */
1115 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1116 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1117 size_t size = sys_size + uniform_size;
1118 struct panfrost_transfer transfer = panfrost_pool_alloc(&batch->pool,
1119 size);
1120
1121 /* Upload sysvals requested by the shader */
1122 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1123
1124 /* Upload uniforms */
1125 if (has_uniforms && uniform_size) {
1126 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1127 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1128 }
1129
1130 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1131 * uploaded */
1132
1133 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1134 assert(ubo_count >= 1);
1135
1136 size_t sz = MALI_UNIFORM_BUFFER_LENGTH * ubo_count;
1137 struct panfrost_transfer ubos = panfrost_pool_alloc(&batch->pool, sz);
1138 uint64_t *ubo_ptr = (uint64_t *) ubos.cpu;
1139
1140 /* Upload uniforms as a UBO */
1141
1142 if (ss->uniform_count) {
1143 pan_pack(ubo_ptr, UNIFORM_BUFFER, cfg) {
1144 cfg.entries = ss->uniform_count;
1145 cfg.pointer = transfer.gpu;
1146 }
1147 } else {
1148 *ubo_ptr = 0;
1149 }
1150
1151 /* The rest are honest-to-goodness UBOs */
1152
1153 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1154 size_t usz = buf->cb[ubo].buffer_size;
1155 bool enabled = buf->enabled_mask & (1 << ubo);
1156 bool empty = usz == 0;
1157
1158 if (!enabled || empty) {
1159 ubo_ptr[ubo] = 0;
1160 continue;
1161 }
1162
1163 pan_pack(ubo_ptr + ubo, UNIFORM_BUFFER, cfg) {
1164 cfg.entries = DIV_ROUND_UP(usz, 16);
1165 cfg.pointer = panfrost_map_constant_buffer_gpu(batch,
1166 stage, buf, ubo);
1167 }
1168 }
1169
1170 postfix->uniforms = transfer.gpu;
1171 postfix->uniform_buffers = ubos.gpu;
1172
1173 buf->dirty_mask = 0;
1174 }
1175
1176 void
1177 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1178 const struct pipe_grid_info *info,
1179 struct midgard_payload_vertex_tiler *vtp)
1180 {
1181 struct panfrost_context *ctx = batch->ctx;
1182 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1183 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1184 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1185 128));
1186 unsigned shared_size = single_size * info->grid[0] * info->grid[1] *
1187 info->grid[2] * 4;
1188 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1189 shared_size,
1190 1);
1191
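/* The workgroup count is encoded as a sum of per-dimension ceil(log2)
 * terms, i.e. the log2 of a power-of-two upper bound on the total
 * number of workgroups launched. */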
1192 struct mali_shared_memory shared = {
1193 .shared_memory = bo->gpu,
1194 .shared_workgroup_count =
1195 util_logbase2_ceil(info->grid[0]) +
1196 util_logbase2_ceil(info->grid[1]) +
1197 util_logbase2_ceil(info->grid[2]),
1198 .shared_unk1 = 0x2,
1199 .shared_shift = util_logbase2(single_size) - 1
1200 };
1201
1202 vtp->postfix.shared_memory = panfrost_pool_upload(&batch->pool, &shared,
1203 sizeof(shared));
1204 }
1205
1206 static mali_ptr
1207 panfrost_get_tex_desc(struct panfrost_batch *batch,
1208 enum pipe_shader_type st,
1209 struct panfrost_sampler_view *view)
1210 {
1211 if (!view)
1212 return (mali_ptr) 0;
1213
1214 struct pipe_sampler_view *pview = &view->base;
1215 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1216
1217 /* Add the BO to the job so it's retained until the job is done. */
1218
1219 panfrost_batch_add_bo(batch, rsrc->bo,
1220 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1221 panfrost_bo_access_for_stage(st));
1222
1223 panfrost_batch_add_bo(batch, view->bo,
1224 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1225 panfrost_bo_access_for_stage(st));
1226
1227 return view->bo->gpu;
1228 }
1229
1230 static void
1231 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
1232 struct pipe_context *pctx)
1233 {
1234 struct panfrost_resource *rsrc = pan_resource(view->base.texture);
1235 if (view->texture_bo != rsrc->bo->gpu ||
1236 view->modifier != rsrc->modifier) {
1237 panfrost_bo_unreference(view->bo);
1238 panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
1239 }
1240 }
1241
1242 void
1243 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1244 enum pipe_shader_type stage,
1245 struct mali_vertex_tiler_postfix *postfix)
1246 {
1247 struct panfrost_context *ctx = batch->ctx;
1248 struct panfrost_device *device = pan_device(ctx->base.screen);
1249
1250 if (!ctx->sampler_view_count[stage])
1251 return;
1252
1253 if (device->quirks & IS_BIFROST) {
1254 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool,
1255 MALI_BIFROST_TEXTURE_LENGTH *
1256 ctx->sampler_view_count[stage]);
1257
1258 struct mali_bifrost_texture_packed *out =
1259 (struct mali_bifrost_texture_packed *) T.cpu;
1260
1261 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1262 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1263 struct pipe_sampler_view *pview = &view->base;
1264 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1265
1266 panfrost_update_sampler_view(view, &ctx->base);
1267 out[i] = view->bifrost_descriptor;
1268
1269 /* Add the BOs to the job so they are retained until the job is done. */
1270
1271 panfrost_batch_add_bo(batch, rsrc->bo,
1272 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1273 panfrost_bo_access_for_stage(stage));
1274
1275 panfrost_batch_add_bo(batch, view->bo,
1276 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1277 panfrost_bo_access_for_stage(stage));
1278 }
1279
1280 postfix->textures = T.gpu;
1281 } else {
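/* Midgard takes an array of 64-bit pointers ("trampolines") to the
 * texture descriptors, rather than the descriptors packed inline as
 * on Bifrost above. */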
1282 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1283
1284 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1285 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1286
1287 panfrost_update_sampler_view(view, &ctx->base);
1288
1289 trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1290 }
1291
1292 postfix->textures = panfrost_pool_upload(&batch->pool,
1293 trampolines,
1294 sizeof(uint64_t) *
1295 ctx->sampler_view_count[stage]);
1296 }
1297 }
1298
1299 void
1300 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1301 enum pipe_shader_type stage,
1302 struct mali_vertex_tiler_postfix *postfix)
1303 {
1304 struct panfrost_context *ctx = batch->ctx;
1305
1306 if (!ctx->sampler_count[stage])
1307 return;
1308
1309 size_t desc_size = MALI_BIFROST_SAMPLER_LENGTH;
1310 assert(MALI_BIFROST_SAMPLER_LENGTH == MALI_MIDGARD_SAMPLER_LENGTH);
1311
1312 size_t sz = desc_size * ctx->sampler_count[stage];
1313 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool, sz);
1314 struct mali_midgard_sampler_packed *out = (struct mali_midgard_sampler_packed *) T.cpu;
1315
1316 for (unsigned i = 0; i < ctx->sampler_count[stage]; ++i)
1317 out[i] = ctx->samplers[stage][i]->hw;
1318
1319 postfix->sampler_descriptor = T.gpu;
1320 }
1321
1322 void
1323 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1324 struct mali_vertex_tiler_postfix *vertex_postfix)
1325 {
1326 struct panfrost_context *ctx = batch->ctx;
1327 struct panfrost_vertex_state *so = ctx->vertex;
1328
1329 unsigned instance_shift = vertex_postfix->instance_shift;
1330 unsigned instance_odd = vertex_postfix->instance_odd;
1331
1332 /* Worst case: everything is NPOT */
1333
1334 struct panfrost_transfer S = panfrost_pool_alloc(&batch->pool,
1335 MALI_ATTRIBUTE_LENGTH * PIPE_MAX_ATTRIBS * 2);
1336
1337 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool,
1338 MALI_ATTRIBUTE_LENGTH * (PAN_INSTANCE_ID + 1));
1339
1340 struct mali_attribute_buffer_packed *bufs =
1341 (struct mali_attribute_buffer_packed *) S.cpu;
1342
1343 struct mali_attribute_packed *out =
1344 (struct mali_attribute_packed *) T.cpu;
1345
1346 unsigned attrib_to_buffer[PIPE_MAX_ATTRIBS] = { 0 };
1347 unsigned k = 0;
1348
1349 for (unsigned i = 0; i < so->num_elements; ++i) {
1350 /* We map buffers 1:1 with the attributes, which
1351 * means duplicating some vertex buffers (who cares? aside from
1352 * maybe some caching implications but I somehow doubt that
1353 * matters) */
1354
1355 struct pipe_vertex_element *elem = &so->pipe[i];
1356 unsigned vbi = elem->vertex_buffer_index;
1357 attrib_to_buffer[i] = k;
1358
1359 if (!(ctx->vb_mask & (1 << vbi)))
1360 continue;
1361
1362 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1363 struct panfrost_resource *rsrc;
1364
1365 rsrc = pan_resource(buf->buffer.resource);
1366 if (!rsrc)
1367 continue;
1368
1369 /* Add a dependency of the batch on the vertex buffer */
1370 panfrost_batch_add_bo(batch, rsrc->bo,
1371 PAN_BO_ACCESS_SHARED |
1372 PAN_BO_ACCESS_READ |
1373 PAN_BO_ACCESS_VERTEX_TILER);
1374
1375 /* Mask off lower bits, see offset fixup below */
1376 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1377 mali_ptr addr = raw_addr & ~63;
1378
1379 /* Since we advanced the base pointer, we shrink the buffer
1380 * size, but add the offset we subtracted */
1381 unsigned size = rsrc->base.width0 + (raw_addr - addr)
1382 - buf->buffer_offset;
1383
1384 /* When there is a divisor, the hardware-level divisor is
1385 * the product of the instance divisor and the padded count */
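/* E.g. an instance divisor of 3 with a padded count of 4 yields a
 * hardware divisor of 12. */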
1386 unsigned divisor = elem->instance_divisor;
1387 unsigned hw_divisor = ctx->padded_count * divisor;
1388 unsigned stride = buf->stride;
1389
1390 /* If there's a divisor (even a divisor of 1) but no instancing, we want every
1391 * attribute to be the same */
1392
1393 if (divisor && ctx->instance_count == 1)
1394 stride = 0;
1395
1396 if (!divisor || ctx->instance_count <= 1) {
1397 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1398 if (ctx->instance_count > 1)
1399 cfg.type = MALI_ATTRIBUTE_TYPE_1D_MODULUS;
1400
1401 cfg.pointer = addr;
1402 cfg.stride = stride;
1403 cfg.size = size;
1404 cfg.divisor_r = instance_shift;
1405 cfg.divisor_p = instance_odd;
1406 }
1407 } else if (util_is_power_of_two_or_zero(hw_divisor)) {
1408 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1409 cfg.type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
1410 cfg.pointer = addr;
1411 cfg.stride = stride;
1412 cfg.size = size;
1413 cfg.divisor_r = __builtin_ctz(hw_divisor);
1414 }
1415
1416 } else {
1417 unsigned shift = 0, extra_flags = 0;
1418
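/* For non-power-of-two divisors, division is implemented as a
 * fixed-point multiply by a precomputed "magic" reciprocal plus a
 * shift and a rounding flag, in the style of division by invariant
 * integers; panfrost_compute_magic_divisor derives those constants. */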
1419 unsigned magic_divisor =
1420 panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags);
1421
1422 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1423 cfg.type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
1424 cfg.pointer = addr;
1425 cfg.stride = stride;
1426 cfg.size = size;
1427
1428 cfg.divisor_r = shift;
1429 cfg.divisor_e = extra_flags;
1430 }
1431
1432 pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) {
1433 cfg.divisor_numerator = magic_divisor;
1434 cfg.divisor = divisor;
1435 }
1436
1437 ++k;
1438 }
1439
1440 ++k;
1441 }
1442
1443 /* Add special gl_VertexID/gl_InstanceID buffers */
1444
1445 panfrost_vertex_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1446
1447 pan_pack(out + PAN_VERTEX_ID, ATTRIBUTE, cfg) {
1448 cfg.buffer_index = k++;
1449 cfg.format = so->formats[PAN_VERTEX_ID];
1450 }
1451
1452 panfrost_instance_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1453
1454 pan_pack(out + PAN_INSTANCE_ID, ATTRIBUTE, cfg) {
1455 cfg.buffer_index = k++;
1456 cfg.format = so->formats[PAN_INSTANCE_ID];
1457 }
1458
1459 /* Attribute addresses require 64-byte alignment, so let:
1460 *
1461 * base' = base & ~63 = base - (base & 63)
1462 * offset' = offset + (base & 63)
1463 *
1464 * Since base' + offset' = base + offset, these are equivalent
1465 * addressing modes and now base is 64 aligned.
1466 */
1467
1468 unsigned start = vertex_postfix->offset_start;
1469
1470 for (unsigned i = 0; i < so->num_elements; ++i) {
1471 unsigned vbi = so->pipe[i].vertex_buffer_index;
1472 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1473
1474 /* Adjust by the masked off bits of the offset. Make sure we
1475 * read src_offset from so->hw (which is not GPU visible)
1476 * rather than target (which is) due to caching effects */
1477
1478 unsigned src_offset = so->pipe[i].src_offset;
1479
1480 /* BOs aligned to 4k so guaranteed aligned to 64 */
1481 src_offset += (buf->buffer_offset & 63);
1482
1483 /* Also, somewhat obscurely, per-instance data needs to be
1484 * offset in response to a delayed start in an indexed draw */
1485
1486 if (so->pipe[i].instance_divisor && ctx->instance_count > 1 && start)
1487 src_offset -= buf->stride * start;
1488
1489 pan_pack(out + i, ATTRIBUTE, cfg) {
1490 cfg.buffer_index = attrib_to_buffer[i];
1491 cfg.format = so->formats[i];
1492 cfg.offset = src_offset;
1493 }
1494 }
1495
1496 vertex_postfix->attributes = S.gpu;
1497 vertex_postfix->attribute_meta = T.gpu;
1498 }
1499
1500 static mali_ptr
1501 panfrost_emit_varyings(struct panfrost_batch *batch,
1502 struct mali_attribute_buffer_packed *slot,
1503 unsigned stride, unsigned count)
1504 {
1505 unsigned size = stride * count;
1506 mali_ptr ptr = panfrost_pool_alloc(&batch->pool, size).gpu;
1507
1508 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1509 cfg.stride = stride;
1510 cfg.size = size;
1511 cfg.pointer = ptr;
1512 }
1513
1514 return ptr;
1515 }
1516
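/* Streamout addresses get aligned down to 64 bytes when emitted; this
 * helper returns the low bits that were masked off so they can be
 * re-applied as a record offset. */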
1517 static unsigned
1518 panfrost_streamout_offset(unsigned stride, unsigned offset,
1519 struct pipe_stream_output_target *target)
1520 {
1521 return (target->buffer_offset + (offset * stride * 4)) & 63;
1522 }
1523
1524 static void
1525 panfrost_emit_streamout(struct panfrost_batch *batch,
1526 struct mali_attribute_buffer_packed *slot,
1527 unsigned stride_words, unsigned offset, unsigned count,
1528 struct pipe_stream_output_target *target)
1529 {
1530 unsigned stride = stride_words * 4;
1531 unsigned max_size = target->buffer_size;
1532 unsigned expected_size = stride * count;
1533
1534 /* Grab the BO and bind it to the batch */
1535 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1536
1537 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1538 * the perspective of the TILER and FRAGMENT.
1539 */
1540 panfrost_batch_add_bo(batch, bo,
1541 PAN_BO_ACCESS_SHARED |
1542 PAN_BO_ACCESS_RW |
1543 PAN_BO_ACCESS_VERTEX_TILER |
1544 PAN_BO_ACCESS_FRAGMENT);
1545
1546 /* We will have an offset applied to get alignment */
1547 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * stride);
1548
1549 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1550 cfg.pointer = (addr & ~63);
1551 cfg.stride = stride;
1552 cfg.size = MIN2(max_size, expected_size) + (addr & 63);
1553 }
1554 }
1555
1556 static bool
1557 has_point_coord(unsigned mask, gl_varying_slot loc)
1558 {
1559 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1560 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1561 else if (loc == VARYING_SLOT_PNTC)
1562 return (mask & (1 << 8));
1563 else
1564 return false;
1565 }
1566
1567 /* Helpers for manipulating stream out information so we can pack varyings
1568 * accordingly. Compute the src_offset for a given captured varying */
1569
1570 static struct pipe_stream_output *
1571 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1572 {
1573 for (unsigned i = 0; i < info->num_outputs; ++i) {
1574 if (info->output[i].register_index == loc)
1575 return &info->output[i];
1576 }
1577
1578 unreachable("Varying not captured");
1579 }
1580
1581 static unsigned
1582 pan_varying_size(enum mali_format fmt)
1583 {
1584 unsigned type = MALI_EXTRACT_TYPE(fmt);
1585 unsigned chan = MALI_EXTRACT_CHANNELS(fmt);
1586 unsigned bits = MALI_EXTRACT_BITS(fmt);
1587 unsigned bpc = 0;
1588
1589 if (bits == MALI_CHANNEL_FLOAT) {
1590 /* No doubles */
1591 bool fp16 = (type == MALI_FORMAT_SINT);
1592 assert(fp16 || (type == MALI_FORMAT_UNORM));
1593
1594 bpc = fp16 ? 2 : 4;
1595 } else {
1596 assert(type >= MALI_FORMAT_SNORM && type <= MALI_FORMAT_SINT);
1597
1598 /* See the enums */
1599 bits = 1 << bits;
1600 assert(bits >= 8);
1601 bpc = bits / 8;
1602 }
1603
1604 return bpc * chan;
1605 }
1606
1607 /* Indices for named (non-XFB) varyings that are present. These are packed
1608 * tightly so they correspond to a bitfield present (P) indexed by (1 <<
1609 * PAN_VARY_*). This has the nice property that you can look up the buffer index
1610 * of a given special field given a shift S by:
1611 *
1612 * idx = popcount(P & ((1 << S) - 1))
1613 *
1614 * That is... look at all of the varyings that come earlier and count them; the
1615 * count is the new index. Likewise, the total number of special
1616 * buffers required is simply popcount(P)
1617 */
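/* For example, if only GENERAL, POSITION and PSIZ are present then
 * P = 0b111, so the PSIZ buffer index is popcount(0b111 & 0b011) = 2
 * and XFB buffers would start at index popcount(P) = 3. */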
1618
1619 enum pan_special_varying {
1620 PAN_VARY_GENERAL = 0,
1621 PAN_VARY_POSITION = 1,
1622 PAN_VARY_PSIZ = 2,
1623 PAN_VARY_PNTCOORD = 3,
1624 PAN_VARY_FACE = 4,
1625 PAN_VARY_FRAGCOORD = 5,
1626
1627 /* Keep last */
1628 PAN_VARY_MAX,
1629 };
1630
1631 /* Given a varying, figure out which index it corresponds to */
1632
1633 static inline unsigned
1634 pan_varying_index(unsigned present, enum pan_special_varying v)
1635 {
1636 unsigned mask = (1 << v) - 1;
1637 return util_bitcount(present & mask);
1638 }
1639
1640 /* Get the base offset for XFB buffers, which by convention come after
1641 * everything else. Wrapper function for semantic reasons; by construction this
1642 * is just popcount. */
1643
1644 static inline unsigned
1645 pan_xfb_base(unsigned present)
1646 {
1647 return util_bitcount(present);
1648 }
1649
1650 /* Computes the present mask for varyings so we can start emitting varying records */
1651
1652 static inline unsigned
1653 pan_varying_present(
1654 struct panfrost_shader_state *vs,
1655 struct panfrost_shader_state *fs,
1656 unsigned quirks)
1657 {
1658 /* At the moment we always emit general and position buffers. Not
1659 * strictly necessary but usually harmless */
1660
1661 unsigned present = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION);
1662
1663 /* Enable special buffers by the shader info */
1664
1665 if (vs->writes_point_size)
1666 present |= (1 << PAN_VARY_PSIZ);
1667
1668 if (fs->reads_point_coord)
1669 present |= (1 << PAN_VARY_PNTCOORD);
1670
1671 if (fs->reads_face)
1672 present |= (1 << PAN_VARY_FACE);
1673
1674 if (fs->reads_frag_coord && !(quirks & IS_BIFROST))
1675 present |= (1 << PAN_VARY_FRAGCOORD);
1676
1677 /* Also, if we have a point sprite, we need a point coord buffer */
1678
1679 for (unsigned i = 0; i < fs->varying_count; i++) {
1680 gl_varying_slot loc = fs->varyings_loc[i];
1681
1682 if (has_point_coord(fs->point_sprite_mask, loc))
1683 present |= (1 << PAN_VARY_PNTCOORD);
1684 }
1685
1686 return present;
1687 }
1688
1689 /* Emitters for varying records */
1690
1691 static void
1692 pan_emit_vary(struct mali_attribute_packed *out,
1693 unsigned present, enum pan_special_varying buf,
1694 unsigned quirks, enum mali_format format,
1695 unsigned offset)
1696 {
1697 unsigned nr_channels = MALI_EXTRACT_CHANNELS(format);
1698 unsigned swizzle = quirks & HAS_SWIZZLES ?
1699 panfrost_get_default_swizzle(nr_channels) :
1700 panfrost_bifrost_swizzle(nr_channels);
1701
1702 pan_pack(out, ATTRIBUTE, cfg) {
1703 cfg.buffer_index = pan_varying_index(present, buf);
1704 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1705 cfg.format = (format << 12) | swizzle;
1706 cfg.offset = offset;
1707 }
1708 }
1709
1710 /* General varying that is unused */
1711
1712 static void
1713 pan_emit_vary_only(struct mali_attribute_packed *out,
1714 unsigned present, unsigned quirks)
1715 {
1716 pan_emit_vary(out, present, 0, quirks, MALI_VARYING_DISCARD, 0);
1717 }
1718
1719 /* Special records */
1720
1721 static const enum mali_format pan_varying_formats[PAN_VARY_MAX] = {
1722 [PAN_VARY_POSITION] = MALI_VARYING_POS,
1723 [PAN_VARY_PSIZ] = MALI_R16F,
1724 [PAN_VARY_PNTCOORD] = MALI_R16F,
1725 [PAN_VARY_FACE] = MALI_R32I,
1726 [PAN_VARY_FRAGCOORD] = MALI_RGBA32F
1727 };
1728
1729 static void
1730 pan_emit_vary_special(struct mali_attribute_packed *out,
1731 unsigned present, enum pan_special_varying buf,
1732 unsigned quirks)
1733 {
1734 assert(buf < PAN_VARY_MAX);
1735 pan_emit_vary(out, present, buf, quirks, pan_varying_formats[buf], 0);
1736 }
1737
1738 static enum mali_format
1739 pan_xfb_format(enum mali_format format, unsigned nr)
1740 {
1741 if (MALI_EXTRACT_BITS(format) == MALI_CHANNEL_FLOAT)
1742 return MALI_R32F | MALI_NR_CHANNELS(nr);
1743 else
1744 return MALI_EXTRACT_TYPE(format) | MALI_NR_CHANNELS(nr) | MALI_CHANNEL_32;
1745 }
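
/* For instance, a varying interpolated as a 16-bit float but captured with two
 * components comes out as a two-channel 32-bit float format: whatever the
 * interpolation precision, the captured format is widened to 32-bit channels
 * with the requested component count. */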
1746
1747 /* Transform feedback records. Note struct pipe_stream_output is (if packed as
1748 * a bitfield) 32-bit, smaller than a 64-bit pointer, so we may as well pass
1749 * it by value. */
1750
1751 static void
1752 pan_emit_vary_xfb(struct mali_attribute_packed *out,
1753 unsigned present,
1754 unsigned max_xfb,
1755 unsigned *streamout_offsets,
1756 unsigned quirks,
1757 enum mali_format format,
1758 struct pipe_stream_output o)
1759 {
1760 unsigned swizzle = quirks & HAS_SWIZZLES ?
1761 panfrost_get_default_swizzle(o.num_components) :
1762 panfrost_bifrost_swizzle(o.num_components);
1763
1764 pan_pack(out, ATTRIBUTE, cfg) {
1765 /* XFB buffers come after everything else */
1766 cfg.buffer_index = pan_xfb_base(present) + o.output_buffer;
1767 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1768
1769 /* Override number of channels and precision to highp */
1770 cfg.format = (pan_xfb_format(format, o.num_components) << 12) | swizzle;
1771
1772 /* Combine the per-output offset (in dwords) with the buffer's streamout offset */
1773 cfg.offset = (o.dst_offset * 4) /* dwords */
1774 + streamout_offsets[o.output_buffer];
1775 }
1776 }
1777
1778 /* Determine if we should capture a varying for XFB. This requires actually
1779 * having a buffer for it. If we don't capture it, we'll fall back to a general
1780 * varying path (linked or unlinked, possibly discarding the write) */
1781
1782 static bool
1783 panfrost_xfb_captured(struct panfrost_shader_state *xfb,
1784 unsigned loc, unsigned max_xfb)
1785 {
1786 if (!(xfb->so_mask & (1ll << loc)))
1787 return false;
1788
1789 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1790 return o->output_buffer < max_xfb;
1791 }
1792
1793 static void
1794 pan_emit_general_varying(struct mali_attribute_packed *out,
1795 struct panfrost_shader_state *other,
1796 struct panfrost_shader_state *xfb,
1797 gl_varying_slot loc,
1798 enum mali_format format,
1799 unsigned present,
1800 unsigned quirks,
1801 unsigned *gen_offsets,
1802 enum mali_format *gen_formats,
1803 unsigned *gen_stride,
1804 unsigned idx,
1805 bool should_alloc)
1806 {
1807 /* Check if we're linked */
1808 signed other_idx = -1;
1809
1810 for (unsigned j = 0; j < other->varying_count; ++j) {
1811 if (other->varyings_loc[j] == loc) {
1812 other_idx = j;
1813 break;
1814 }
1815 }
1816
1817 if (other_idx < 0) {
1818 pan_emit_vary_only(out, present, quirks);
1819 return;
1820 }
1821
1822 unsigned offset = gen_offsets[other_idx];
1823
1824 if (should_alloc) {
1825 /* We're linked, so allocate space via a watermark allocation */
1826 enum mali_format alt = other->varyings[other_idx];
1827
1828 /* Do interpolation at minimum precision */
1829 unsigned size_main = pan_varying_size(format);
1830 unsigned size_alt = pan_varying_size(alt);
1831 unsigned size = MIN2(size_main, size_alt);
1832
1833 /* If a varying is marked for XFB but not actually captured, we
1834 * should match the format to the format that would otherwise
1835 * be used for XFB, since dEQP checks for invariance here. It's
1836 * unclear if this is required by the spec. */
1837
1838 if (xfb->so_mask & (1ull << loc)) {
1839 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1840 format = pan_xfb_format(format, o->num_components);
1841 size = pan_varying_size(format);
1842 } else if (size == size_alt) {
1843 format = alt;
1844 }
1845
1846 gen_offsets[idx] = *gen_stride;
1847 gen_formats[other_idx] = format;
1848 offset = *gen_stride;
1849 *gen_stride += size;
1850 }
1851
1852 pan_emit_vary(out, present, PAN_VARY_GENERAL, quirks, format, offset);
1853 }
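
/* Note that *gen_stride acts as the watermark: each newly linked varying is
 * placed at the current value and bumps it by its (minimum) size, so once all
 * varyings have been emitted, *gen_stride is the per-vertex stride of the
 * general varying buffer. */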
1854
1855 /* Higher-level wrapper around the emitters above, classifying a varying as
1856 * special, captured for XFB, or general-purpose */
1857
1858 static void
1859 panfrost_emit_varying(
1860 struct mali_attribute_packed *out,
1861 struct panfrost_shader_state *stage,
1862 struct panfrost_shader_state *other,
1863 struct panfrost_shader_state *xfb,
1864 unsigned present,
1865 unsigned max_xfb,
1866 unsigned *streamout_offsets,
1867 unsigned quirks,
1868 unsigned *gen_offsets,
1869 enum mali_format *gen_formats,
1870 unsigned *gen_stride,
1871 unsigned idx,
1872 bool should_alloc,
1873 bool is_fragment)
1874 {
1875 gl_varying_slot loc = stage->varyings_loc[idx];
1876 enum mali_format format = stage->varyings[idx];
1877
1878 /* Override format to match linkage */
1879 if (!should_alloc && gen_formats[idx])
1880 format = gen_formats[idx];
1881
1882 if (has_point_coord(stage->point_sprite_mask, loc)) {
1883 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1884 } else if (panfrost_xfb_captured(xfb, loc, max_xfb)) {
1885 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1886 pan_emit_vary_xfb(out, present, max_xfb, streamout_offsets, quirks, format, *o);
1887 } else if (loc == VARYING_SLOT_POS) {
1888 if (is_fragment)
1889 pan_emit_vary_special(out, present, PAN_VARY_FRAGCOORD, quirks);
1890 else
1891 pan_emit_vary_special(out, present, PAN_VARY_POSITION, quirks);
1892 } else if (loc == VARYING_SLOT_PSIZ) {
1893 pan_emit_vary_special(out, present, PAN_VARY_PSIZ, quirks);
1894 } else if (loc == VARYING_SLOT_PNTC) {
1895 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1896 } else if (loc == VARYING_SLOT_FACE) {
1897 pan_emit_vary_special(out, present, PAN_VARY_FACE, quirks);
1898 } else {
1899 pan_emit_general_varying(out, other, xfb, loc, format, present,
1900 quirks, gen_offsets, gen_formats, gen_stride,
1901 idx, should_alloc);
1902 }
1903 }
1904
1905 static void
1906 pan_emit_special_input(struct mali_attribute_buffer_packed *out,
1907 unsigned present,
1908 enum pan_special_varying v,
1909 unsigned special)
1910 {
1911 if (present & (1 << v)) {
1912 unsigned idx = pan_varying_index(present, v);
1913
1914 pan_pack(out + idx, ATTRIBUTE_BUFFER, cfg) {
1915 cfg.special = special;
1916 cfg.type = 0;
1917 }
1918 }
1919 }
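
/* Special inputs (point coord, front facing, frag coord) have no backing
 * memory of their own; the attribute buffer record only carries the
 * MALI_ATTRIBUTE_SPECIAL_* selector. */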
1920
1921 void
1922 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1923 unsigned vertex_count,
1924 struct mali_vertex_tiler_postfix *vertex_postfix,
1925 struct mali_vertex_tiler_postfix *tiler_postfix,
1926 union midgard_primitive_size *primitive_size)
1927 {
1928 /* Load the shaders */
1929 struct panfrost_context *ctx = batch->ctx;
1930 struct panfrost_device *dev = pan_device(ctx->base.screen);
1931 struct panfrost_shader_state *vs, *fs;
1932 size_t vs_size, fs_size;
1933
1934 /* Allocate the varying descriptor */
1935
1936 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1937 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1938 vs_size = MALI_ATTRIBUTE_LENGTH * vs->varying_count;
1939 fs_size = MALI_ATTRIBUTE_LENGTH * fs->varying_count;
1940
1941 struct panfrost_transfer trans = panfrost_pool_alloc(&batch->pool,
1942 vs_size +
1943 fs_size);
1944
1945 struct pipe_stream_output_info *so = &vs->stream_output;
1946 unsigned present = pan_varying_present(vs, fs, dev->quirks);
1947
1948 /* Check if this varying is linked by us. This is the case for
1949 * general-purpose, non-captured varyings. If it is, link it. If it's
1950 * not, use the provided stream out information to determine the
1951 * offset, since it was already linked for us. */
1952
1953 unsigned gen_offsets[32];
1954 enum mali_format gen_formats[32];
1955 memset(gen_offsets, 0, sizeof(gen_offsets));
1956 memset(gen_formats, 0, sizeof(gen_formats));
1957
1958 unsigned gen_stride = 0;
1959 assert(vs->varying_count < ARRAY_SIZE(gen_offsets));
1960 assert(fs->varying_count < ARRAY_SIZE(gen_offsets));
1961
1962 unsigned streamout_offsets[32];
1963
1964 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1965 streamout_offsets[i] = panfrost_streamout_offset(
1966 so->stride[i],
1967 ctx->streamout.offsets[i],
1968 ctx->streamout.targets[i]);
1969 }
1970
1971 struct mali_attribute_packed *ovs = (struct mali_attribute_packed *)trans.cpu;
1972 struct mali_attribute_packed *ofs = ovs + vs->varying_count;
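
/* The vertex shader's varying records come first, immediately followed by the
 * fragment shader's; the varying_meta pointers at the end of this function
 * point each stage at its own half. */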
1973
1974 for (unsigned i = 0; i < vs->varying_count; i++) {
1975 panfrost_emit_varying(ovs + i, vs, fs, vs, present,
1976 ctx->streamout.num_targets, streamout_offsets,
1977 dev->quirks,
1978 gen_offsets, gen_formats, &gen_stride, i, true, false);
1979 }
1980
1981 for (unsigned i = 0; i < fs->varying_count; i++) {
1982 panfrost_emit_varying(ofs + i, fs, vs, vs, present,
1983 ctx->streamout.num_targets, streamout_offsets,
1984 dev->quirks,
1985 gen_offsets, gen_formats, &gen_stride, i, false, true);
1986 }
1987
1988 unsigned xfb_base = pan_xfb_base(present);
1989 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool,
1990 MALI_ATTRIBUTE_BUFFER_LENGTH * (xfb_base + ctx->streamout.num_targets));
1991 struct mali_attribute_buffer_packed *varyings =
1992 (struct mali_attribute_buffer_packed *) T.cpu;
1993
1994 /* Emit the stream out buffers */
1995
1996 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1997 ctx->vertex_count);
1998
1999 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
2000 panfrost_emit_streamout(batch, &varyings[xfb_base + i],
2001 so->stride[i],
2002 ctx->streamout.offsets[i],
2003 out_count,
2004 ctx->streamout.targets[i]);
2005 }
2006
2007 panfrost_emit_varyings(batch,
2008 &varyings[pan_varying_index(present, PAN_VARY_GENERAL)],
2009 gen_stride, vertex_count);
2010
2011 /* fp32 vec4 gl_Position */
2012 tiler_postfix->position_varying = panfrost_emit_varyings(batch,
2013 &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
2014 sizeof(float) * 4, vertex_count);
2015
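/* Point size, when present, is emitted at 2 bytes per vertex, matching
 * MALI_R16F in pan_varying_formats (i.e. a half-float per vertex). */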
2016 if (present & (1 << PAN_VARY_PSIZ)) {
2017 primitive_size->pointer = panfrost_emit_varyings(batch,
2018 &varyings[pan_varying_index(present, PAN_VARY_PSIZ)],
2019 2, vertex_count);
2020 }
2021
2022 pan_emit_special_input(varyings, present, PAN_VARY_PNTCOORD, MALI_ATTRIBUTE_SPECIAL_POINT_COORD);
2023 pan_emit_special_input(varyings, present, PAN_VARY_FACE, MALI_ATTRIBUTE_SPECIAL_FRONT_FACING);
2024 pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD, MALI_ATTRIBUTE_SPECIAL_FRAG_COORD);
2025
2026 vertex_postfix->varyings = T.gpu;
2027 tiler_postfix->varyings = T.gpu;
2028
2029 vertex_postfix->varying_meta = trans.gpu;
2030 tiler_postfix->varying_meta = trans.gpu + vs_size;
2031 }
2032
2033 void
2034 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
2035 struct mali_vertex_tiler_prefix *vertex_prefix,
2036 struct mali_vertex_tiler_postfix *vertex_postfix,
2037 struct mali_vertex_tiler_prefix *tiler_prefix,
2038 struct mali_vertex_tiler_postfix *tiler_postfix,
2039 union midgard_primitive_size *primitive_size)
2040 {
2041 struct panfrost_context *ctx = batch->ctx;
2042 struct panfrost_device *device = pan_device(ctx->base.screen);
2043 bool wallpapering = ctx->wallpaper_batch && batch->scoreboard.tiler_dep;
2044 struct bifrost_payload_vertex bifrost_vertex = {0,};
2045 struct bifrost_payload_tiler bifrost_tiler = {0,};
2046 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
2047 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
2048 void *vp, *tp;
2049 size_t vp_size, tp_size;
2050
2051 if (device->quirks & IS_BIFROST) {
2052 bifrost_vertex.prefix = *vertex_prefix;
2053 bifrost_vertex.postfix = *vertex_postfix;
2054 vp = &bifrost_vertex;
2055 vp_size = sizeof(bifrost_vertex);
2056
2057 bifrost_tiler.prefix = *tiler_prefix;
2058 bifrost_tiler.tiler.primitive_size = *primitive_size;
2059 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
2060 bifrost_tiler.postfix = *tiler_postfix;
2061 tp = &bifrost_tiler;
2062 tp_size = sizeof(bifrost_tiler);
2063 } else {
2064 midgard_vertex.prefix = *vertex_prefix;
2065 midgard_vertex.postfix = *vertex_postfix;
2066 vp = &midgard_vertex;
2067 vp_size = sizeof(midgard_vertex);
2068
2069 midgard_tiler.prefix = *tiler_prefix;
2070 midgard_tiler.postfix = *tiler_postfix;
2071 midgard_tiler.primitive_size = *primitive_size;
2072 tp = &midgard_tiler;
2073 tp_size = sizeof(midgard_tiler);
2074 }
2075
2076 if (wallpapering) {
2077 /* Inject in reverse order, with "predicted" job indices.
2078 * THIS IS A HACK XXX */
2079 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false,
2080 batch->scoreboard.job_index + 2, tp, tp_size, true);
2081 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2082 vp, vp_size, true);
2083 return;
2084 }
2085
2086 /* If rasterizer discard is enabled, only submit the vertex job */
2087
2088 unsigned vertex = panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2089 vp, vp_size, false);
2090
2091 if (ctx->rasterizer->base.rasterizer_discard)
2092 return;
2093
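/* The tiler job takes the vertex job's index as its dependency, so tiling for
 * this draw cannot start until vertex shading has completed. */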
2094 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false, vertex, tp, tp_size,
2095 false);
2096 }
2097
2098 /* TODO: stop hardcoding this */
2099 mali_ptr
2100 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2101 {
2102 uint16_t locations[] = {
2103 128, 128,
2104 0, 256,
2105 0, 256,
2106 0, 256,
2107 0, 256,
2108 0, 256,
2109 0, 256,
2110 0, 256,
2111 0, 256,
2112 0, 256,
2113 0, 256,
2114 0, 256,
2115 0, 256,
2116 0, 256,
2117 0, 256,
2118 0, 256,
2119 0, 256,
2120 0, 256,
2121 0, 256,
2122 0, 256,
2123 0, 256,
2124 0, 256,
2125 0, 256,
2126 0, 256,
2127 0, 256,
2128 0, 256,
2129 0, 256,
2130 0, 256,
2131 0, 256,
2132 0, 256,
2133 0, 256,
2134 0, 256,
2135 128, 128,
2136 0, 0,
2137 0, 0,
2138 0, 0,
2139 0, 0,
2140 0, 0,
2141 0, 0,
2142 0, 0,
2143 0, 0,
2144 0, 0,
2145 0, 0,
2146 0, 0,
2147 0, 0,
2148 0, 0,
2149 0, 0,
2150 0, 0,
2151 };
2152
2153 return panfrost_pool_upload(&batch->pool, locations, 96 * sizeof(uint16_t));
2154 }