panfrost: Drop implicit blend pooling
[mesa.git] / src/gallium/drivers/panfrost/pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_pool.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 struct mali_shared_memory shared = {
62 .shared_workgroup_count = ~0,
63 };
64
65 if (batch->stack_size) {
66 struct panfrost_bo *stack =
67 panfrost_batch_get_scratchpad(batch, batch->stack_size,
68 dev->thread_tls_alloc,
69 dev->core_count);
70
71 shared.stack_shift = panfrost_get_stack_shift(batch->stack_size);
72 shared.scratchpad = stack->gpu;
73 }
74
75 postfix->shared_memory = panfrost_pool_upload(&batch->pool, &shared, sizeof(shared));
76 }
77
78 static void
79 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
80 struct mali_vertex_tiler_postfix *postfix)
81 {
82 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
83 postfix->shared_memory = panfrost_batch_reserve_framebuffer(batch);
84 }
85
86 static void
87 panfrost_vt_update_rasterizer(struct panfrost_rasterizer *rasterizer,
88 struct mali_vertex_tiler_prefix *prefix,
89 struct mali_vertex_tiler_postfix *postfix)
90 {
91 postfix->gl_enables |= 0x7;
92 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
93 rasterizer->base.front_ccw);
94 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
95 (rasterizer->base.cull_face & PIPE_FACE_FRONT));
96 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
97 (rasterizer->base.cull_face & PIPE_FACE_BACK));
98 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
99 rasterizer->base.flatshade_first);
100 }
101
102 void
103 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
104 struct mali_vertex_tiler_prefix *prefix,
105 union midgard_primitive_size *primitive_size)
106 {
107 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
108
109 if (!panfrost_writes_point_size(ctx)) {
110 float val = (prefix->draw_mode == MALI_DRAW_MODE_POINTS) ?
111 rasterizer->base.point_size :
112 rasterizer->base.line_width;
113
114 primitive_size->constant = val;
115 }
116 }
117
118 static void
119 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
120 struct mali_vertex_tiler_postfix *postfix)
121 {
122 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
123 if (ctx->occlusion_query) {
124 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
125 panfrost_batch_add_bo(ctx->batch, ctx->occlusion_query->bo,
126 PAN_BO_ACCESS_SHARED |
127 PAN_BO_ACCESS_RW |
128 PAN_BO_ACCESS_FRAGMENT);
129 } else {
130 postfix->occlusion_counter = 0;
131 }
132 }
133
134 void
135 panfrost_vt_init(struct panfrost_context *ctx,
136 enum pipe_shader_type stage,
137 struct mali_vertex_tiler_prefix *prefix,
138 struct mali_vertex_tiler_postfix *postfix)
139 {
140 struct panfrost_device *device = pan_device(ctx->base.screen);
141
142 if (!ctx->shader[stage])
143 return;
144
145 memset(prefix, 0, sizeof(*prefix));
146 memset(postfix, 0, sizeof(*postfix));
147
148 if (device->quirks & IS_BIFROST) {
149 postfix->gl_enables = 0x2;
150 panfrost_vt_emit_shared_memory(ctx, postfix);
151 } else {
152 postfix->gl_enables = 0x6;
153 panfrost_vt_attach_framebuffer(ctx, postfix);
154 }
155
156 if (stage == PIPE_SHADER_FRAGMENT) {
157 panfrost_vt_update_occlusion_query(ctx, postfix);
158 panfrost_vt_update_rasterizer(ctx->rasterizer, prefix, postfix);
159 }
160 }
161
162 static unsigned
163 panfrost_translate_index_size(unsigned size)
164 {
165 switch (size) {
166 case 1:
167 return MALI_DRAW_INDEXED_UINT8;
168
169 case 2:
170 return MALI_DRAW_INDEXED_UINT16;
171
172 case 4:
173 return MALI_DRAW_INDEXED_UINT32;
174
175 default:
176 unreachable("Invalid index size");
177 }
178 }
179
180 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
181 * good for the duration of the draw (transient), could last longer. Also get
182 * the bounds on the index buffer for the range accessed by the draw. We do
183 * these operations together because there are natural optimizations which
184 * require them to be together. */
185
186 static mali_ptr
187 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
188 const struct pipe_draw_info *info,
189 unsigned *min_index, unsigned *max_index)
190 {
191 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
192 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
193 off_t offset = info->start * info->index_size;
194 bool needs_indices = true;
195 mali_ptr out = 0;
196
197 if (info->max_index != ~0u) {
198 *min_index = info->min_index;
199 *max_index = info->max_index;
200 needs_indices = false;
201 }
202
203 if (!info->has_user_indices) {
204 /* Only resources can be directly mapped */
205 panfrost_batch_add_bo(batch, rsrc->bo,
206 PAN_BO_ACCESS_SHARED |
207 PAN_BO_ACCESS_READ |
208 PAN_BO_ACCESS_VERTEX_TILER);
209 out = rsrc->bo->gpu + offset;
210
211 /* Check the cache */
212 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
213 info->start,
214 info->count,
215 min_index,
216 max_index);
217 } else {
218 /* Otherwise, we need to upload to transient memory */
219 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
220 out = panfrost_pool_upload(&batch->pool, ibuf8 + offset,
221 info->count *
222 info->index_size);
223 }
224
225 if (needs_indices) {
226 /* Fallback */
227 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
228
229 if (!info->has_user_indices)
230 panfrost_minmax_cache_add(rsrc->index_cache,
231 info->start, info->count,
232 *min_index, *max_index);
233 }
234
235 return out;
236 }
237
238 void
239 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
240 const struct pipe_draw_info *info,
241 enum mali_draw_mode draw_mode,
242 struct mali_vertex_tiler_postfix *vertex_postfix,
243 struct mali_vertex_tiler_prefix *tiler_prefix,
244 struct mali_vertex_tiler_postfix *tiler_postfix,
245 unsigned *vertex_count,
246 unsigned *padded_count)
247 {
248 tiler_prefix->draw_mode = draw_mode;
249
250 unsigned draw_flags = 0;
251
252 if (panfrost_writes_point_size(ctx))
253 draw_flags |= MALI_DRAW_VARYING_SIZE;
254
255 if (info->primitive_restart)
256 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
257
258 /* These don't make much sense */
259
260 draw_flags |= 0x3000;
261
262 if (info->index_size) {
263 unsigned min_index = 0, max_index = 0;
264
265 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
266 info,
267 &min_index,
268 &max_index);
269
270 /* Use the corresponding values */
271 *vertex_count = max_index - min_index + 1;
272 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
273 tiler_prefix->offset_bias_correction = -min_index;
274 tiler_prefix->index_count = MALI_POSITIVE(info->count);
275 draw_flags |= panfrost_translate_index_size(info->index_size);
276 } else {
277 tiler_prefix->indices = 0;
278 *vertex_count = ctx->vertex_count;
279 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
280 tiler_prefix->offset_bias_correction = 0;
281 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
282 }
283
284 tiler_prefix->unknown_draw = draw_flags;
285
286 /* Encode the padded vertex count */
287
288 if (info->instance_count > 1) {
289 *padded_count = panfrost_padded_vertex_count(*vertex_count);
290
291 unsigned shift = __builtin_ctz(ctx->padded_count);
292 unsigned k = ctx->padded_count >> (shift + 1);
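/* padded_count decomposes as (2k + 1) << shift: shift is the number of
 * trailing zero bits and 2k + 1 the remaining odd factor. E.g. a padded
 * count of 12 gives shift = 2 and k = 1, since 12 = (2*1 + 1) << 2; the
 * instance_shift/instance_odd fields below carry the divisor in this form. */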
293
294 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
295 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
296 } else {
297 *padded_count = *vertex_count;
298
299 /* Reset instancing state */
300 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
301 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
302 }
303 }
304
305 static void
306 panfrost_shader_meta_init(struct panfrost_context *ctx,
307 enum pipe_shader_type st,
308 struct mali_shader_meta *meta)
309 {
310 const struct panfrost_device *dev = pan_device(ctx->base.screen);
311 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
312
313 memset(meta, 0, sizeof(*meta));
314 meta->shader = (ss->bo ? ss->bo->gpu : 0) | ss->first_tag;
315 meta->attribute_count = ss->attribute_count;
316 meta->varying_count = ss->varying_count;
317 meta->texture_count = ctx->sampler_view_count[st];
318 meta->sampler_count = ctx->sampler_count[st];
319
320 if (dev->quirks & IS_BIFROST) {
321 if (st == PIPE_SHADER_VERTEX)
322 meta->bifrost1.unk1 = 0x800000;
323 else {
324 /* First clause ATEST |= 0x4000000.
325 * Less than 32 regs |= 0x200 */
326 meta->bifrost1.unk1 = 0x950020;
327 }
328
329 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
330 if (st == PIPE_SHADER_VERTEX)
331 meta->bifrost2.preload_regs = 0xC0;
332 else {
333 meta->bifrost2.preload_regs = 0x1;
334 SET_BIT(meta->bifrost2.preload_regs, 0x10, ss->reads_frag_coord);
335 }
336
337 meta->bifrost2.uniform_count = MIN2(ss->uniform_count,
338 ss->uniform_cutoff);
339 } else {
340 meta->midgard1.uniform_count = MIN2(ss->uniform_count,
341 ss->uniform_cutoff);
342 meta->midgard1.work_count = ss->work_reg_count;
343
344 /* TODO: This is not conformant on ES3 */
345 meta->midgard1.flags_hi = MALI_SUPPRESS_INF_NAN;
346
347 meta->midgard1.flags_lo = 0x20;
348 meta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
349
350 SET_BIT(meta->midgard1.flags_lo, MALI_WRITES_GLOBAL, ss->writes_global);
351 }
352 }
353
354 static unsigned
355 translate_tex_wrap(enum pipe_tex_wrap w)
356 {
357 switch (w) {
358 case PIPE_TEX_WRAP_REPEAT: return MALI_WRAP_MODE_REPEAT;
359 case PIPE_TEX_WRAP_CLAMP: return MALI_WRAP_MODE_CLAMP;
360 case PIPE_TEX_WRAP_CLAMP_TO_EDGE: return MALI_WRAP_MODE_CLAMP_TO_EDGE;
361 case PIPE_TEX_WRAP_CLAMP_TO_BORDER: return MALI_WRAP_MODE_CLAMP_TO_BORDER;
362 case PIPE_TEX_WRAP_MIRROR_REPEAT: return MALI_WRAP_MODE_MIRRORED_REPEAT;
363 case PIPE_TEX_WRAP_MIRROR_CLAMP: return MALI_WRAP_MODE_MIRRORED_CLAMP;
364 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE;
365 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER;
366 default: unreachable("Invalid wrap");
367 }
368 }
369
370 /* The hardware compares in the wrong order, so we have to flip before
371 * encoding. Yes, really. */
372
373 static enum mali_func
374 panfrost_sampler_compare_func(const struct pipe_sampler_state *cso)
375 {
376 if (!cso->compare_mode)
377 return MALI_FUNC_NEVER;
378
379 enum mali_func f = panfrost_translate_compare_func(cso->compare_func);
380 return panfrost_flip_compare_func(f);
381 }
382
383 static enum mali_mipmap_mode
384 pan_pipe_to_mipmode(enum pipe_tex_mipfilter f)
385 {
386 switch (f) {
387 case PIPE_TEX_MIPFILTER_NEAREST: return MALI_MIPMAP_MODE_NEAREST;
388 case PIPE_TEX_MIPFILTER_LINEAR: return MALI_MIPMAP_MODE_TRILINEAR;
389 case PIPE_TEX_MIPFILTER_NONE: return MALI_MIPMAP_MODE_NONE;
390 default: unreachable("Invalid");
391 }
392 }
393
394 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
395 struct mali_midgard_sampler_packed *hw)
396 {
397 pan_pack(hw, MIDGARD_SAMPLER, cfg) {
398 cfg.magnify_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
399 cfg.minify_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
400 cfg.mipmap_mode = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR) ?
401 MALI_MIPMAP_MODE_TRILINEAR : MALI_MIPMAP_MODE_NEAREST;
402 cfg.normalized_coordinates = cso->normalized_coords;
403
404 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
405
406 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
407
408 /* If necessary, we disable mipmapping in the sampler descriptor by
409 * clamping the LOD as tight as possible (from 0 to epsilon,
410 * essentially -- remember these are fixed point numbers, so
411 * epsilon=1/256) */
412
413 cfg.maximum_lod = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) ?
414 cfg.minimum_lod + 1 :
415 FIXED_16(cso->max_lod, false);
416
417 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
418 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
419 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
420
421 cfg.compare_function = panfrost_sampler_compare_func(cso);
422 cfg.seamless_cube_map = cso->seamless_cube_map;
423
424 cfg.border_color_r = cso->border_color.f[0];
425 cfg.border_color_g = cso->border_color.f[1];
426 cfg.border_color_b = cso->border_color.f[2];
427 cfg.border_color_a = cso->border_color.f[3];
428 }
429 }
430
431 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
432 struct mali_bifrost_sampler_packed *hw)
433 {
434 pan_pack(hw, BIFROST_SAMPLER, cfg) {
435 cfg.magnify_linear = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR;
436 cfg.minify_linear = cso->min_img_filter == PIPE_TEX_FILTER_LINEAR;
437 cfg.mipmap_mode = pan_pipe_to_mipmode(cso->min_mip_filter);
438 cfg.normalized_coordinates = cso->normalized_coords;
439
440 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
441 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
442 cfg.maximum_lod = FIXED_16(cso->max_lod, false);
443
444 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
445 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
446 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
447
448 cfg.compare_function = panfrost_sampler_compare_func(cso);
449 cfg.seamless_cube_map = cso->seamless_cube_map;
450 }
451 }
452
453 static void
454 panfrost_frag_meta_rasterizer_update(struct panfrost_context *ctx,
455 struct mali_shader_meta *fragmeta)
456 {
457 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
458
459 bool msaa = rast->multisample;
460
461 /* TODO: Sample size */
462 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
463 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
464
465 struct panfrost_shader_state *fs;
466 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
467
468 /* EXT_shader_framebuffer_fetch requires the shader to be run
469 * per-sample when outputs are read. */
470 bool per_sample = ctx->min_samples > 1 || fs->outputs_read;
471 SET_BIT(fragmeta->unknown2_3, MALI_PER_SAMPLE, msaa && per_sample);
472
473 fragmeta->depth_units = rast->offset_units * 2.0f;
474 fragmeta->depth_factor = rast->offset_scale;
475
476 /* XXX: Which bit is which? Does this maybe allow offsetting not-tri? */
477
478 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, rast->offset_tri);
479 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, rast->offset_tri);
480
481 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_NEAR, rast->depth_clip_near);
482 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_FAR, rast->depth_clip_far);
483 }
484
485 static void
486 panfrost_frag_meta_zsa_update(struct panfrost_context *ctx,
487 struct mali_shader_meta *fragmeta)
488 {
489 const struct panfrost_zsa_state *so = ctx->depth_stencil;
490
491 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
492 so->base.stencil[0].enabled);
493
494 fragmeta->stencil_mask_front = so->stencil_mask_front;
495 fragmeta->stencil_mask_back = so->stencil_mask_back;
496
497 /* Bottom bits for stencil ref, exactly one word */
498 fragmeta->stencil_front.opaque[0] = so->stencil_front.opaque[0] | ctx->stencil_ref.ref_value[0];
499
500 /* If back-stencil is not enabled, use the front values */
501
502 if (so->base.stencil[1].enabled)
503 fragmeta->stencil_back.opaque[0] = so->stencil_back.opaque[0] | ctx->stencil_ref.ref_value[1];
504 else
505 fragmeta->stencil_back = fragmeta->stencil_front;
506
507 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
508 so->base.depth.writemask);
509
510 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
511 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(
512 so->base.depth.enabled ? so->base.depth.func : PIPE_FUNC_ALWAYS));
513 }
514
515 static bool
516 panfrost_fs_required(
517 struct panfrost_shader_state *fs,
518 struct panfrost_blend_final *blend,
519 unsigned rt_count)
520 {
521 /* If we generally have side effects */
522 if (fs->fs_sidefx)
523 return true;
524
525 /* If colour is written we need to execute */
526 for (unsigned i = 0; i < rt_count; ++i) {
527 if (!blend[i].no_colour)
528 return true;
529 }
530
531 /* If depth is written and not implied we need to execute.
532 * TODO: Predicate on Z/S writes being enabled */
533 return (fs->writes_depth || fs->writes_stencil);
534 }
535
536 static void
537 panfrost_frag_meta_blend_update(struct panfrost_context *ctx,
538 struct mali_shader_meta *fragmeta,
539 void *rts)
540 {
541 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
542 const struct panfrost_device *dev = pan_device(ctx->base.screen);
543 struct panfrost_shader_state *fs;
544 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
545
546 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
547 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
548 !ctx->blend->base.dither);
549
550 SET_BIT(fragmeta->unknown2_4, MALI_ALPHA_TO_COVERAGE,
551 ctx->blend->base.alpha_to_coverage);
552
553 /* Get blending setup */
554 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
555
556 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
557
558 for (unsigned c = 0; c < rt_count; ++c)
559 blend[c] = panfrost_get_blend_for_context(ctx, c);
560
561 /* Disable shader execution if we can */
562 if (dev->quirks & MIDGARD_SHADERLESS
563 && !panfrost_fs_required(fs, blend, rt_count)) {
564 fragmeta->shader = 0;
565 fragmeta->attribute_count = 0;
566 fragmeta->varying_count = 0;
567 fragmeta->texture_count = 0;
568 fragmeta->sampler_count = 0;
569
570 /* This feature is not known to work on Bifrost */
571 fragmeta->midgard1.work_count = 1;
572 fragmeta->midgard1.uniform_count = 0;
573 fragmeta->midgard1.uniform_buffer_count = 0;
574 }
575
576 /* If there is a blend shader, work registers are shared. We impose 8
577 * work registers as a limit for blend shaders. Should be lower XXX */
578
579 if (!(dev->quirks & IS_BIFROST)) {
580 for (unsigned c = 0; c < rt_count; ++c) {
581 if (blend[c].is_shader) {
582 fragmeta->midgard1.work_count =
583 MAX2(fragmeta->midgard1.work_count, 8);
584 }
585 }
586 }
587
588 /* Even on MFBD, the shader descriptor gets blend shaders. It's *also*
589 * copied to the blend_meta appended (by convention), but this is the
590 * field actually read by the hardware. (Or maybe both are read...?).
591 * Specify the last RTi with a blend shader. */
592
593 fragmeta->blend.shader = 0;
594
595 for (signed rt = (rt_count - 1); rt >= 0; --rt) {
596 if (!blend[rt].is_shader)
597 continue;
598
599 fragmeta->blend.shader = blend[rt].shader.gpu |
600 blend[rt].shader.first_tag;
601 break;
602 }
603
604 if (dev->quirks & MIDGARD_SFBD) {
605 /* On single render target (SFBD) platforms, the blend
606 * information is inside the shader meta itself. We additionally
607 * need to signal CAN_DISCARD for nontrivial blend modes (so
608 * we're able to read back the destination buffer) */
609
610 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
611 blend[0].is_shader);
612
613 if (!blend[0].is_shader) {
614 fragmeta->blend.equation = *blend[0].equation.equation;
615 fragmeta->blend.constant = blend[0].equation.constant;
616 }
617
618 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
619 !blend[0].no_blending || fs->can_discard);
620
621 batch->draws |= PIPE_CLEAR_COLOR0;
622 return;
623 }
624
625 if (dev->quirks & IS_BIFROST) {
626 bool no_blend = true;
627
628 for (unsigned i = 0; i < rt_count; ++i)
629 no_blend &= (blend[i].no_blending | blend[i].no_colour);
630
631 SET_BIT(fragmeta->bifrost1.unk1, MALI_BIFROST_EARLY_Z,
632 !fs->can_discard && !fs->writes_depth && no_blend);
633 }
634
635 /* Additional blend descriptor tacked on for jobs using MFBD */
636
637 for (unsigned i = 0; i < rt_count; ++i) {
638 unsigned flags = 0;
639
640 if (ctx->pipe_framebuffer.nr_cbufs > i && !blend[i].no_colour) {
641 flags = 0x200;
642 batch->draws |= (PIPE_CLEAR_COLOR0 << i);
643
644 bool is_srgb = (ctx->pipe_framebuffer.nr_cbufs > i) &&
645 (ctx->pipe_framebuffer.cbufs[i]) &&
646 util_format_is_srgb(ctx->pipe_framebuffer.cbufs[i]->format);
647
648 SET_BIT(flags, MALI_BLEND_MRT_SHADER, blend[i].is_shader);
649 SET_BIT(flags, MALI_BLEND_LOAD_TIB, !blend[i].no_blending);
650 SET_BIT(flags, MALI_BLEND_SRGB, is_srgb);
651 SET_BIT(flags, MALI_BLEND_NO_DITHER, !ctx->blend->base.dither);
652 }
653
654 if (dev->quirks & IS_BIFROST) {
655 struct bifrost_blend_rt *brts = rts;
656
657 brts[i].flags = flags;
658
659 if (blend[i].is_shader) {
660 /* The blend shader's address needs to be at
661 * the same top 32 bits as the fragment shader.
662 * TODO: Ensure that's always the case.
663 */
664 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
665 (fs->bo->gpu & (0xffffffffull << 32)));
666 brts[i].shader = blend[i].shader.gpu;
667 brts[i].unk2 = 0x0;
668 } else if (ctx->pipe_framebuffer.nr_cbufs > i) {
669 enum pipe_format format = ctx->pipe_framebuffer.cbufs[i]->format;
670 const struct util_format_description *format_desc;
671 format_desc = util_format_description(format);
672
673 brts[i].equation = *blend[i].equation.equation;
674
675 /* TODO: this is a bit more complicated */
676 brts[i].constant = blend[i].equation.constant;
677
678 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
679
680 /* 0x19 disables blending and forces REPLACE
681 * mode (equivalent to rgb_mode = alpha_mode =
682 * x122, colour mask = 0xF). 0x1a allows
683 * blending. */
684 brts[i].unk2 = blend[i].no_blending ? 0x19 : 0x1a;
685
686 brts[i].shader_type = fs->blend_types[i];
687 } else {
688 /* Dummy attachment for depth-only */
689 brts[i].unk2 = 0x3;
690 brts[i].shader_type = fs->blend_types[i];
691 }
692 } else {
693 struct midgard_blend_rt *mrts = rts;
694 mrts[i].flags = flags;
695
696 if (blend[i].is_shader) {
697 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
698 } else {
699 mrts[i].blend.equation = *blend[i].equation.equation;
700 mrts[i].blend.constant = blend[i].equation.constant;
701 }
702 }
703 }
704 }
705
706 static void
707 panfrost_frag_shader_meta_init(struct panfrost_context *ctx,
708 struct mali_shader_meta *fragmeta,
709 void *rts)
710 {
711 const struct panfrost_device *dev = pan_device(ctx->base.screen);
712 struct panfrost_shader_state *fs;
713
714 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
715
716 bool msaa = ctx->rasterizer->base.multisample;
717 fragmeta->coverage_mask = msaa ? ctx->sample_mask : ~0;
718
719 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x10;
720 fragmeta->unknown2_4 = 0x4e0;
721
722 /* unknown2_4 has 0x10 bit set on T6XX and T720. We don't know why this
723 * is required (independent of 32-bit/64-bit descriptors), or why it's
724 * not used on later GPU revisions. Otherwise, all shader jobs fault on
725 * these earlier chips (perhaps this is a chicken bit of some kind).
726 * More investigation is needed. */
727
728 SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
729
730 if (dev->quirks & IS_BIFROST) {
731 /* TODO */
732 } else {
733 /* Depending on whether it's legal to do so in the given shader, we try to
734 * enable early-z testing. TODO: respect e-z force */
735
736 SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
737 !fs->can_discard && !fs->writes_global &&
738 !fs->writes_depth && !fs->writes_stencil &&
739 !ctx->blend->base.alpha_to_coverage);
740
741 /* Add the writes Z/S flags if needed. */
742 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
743 SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
744
745 /* Any time texturing is used, derivatives are implicitly calculated,
746 * so we need to enable helper invocations */
747
748 SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
749 fs->helper_invocations);
750
751 /* If discard is enabled, which bit we set to convey this
752 * depends on whether depth/stencil is used for the draw or not.
753 * Just one of depth OR stencil is enough to trigger this. */
754
755 const struct pipe_depth_stencil_alpha_state *zsa = &ctx->depth_stencil->base;
756 bool zs_enabled =
757 fs->writes_depth || fs->writes_stencil ||
758 (zsa->depth.enabled && zsa->depth.func != PIPE_FUNC_ALWAYS) ||
759 zsa->stencil[0].enabled;
760
761 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_TILEBUFFER,
762 fs->outputs_read || (!zs_enabled && fs->can_discard));
763 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_ZS, zs_enabled && fs->can_discard);
764 }
765
766 panfrost_frag_meta_rasterizer_update(ctx, fragmeta);
767 panfrost_frag_meta_zsa_update(ctx, fragmeta);
768 panfrost_frag_meta_blend_update(ctx, fragmeta, rts);
769 }
770
771 void
772 panfrost_emit_shader_meta(struct panfrost_batch *batch,
773 enum pipe_shader_type st,
774 struct mali_vertex_tiler_postfix *postfix)
775 {
776 struct panfrost_context *ctx = batch->ctx;
777 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
778
779 if (!ss) {
780 postfix->shader = 0;
781 return;
782 }
783
784 struct mali_shader_meta meta;
785
786 panfrost_shader_meta_init(ctx, st, &meta);
787
788 /* Add the shader BO to the batch. */
789 panfrost_batch_add_bo(batch, ss->bo,
790 PAN_BO_ACCESS_PRIVATE |
791 PAN_BO_ACCESS_READ |
792 panfrost_bo_access_for_stage(st));
793
794 mali_ptr shader_ptr;
795
796 if (st == PIPE_SHADER_FRAGMENT) {
797 struct panfrost_device *dev = pan_device(ctx->base.screen);
798 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
799 size_t desc_size = sizeof(meta);
800 void *rts = NULL;
801 struct panfrost_transfer xfer;
802 unsigned rt_size;
803
804 if (dev->quirks & MIDGARD_SFBD)
805 rt_size = 0;
806 else if (dev->quirks & IS_BIFROST)
807 rt_size = sizeof(struct bifrost_blend_rt);
808 else
809 rt_size = sizeof(struct midgard_blend_rt);
810
811 desc_size += rt_size * rt_count;
812
813 if (rt_size)
814 rts = rzalloc_size(ctx, rt_size * rt_count);
815
816 panfrost_frag_shader_meta_init(ctx, &meta, rts);
817
818 xfer = panfrost_pool_alloc(&batch->pool, desc_size);
819
820 memcpy(xfer.cpu, &meta, sizeof(meta));
821 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
822
823 if (rt_size)
824 ralloc_free(rts);
825
826 shader_ptr = xfer.gpu;
827 } else {
828 shader_ptr = panfrost_pool_upload(&batch->pool, &meta,
829 sizeof(meta));
830 }
831
832 postfix->shader = shader_ptr;
833 }
834
835 void
836 panfrost_emit_viewport(struct panfrost_batch *batch,
837 struct mali_vertex_tiler_postfix *tiler_postfix)
838 {
839 struct panfrost_context *ctx = batch->ctx;
840 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
841 const struct pipe_scissor_state *ss = &ctx->scissor;
842 const struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
843 const struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer;
844
845 /* Derive min/max from translate/scale. Note since |x| >= 0 by
846 * definition, we have that -|x| <= |x| hence translate - |scale| <=
847 * translate + |scale|, so the ordering is correct here. */
848 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
849 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
850 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
851 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
852 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
853 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
854
855 /* Scissor to the intersection of the viewport and the scissor, clamped
856 * to the framebuffer */
857
858 unsigned minx = MIN2(fb->width, vp_minx);
859 unsigned maxx = MIN2(fb->width, vp_maxx);
860 unsigned miny = MIN2(fb->height, vp_miny);
861 unsigned maxy = MIN2(fb->height, vp_maxy);
862
863 if (ss && rast->scissor) {
864 minx = MAX2(ss->minx, minx);
865 miny = MAX2(ss->miny, miny);
866 maxx = MIN2(ss->maxx, maxx);
867 maxy = MIN2(ss->maxy, maxy);
868 }
869
870 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool, MALI_VIEWPORT_LENGTH);
871
872 pan_pack(T.cpu, VIEWPORT, cfg) {
873 cfg.scissor_minimum_x = minx;
874 cfg.scissor_minimum_y = miny;
875 cfg.scissor_maximum_x = maxx - 1;
876 cfg.scissor_maximum_y = maxy - 1;
877
878 cfg.minimum_z = rast->depth_clip_near ? minz : -INFINITY;
879 cfg.maximum_z = rast->depth_clip_far ? maxz : INFINITY;
880 }
881
882 tiler_postfix->viewport = T.gpu;
883 panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy);
884 }
885
886 static mali_ptr
887 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
888 enum pipe_shader_type st,
889 struct panfrost_constant_buffer *buf,
890 unsigned index)
891 {
892 struct pipe_constant_buffer *cb = &buf->cb[index];
893 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
894
895 if (rsrc) {
896 panfrost_batch_add_bo(batch, rsrc->bo,
897 PAN_BO_ACCESS_SHARED |
898 PAN_BO_ACCESS_READ |
899 panfrost_bo_access_for_stage(st));
900
901 /* Alignment guaranteed by
902 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
903 return rsrc->bo->gpu + cb->buffer_offset;
904 } else if (cb->user_buffer) {
905 return panfrost_pool_upload(&batch->pool,
906 cb->user_buffer +
907 cb->buffer_offset,
908 cb->buffer_size);
909 } else {
910 unreachable("No constant buffer");
911 }
912 }
913
914 struct sysval_uniform {
915 union {
916 float f[4];
917 int32_t i[4];
918 uint32_t u[4];
919 uint64_t du[2];
920 };
921 };
922
923 static void
924 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
925 struct sysval_uniform *uniform)
926 {
927 struct panfrost_context *ctx = batch->ctx;
928 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
929
930 uniform->f[0] = vp->scale[0];
931 uniform->f[1] = vp->scale[1];
932 uniform->f[2] = vp->scale[2];
933 }
934
935 static void
936 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
937 struct sysval_uniform *uniform)
938 {
939 struct panfrost_context *ctx = batch->ctx;
940 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
941
942 uniform->f[0] = vp->translate[0];
943 uniform->f[1] = vp->translate[1];
944 uniform->f[2] = vp->translate[2];
945 }
946
947 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
948 enum pipe_shader_type st,
949 unsigned int sysvalid,
950 struct sysval_uniform *uniform)
951 {
952 struct panfrost_context *ctx = batch->ctx;
953 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
954 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
955 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
956 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
957
958 assert(dim);
959 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
960
961 if (dim > 1)
962 uniform->i[1] = u_minify(tex->texture->height0,
963 tex->u.tex.first_level);
964
965 if (dim > 2)
966 uniform->i[2] = u_minify(tex->texture->depth0,
967 tex->u.tex.first_level);
968
969 if (is_array)
970 uniform->i[dim] = tex->texture->array_size;
971 }
972
973 static void
974 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
975 enum pipe_shader_type st,
976 unsigned ssbo_id,
977 struct sysval_uniform *uniform)
978 {
979 struct panfrost_context *ctx = batch->ctx;
980
981 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
982 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
983
984 /* Compute address */
985 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
986
987 panfrost_batch_add_bo(batch, bo,
988 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
989 panfrost_bo_access_for_stage(st));
990
991 /* Upload address and size as sysval */
992 uniform->du[0] = bo->gpu + sb.buffer_offset;
993 uniform->u[2] = sb.buffer_size;
994 }
995
996 static void
997 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
998 enum pipe_shader_type st,
999 unsigned samp_idx,
1000 struct sysval_uniform *uniform)
1001 {
1002 struct panfrost_context *ctx = batch->ctx;
1003 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1004
1005 uniform->f[0] = sampl->min_lod;
1006 uniform->f[1] = sampl->max_lod;
1007 uniform->f[2] = sampl->lod_bias;
1008
1009 /* Even without any errata, Midgard represents "no mipmapping" as
1010 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1011 * panfrost_create_sampler_state which also explains our choice of
1012 * epsilon value (again to keep behaviour consistent) */
1013
1014 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1015 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1016 }
1017
1018 static void
1019 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1020 struct sysval_uniform *uniform)
1021 {
1022 struct panfrost_context *ctx = batch->ctx;
1023
1024 uniform->u[0] = ctx->compute_grid->grid[0];
1025 uniform->u[1] = ctx->compute_grid->grid[1];
1026 uniform->u[2] = ctx->compute_grid->grid[2];
1027 }
1028
1029 static void
1030 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1031 struct panfrost_shader_state *ss,
1032 enum pipe_shader_type st)
1033 {
1034 struct sysval_uniform *uniforms = (void *)buf;
1035
1036 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1037 int sysval = ss->sysval[i];
1038
1039 switch (PAN_SYSVAL_TYPE(sysval)) {
1040 case PAN_SYSVAL_VIEWPORT_SCALE:
1041 panfrost_upload_viewport_scale_sysval(batch,
1042 &uniforms[i]);
1043 break;
1044 case PAN_SYSVAL_VIEWPORT_OFFSET:
1045 panfrost_upload_viewport_offset_sysval(batch,
1046 &uniforms[i]);
1047 break;
1048 case PAN_SYSVAL_TEXTURE_SIZE:
1049 panfrost_upload_txs_sysval(batch, st,
1050 PAN_SYSVAL_ID(sysval),
1051 &uniforms[i]);
1052 break;
1053 case PAN_SYSVAL_SSBO:
1054 panfrost_upload_ssbo_sysval(batch, st,
1055 PAN_SYSVAL_ID(sysval),
1056 &uniforms[i]);
1057 break;
1058 case PAN_SYSVAL_NUM_WORK_GROUPS:
1059 panfrost_upload_num_work_groups_sysval(batch,
1060 &uniforms[i]);
1061 break;
1062 case PAN_SYSVAL_SAMPLER:
1063 panfrost_upload_sampler_sysval(batch, st,
1064 PAN_SYSVAL_ID(sysval),
1065 &uniforms[i]);
1066 break;
1067 default:
1068 assert(0);
1069 }
1070 }
1071 }
1072
1073 static const void *
1074 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1075 unsigned index)
1076 {
1077 struct pipe_constant_buffer *cb = &buf->cb[index];
1078 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1079
1080 if (rsrc)
1081 return rsrc->bo->cpu;
1082 else if (cb->user_buffer)
1083 return cb->user_buffer;
1084 else
1085 unreachable("No constant buffer");
1086 }
1087
1088 void
1089 panfrost_emit_const_buf(struct panfrost_batch *batch,
1090 enum pipe_shader_type stage,
1091 struct mali_vertex_tiler_postfix *postfix)
1092 {
1093 struct panfrost_context *ctx = batch->ctx;
1094 struct panfrost_shader_variants *all = ctx->shader[stage];
1095
1096 if (!all)
1097 return;
1098
1099 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1100
1101 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1102
1103 /* Uniforms are implicitly UBO #0 */
1104 bool has_uniforms = buf->enabled_mask & (1 << 0);
1105
1106 /* Allocate room for the sysval and the uniforms */
1107 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1108 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1109 size_t size = sys_size + uniform_size;
1110 struct panfrost_transfer transfer = panfrost_pool_alloc(&batch->pool,
1111 size);
1112
1113 /* Upload sysvals requested by the shader */
1114 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1115
1116 /* Upload uniforms */
1117 if (has_uniforms && uniform_size) {
1118 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1119 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1120 }
1121
1122 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1123 * uploaded */
1124
1125 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1126 assert(ubo_count >= 1);
1127
1128 size_t sz = MALI_UNIFORM_BUFFER_LENGTH * ubo_count;
1129 struct panfrost_transfer ubos = panfrost_pool_alloc(&batch->pool, sz);
1130 uint64_t *ubo_ptr = (uint64_t *) ubos.cpu;
1131
1132 /* Upload uniforms as a UBO */
1133
1134 if (ss->uniform_count) {
1135 pan_pack(ubo_ptr, UNIFORM_BUFFER, cfg) {
1136 cfg.entries = ss->uniform_count;
1137 cfg.pointer = transfer.gpu;
1138 }
1139 } else {
1140 *ubo_ptr = 0;
1141 }
1142
1143 /* The rest are honest-to-goodness UBOs */
1144
1145 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1146 size_t usz = buf->cb[ubo].buffer_size;
1147 bool enabled = buf->enabled_mask & (1 << ubo);
1148 bool empty = usz == 0;
1149
1150 if (!enabled || empty) {
1151 ubo_ptr[ubo] = 0;
1152 continue;
1153 }
1154
1155 pan_pack(ubo_ptr + ubo, UNIFORM_BUFFER, cfg) {
1156 cfg.entries = DIV_ROUND_UP(usz, 16);
1157 cfg.pointer = panfrost_map_constant_buffer_gpu(batch,
1158 stage, buf, ubo);
1159 }
1160 }
1161
1162 postfix->uniforms = transfer.gpu;
1163 postfix->uniform_buffers = ubos.gpu;
1164
1165 buf->dirty_mask = 0;
1166 }
1167
1168 void
1169 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1170 const struct pipe_grid_info *info,
1171 struct midgard_payload_vertex_tiler *vtp)
1172 {
1173 struct panfrost_context *ctx = batch->ctx;
1174 struct panfrost_device *dev = pan_device(ctx->base.screen);
1175 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1176 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1177 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1178 128));
1179
1180 unsigned log2_instances =
1181 util_logbase2_ceil(info->grid[0]) +
1182 util_logbase2_ceil(info->grid[1]) +
1183 util_logbase2_ceil(info->grid[2]);
1184
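/* Size for the worst case: 1 << log2_instances workgroups (a power-of-two
 * upper bound on grid[0] * grid[1] * grid[2], since each factor is rounded
 * up to a power of two) each needing single_size bytes, replicated per core. */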
1185 unsigned shared_size = single_size * (1 << log2_instances) * dev->core_count;
1186 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1187 shared_size,
1188 1);
1189
1190 struct mali_shared_memory shared = {
1191 .shared_memory = bo->gpu,
1192 .shared_workgroup_count = log2_instances,
1193 .shared_shift = util_logbase2(single_size) + 1
1194 };
1195
1196 vtp->postfix.shared_memory = panfrost_pool_upload(&batch->pool, &shared,
1197 sizeof(shared));
1198 }
1199
1200 static mali_ptr
1201 panfrost_get_tex_desc(struct panfrost_batch *batch,
1202 enum pipe_shader_type st,
1203 struct panfrost_sampler_view *view)
1204 {
1205 if (!view)
1206 return (mali_ptr) 0;
1207
1208 struct pipe_sampler_view *pview = &view->base;
1209 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1210
1211 /* Add the BO to the job so it's retained until the job is done. */
1212
1213 panfrost_batch_add_bo(batch, rsrc->bo,
1214 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1215 panfrost_bo_access_for_stage(st));
1216
1217 panfrost_batch_add_bo(batch, view->bo,
1218 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1219 panfrost_bo_access_for_stage(st));
1220
1221 return view->bo->gpu;
1222 }
1223
1224 static void
1225 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
1226 struct pipe_context *pctx)
1227 {
1228 struct panfrost_resource *rsrc = pan_resource(view->base.texture);
1229 if (view->texture_bo != rsrc->bo->gpu ||
1230 view->modifier != rsrc->modifier) {
1231 panfrost_bo_unreference(view->bo);
1232 panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
1233 }
1234 }
1235
1236 void
1237 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1238 enum pipe_shader_type stage,
1239 struct mali_vertex_tiler_postfix *postfix)
1240 {
1241 struct panfrost_context *ctx = batch->ctx;
1242 struct panfrost_device *device = pan_device(ctx->base.screen);
1243
1244 if (!ctx->sampler_view_count[stage])
1245 return;
1246
1247 if (device->quirks & IS_BIFROST) {
1248 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool,
1249 MALI_BIFROST_TEXTURE_LENGTH *
1250 ctx->sampler_view_count[stage]);
1251
1252 struct mali_bifrost_texture_packed *out =
1253 (struct mali_bifrost_texture_packed *) T.cpu;
1254
1255 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1256 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1257 struct pipe_sampler_view *pview = &view->base;
1258 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1259
1260 panfrost_update_sampler_view(view, &ctx->base);
1261 out[i] = view->bifrost_descriptor;
1262
1263 /* Add the BOs to the job so they are retained until the job is done. */
1264
1265 panfrost_batch_add_bo(batch, rsrc->bo,
1266 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1267 panfrost_bo_access_for_stage(stage));
1268
1269 panfrost_batch_add_bo(batch, view->bo,
1270 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1271 panfrost_bo_access_for_stage(stage));
1272 }
1273
1274 postfix->textures = T.gpu;
1275 } else {
1276 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1277
1278 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1279 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1280
1281 panfrost_update_sampler_view(view, &ctx->base);
1282
1283 trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1284 }
1285
1286 postfix->textures = panfrost_pool_upload(&batch->pool,
1287 trampolines,
1288 sizeof(uint64_t) *
1289 ctx->sampler_view_count[stage]);
1290 }
1291 }
1292
1293 void
1294 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1295 enum pipe_shader_type stage,
1296 struct mali_vertex_tiler_postfix *postfix)
1297 {
1298 struct panfrost_context *ctx = batch->ctx;
1299
1300 if (!ctx->sampler_count[stage])
1301 return;
1302
1303 size_t desc_size = MALI_BIFROST_SAMPLER_LENGTH;
1304 assert(MALI_BIFROST_SAMPLER_LENGTH == MALI_MIDGARD_SAMPLER_LENGTH);
1305
1306 size_t sz = desc_size * ctx->sampler_count[stage];
1307 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool, sz);
1308 struct mali_midgard_sampler_packed *out = (struct mali_midgard_sampler_packed *) T.cpu;
1309
1310 for (unsigned i = 0; i < ctx->sampler_count[stage]; ++i)
1311 out[i] = ctx->samplers[stage][i]->hw;
1312
1313 postfix->sampler_descriptor = T.gpu;
1314 }
1315
1316 void
1317 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1318 struct mali_vertex_tiler_postfix *vertex_postfix)
1319 {
1320 struct panfrost_context *ctx = batch->ctx;
1321 struct panfrost_vertex_state *so = ctx->vertex;
1322
1323 unsigned instance_shift = vertex_postfix->instance_shift;
1324 unsigned instance_odd = vertex_postfix->instance_odd;
1325
1326 /* Worst case: everything is NPOT */
1327
1328 struct panfrost_transfer S = panfrost_pool_alloc(&batch->pool,
1329 MALI_ATTRIBUTE_LENGTH * PIPE_MAX_ATTRIBS * 2);
1330
1331 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool,
1332 MALI_ATTRIBUTE_LENGTH * (PAN_INSTANCE_ID + 1));
1333
1334 struct mali_attribute_buffer_packed *bufs =
1335 (struct mali_attribute_buffer_packed *) S.cpu;
1336
1337 struct mali_attribute_packed *out =
1338 (struct mali_attribute_packed *) T.cpu;
1339
1340 unsigned attrib_to_buffer[PIPE_MAX_ATTRIBS] = { 0 };
1341 unsigned k = 0;
1342
1343 for (unsigned i = 0; i < so->num_elements; ++i) {
1344 /* We map buffers 1:1 with the attributes, which
1345 * means duplicating some vertex buffers (who cares? aside from
1346 * maybe some caching implications but I somehow doubt that
1347 * matters) */
1348
1349 struct pipe_vertex_element *elem = &so->pipe[i];
1350 unsigned vbi = elem->vertex_buffer_index;
1351 attrib_to_buffer[i] = k;
1352
1353 if (!(ctx->vb_mask & (1 << vbi)))
1354 continue;
1355
1356 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1357 struct panfrost_resource *rsrc;
1358
1359 rsrc = pan_resource(buf->buffer.resource);
1360 if (!rsrc)
1361 continue;
1362
1363 /* Add a dependency of the batch on the vertex buffer */
1364 panfrost_batch_add_bo(batch, rsrc->bo,
1365 PAN_BO_ACCESS_SHARED |
1366 PAN_BO_ACCESS_READ |
1367 PAN_BO_ACCESS_VERTEX_TILER);
1368
1369 /* Mask off lower bits, see offset fixup below */
1370 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1371 mali_ptr addr = raw_addr & ~63;
1372
1373 /* Since we advanced the base pointer, we shrink the buffer
1374 * size, but add the offset we subtracted */
1375 unsigned size = rsrc->base.width0 + (raw_addr - addr)
1376 - buf->buffer_offset;
1377
1378 /* When there is a divisor, the hardware-level divisor is
1379 * the product of the instance divisor and the padded count */
1380 unsigned divisor = elem->instance_divisor;
1381 unsigned hw_divisor = ctx->padded_count * divisor;
1382 unsigned stride = buf->stride;
1383
1384 /* If there's a divisor (>= 1) but no instancing, we want every
1385 * attribute to be the same */
1386
1387 if (divisor && ctx->instance_count == 1)
1388 stride = 0;
1389
1390 if (!divisor || ctx->instance_count <= 1) {
1391 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1392 if (ctx->instance_count > 1)
1393 cfg.type = MALI_ATTRIBUTE_TYPE_1D_MODULUS;
1394
1395 cfg.pointer = addr;
1396 cfg.stride = stride;
1397 cfg.size = size;
1398 cfg.divisor_r = instance_shift;
1399 cfg.divisor_p = instance_odd;
1400 }
1401 } else if (util_is_power_of_two_or_zero(hw_divisor)) {
1402 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1403 cfg.type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
1404 cfg.pointer = addr;
1405 cfg.stride = stride;
1406 cfg.size = size;
1407 cfg.divisor_r = __builtin_ctz(hw_divisor);
1408 }
1409
1410 } else {
1411 unsigned shift = 0, extra_flags = 0;
1412
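/* Non-power-of-two divisors are handled with a fixed-point reciprocal (the
 * "magic" numerator) plus a shift and a correction flag, letting the hardware
 * divide by hw_divisor with a multiply and shift rather than a true divide. */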
1413 unsigned magic_divisor =
1414 panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags);
1415
1416 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1417 cfg.type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
1418 cfg.pointer = addr;
1419 cfg.stride = stride;
1420 cfg.size = size;
1421
1422 cfg.divisor_r = shift;
1423 cfg.divisor_e = extra_flags;
1424 }
1425
1426 pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) {
1427 cfg.divisor_numerator = magic_divisor;
1428 cfg.divisor = divisor;
1429 }
1430
1431 ++k;
1432 }
1433
1434 ++k;
1435 }
1436
1437 /* Add special gl_VertexID/gl_InstanceID buffers */
1438
1439 panfrost_vertex_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1440
1441 pan_pack(out + PAN_VERTEX_ID, ATTRIBUTE, cfg) {
1442 cfg.buffer_index = k++;
1443 cfg.format = so->formats[PAN_VERTEX_ID];
1444 }
1445
1446 panfrost_instance_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1447
1448 pan_pack(out + PAN_INSTANCE_ID, ATTRIBUTE, cfg) {
1449 cfg.buffer_index = k++;
1450 cfg.format = so->formats[PAN_INSTANCE_ID];
1451 }
1452
1453 /* Attribute addresses require 64-byte alignment, so let:
1454 *
1455 * base' = base & ~63 = base - (base & 63)
1456 * offset' = offset + (base & 63)
1457 *
1458 * Since base' + offset' = base + offset, these are equivalent
1459 * addressing modes and now base is 64-byte aligned.
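* For example, base = 0x10027 yields base' = 0x10000 and offset' = offset + 0x27.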
1460 */
1461
1462 unsigned start = vertex_postfix->offset_start;
1463
1464 for (unsigned i = 0; i < so->num_elements; ++i) {
1465 unsigned vbi = so->pipe[i].vertex_buffer_index;
1466 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1467
1468 /* Adjust by the masked off bits of the offset. Make sure we
1469 * read src_offset from so->hw (which is not GPU visible)
1470 * rather than target (which is) due to caching effects */
1471
1472 unsigned src_offset = so->pipe[i].src_offset;
1473
1474 /* BOs aligned to 4k so guaranteed aligned to 64 */
1475 src_offset += (buf->buffer_offset & 63);
1476
1477 /* Also, somewhat obscurely, per-instance data needs to be
1478 * offset in response to a delayed start in an indexed draw */
1479
1480 if (so->pipe[i].instance_divisor && ctx->instance_count > 1 && start)
1481 src_offset -= buf->stride * start;
1482
1483 pan_pack(out + i, ATTRIBUTE, cfg) {
1484 cfg.buffer_index = attrib_to_buffer[i];
1485 cfg.format = so->formats[i];
1486 cfg.offset = src_offset;
1487 }
1488 }
1489
1490 vertex_postfix->attributes = S.gpu;
1491 vertex_postfix->attribute_meta = T.gpu;
1492 }
1493
1494 static mali_ptr
1495 panfrost_emit_varyings(struct panfrost_batch *batch,
1496 struct mali_attribute_buffer_packed *slot,
1497 unsigned stride, unsigned count)
1498 {
1499 unsigned size = stride * count;
1500 mali_ptr ptr = panfrost_pool_alloc(&batch->invisible_pool, size).gpu;
1501
1502 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1503 cfg.stride = stride;
1504 cfg.size = size;
1505 cfg.pointer = ptr;
1506 }
1507
1508 return ptr;
1509 }
1510
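/* The streamout buffer pointer gets rounded down to 64 bytes when packed into
 * the attribute buffer (see panfrost_emit_streamout below), so return the low
 * six bits of the byte offset here; they are added back into each varying
 * record's offset instead. */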
1511 static unsigned
1512 panfrost_streamout_offset(unsigned stride, unsigned offset,
1513 struct pipe_stream_output_target *target)
1514 {
1515 return (target->buffer_offset + (offset * stride * 4)) & 63;
1516 }
1517
1518 static void
1519 panfrost_emit_streamout(struct panfrost_batch *batch,
1520 struct mali_attribute_buffer_packed *slot,
1521 unsigned stride_words, unsigned offset, unsigned count,
1522 struct pipe_stream_output_target *target)
1523 {
1524 unsigned stride = stride_words * 4;
1525 unsigned max_size = target->buffer_size;
1526 unsigned expected_size = stride * count;
1527
1528 /* Grab the BO and bind it to the batch */
1529 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1530
1531 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1532 * the perspective of the TILER and FRAGMENT.
1533 */
1534 panfrost_batch_add_bo(batch, bo,
1535 PAN_BO_ACCESS_SHARED |
1536 PAN_BO_ACCESS_RW |
1537 PAN_BO_ACCESS_VERTEX_TILER |
1538 PAN_BO_ACCESS_FRAGMENT);
1539
1540 /* We will have an offset applied to get alignment */
1541 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * stride);
1542
1543 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1544 cfg.pointer = (addr & ~63);
1545 cfg.stride = stride;
1546 cfg.size = MIN2(max_size, expected_size) + (addr & 63);
1547 }
1548 }
1549
1550 static bool
1551 has_point_coord(unsigned mask, gl_varying_slot loc)
1552 {
1553 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1554 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1555 else if (loc == VARYING_SLOT_PNTC)
1556 return (mask & (1 << 8));
1557 else
1558 return false;
1559 }
1560
1561 /* Helpers for manipulating stream out information so we can pack varyings
1562 * accordingly. Compute the src_offset for a given captured varying */
1563
1564 static struct pipe_stream_output *
1565 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1566 {
1567 for (unsigned i = 0; i < info->num_outputs; ++i) {
1568 if (info->output[i].register_index == loc)
1569 return &info->output[i];
1570 }
1571
1572 unreachable("Varying not captured");
1573 }
1574
1575 static unsigned
1576 pan_varying_size(enum mali_format fmt)
1577 {
1578 unsigned type = MALI_EXTRACT_TYPE(fmt);
1579 unsigned chan = MALI_EXTRACT_CHANNELS(fmt);
1580 unsigned bits = MALI_EXTRACT_BITS(fmt);
1581 unsigned bpc = 0;
1582
1583 if (bits == MALI_CHANNEL_FLOAT) {
1584 /* No doubles */
1585 bool fp16 = (type == MALI_FORMAT_SINT);
1586 assert(fp16 || (type == MALI_FORMAT_UNORM));
1587
1588 bpc = fp16 ? 2 : 4;
1589 } else {
1590 assert(type >= MALI_FORMAT_SNORM && type <= MALI_FORMAT_SINT);
1591
1592 /* See the enums */
1593 bits = 1 << bits;
1594 assert(bits >= 8);
1595 bpc = bits / 8;
1596 }
1597
1598 return bpc * chan;
1599 }
1600
1601 /* Indices for named (non-XFB) varyings that are present. These are packed
1602 * tightly so they correspond to a bitfield present (P) indexed by (1 <<
1603 * PAN_VARY_*). This has the nice property that you can lookup the buffer index
1604 * of a given special field given a shift S by:
1605 *
1606 * idx = popcount(P & ((1 << S) - 1))
1607 *
1608 * That is... look at all of the varyings that come earlier and count them; that
1609 * count is the new index. Likewise, the total number of special
1610 * buffers required is simply popcount(P)
1611 */
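/* For example, with P = {GENERAL, POSITION, PNTCOORD}, the point coord buffer
 * lands at index popcount(P & ((1 << PAN_VARY_PNTCOORD) - 1)) = 2, i.e. right
 * after the general and position buffers. */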
1612
1613 enum pan_special_varying {
1614 PAN_VARY_GENERAL = 0,
1615 PAN_VARY_POSITION = 1,
1616 PAN_VARY_PSIZ = 2,
1617 PAN_VARY_PNTCOORD = 3,
1618 PAN_VARY_FACE = 4,
1619 PAN_VARY_FRAGCOORD = 5,
1620
1621 /* Keep last */
1622 PAN_VARY_MAX,
1623 };
1624
1625 /* Given a varying, figure out which index it corresponds to */
1626
1627 static inline unsigned
1628 pan_varying_index(unsigned present, enum pan_special_varying v)
1629 {
1630 unsigned mask = (1 << v) - 1;
1631 return util_bitcount(present & mask);
1632 }
1633
1634 /* Get the base offset for XFB buffers, which by convention come after
1635 * everything else. Wrapper function for semantic reasons; by construction this
1636 * is just popcount. */
1637
1638 static inline unsigned
1639 pan_xfb_base(unsigned present)
1640 {
1641 return util_bitcount(present);
1642 }
1643
1644 /* Computes the present mask for varyings so we can start emitting varying records */
1645
1646 static inline unsigned
1647 pan_varying_present(
1648 struct panfrost_shader_state *vs,
1649 struct panfrost_shader_state *fs,
1650 unsigned quirks)
1651 {
1652 /* At the moment we always emit general and position buffers. Not
1653 * strictly necessary but usually harmless */
1654
1655 unsigned present = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION);
1656
1657 /* Enable special buffers by the shader info */
1658
1659 if (vs->writes_point_size)
1660 present |= (1 << PAN_VARY_PSIZ);
1661
1662 if (fs->reads_point_coord)
1663 present |= (1 << PAN_VARY_PNTCOORD);
1664
1665 if (fs->reads_face)
1666 present |= (1 << PAN_VARY_FACE);
1667
1668 if (fs->reads_frag_coord && !(quirks & IS_BIFROST))
1669 present |= (1 << PAN_VARY_FRAGCOORD);
1670
1671 /* Also, if we have a point sprite, we need a point coord buffer */
1672
1673 for (unsigned i = 0; i < fs->varying_count; i++) {
1674 gl_varying_slot loc = fs->varyings_loc[i];
1675
1676 if (has_point_coord(fs->point_sprite_mask, loc))
1677 present |= (1 << PAN_VARY_PNTCOORD);
1678 }
1679
1680 return present;
1681 }
1682
1683 /* Emitters for varying records */
1684
1685 static void
1686 pan_emit_vary(struct mali_attribute_packed *out,
1687 unsigned present, enum pan_special_varying buf,
1688 unsigned quirks, enum mali_format format,
1689 unsigned offset)
1690 {
1691 unsigned nr_channels = MALI_EXTRACT_CHANNELS(format);
1692 unsigned swizzle = quirks & HAS_SWIZZLES ?
1693 panfrost_get_default_swizzle(nr_channels) :
1694 panfrost_bifrost_swizzle(nr_channels);
1695
1696 pan_pack(out, ATTRIBUTE, cfg) {
1697 cfg.buffer_index = pan_varying_index(present, buf);
1698 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1699 cfg.format = (format << 12) | swizzle;
1700 cfg.offset = offset;
1701 }
1702 }
1703
1704 /* General varying that is unused */
1705
1706 static void
1707 pan_emit_vary_only(struct mali_attribute_packed *out,
1708 unsigned present, unsigned quirks)
1709 {
1710 pan_emit_vary(out, present, 0, quirks, MALI_VARYING_DISCARD, 0);
1711 }
1712
1713 /* Special records */
1714
1715 static const enum mali_format pan_varying_formats[PAN_VARY_MAX] = {
1716 [PAN_VARY_POSITION] = MALI_VARYING_POS,
1717 [PAN_VARY_PSIZ] = MALI_R16F,
1718 [PAN_VARY_PNTCOORD] = MALI_R16F,
1719 [PAN_VARY_FACE] = MALI_R32I,
1720 [PAN_VARY_FRAGCOORD] = MALI_RGBA32F
1721 };
1722
1723 static void
1724 pan_emit_vary_special(struct mali_attribute_packed *out,
1725 unsigned present, enum pan_special_varying buf,
1726 unsigned quirks)
1727 {
1728 assert(buf < PAN_VARY_MAX);
1729 pan_emit_vary(out, present, buf, quirks, pan_varying_formats[buf], 0);
1730 }
1731
1732 static enum mali_format
1733 pan_xfb_format(enum mali_format format, unsigned nr)
1734 {
1735 if (MALI_EXTRACT_BITS(format) == MALI_CHANNEL_FLOAT)
1736 return MALI_R32F | MALI_NR_CHANNELS(nr);
1737 else
1738 return MALI_EXTRACT_TYPE(format) | MALI_NR_CHANNELS(nr) | MALI_CHANNEL_32;
1739 }
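
/* For example, a float varying streamed out with nr = 3 becomes a 3-channel
 * 32-bit float format, while a non-float varying keeps its channel type but is
 * widened to 32 bits per channel, since XFB always captures at full precision. */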
1740
1741 /* Transform feedback records. Note struct pipe_stream_output is (if packed as
1742 * a bitfield) 32-bit, smaller than a 64-bit pointer, so may as well pass by
1743 * value. */
1744
1745 static void
1746 pan_emit_vary_xfb(struct mali_attribute_packed *out,
1747 unsigned present,
1748 unsigned max_xfb,
1749 unsigned *streamout_offsets,
1750 unsigned quirks,
1751 enum mali_format format,
1752 struct pipe_stream_output o)
1753 {
1754 unsigned swizzle = quirks & HAS_SWIZZLES ?
1755 panfrost_get_default_swizzle(o.num_components) :
1756 panfrost_bifrost_swizzle(o.num_components);
1757
1758 pan_pack(out, ATTRIBUTE, cfg) {
1759 /* XFB buffers come after everything else */
1760 cfg.buffer_index = pan_xfb_base(present) + o.output_buffer;
1761 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1762
1763 /* Override number of channels and precision to highp */
1764 cfg.format = (pan_xfb_format(format, o.num_components) << 12) | swizzle;
1765
1766 /* Apply given offsets together */
1767 cfg.offset = (o.dst_offset * 4) /* dwords */
1768 + streamout_offsets[o.output_buffer];
1769 }
1770 }
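
/* Offset example with made-up numbers: for o.dst_offset = 2 (in dwords) and
 * streamout_offsets[o.output_buffer] = 64 (bytes already written to that
 * buffer), the resulting record offset is 2 * 4 + 64 = 72 bytes. */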
1771
1772 /* Determine if we should capture a varying for XFB. This requires actually
1773 * having a buffer for it. If we don't capture it, we'll fall back to a general
1774 * varying path (linked or unlinked, possibly discarding the write) */
1775
1776 static bool
1777 panfrost_xfb_captured(struct panfrost_shader_state *xfb,
1778 unsigned loc, unsigned max_xfb)
1779 {
1780 if (!(xfb->so_mask & (1ll << loc)))
1781 return false;
1782
1783 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1784 return o->output_buffer < max_xfb;
1785 }
1786
1787 static void
1788 pan_emit_general_varying(struct mali_attribute_packed *out,
1789 struct panfrost_shader_state *other,
1790 struct panfrost_shader_state *xfb,
1791 gl_varying_slot loc,
1792 enum mali_format format,
1793 unsigned present,
1794 unsigned quirks,
1795 unsigned *gen_offsets,
1796 enum mali_format *gen_formats,
1797 unsigned *gen_stride,
1798 unsigned idx,
1799 bool should_alloc)
1800 {
1801 /* Check if we're linked */
1802 signed other_idx = -1;
1803
1804 for (unsigned j = 0; j < other->varying_count; ++j) {
1805 if (other->varyings_loc[j] == loc) {
1806 other_idx = j;
1807 break;
1808 }
1809 }
1810
1811 if (other_idx < 0) {
1812 pan_emit_vary_only(out, present, quirks);
1813 return;
1814 }
1815
1816 unsigned offset = gen_offsets[other_idx];
1817
1818 if (should_alloc) {
1819 /* We're linked, so allocate space via a watermark allocation */
1820 enum mali_format alt = other->varyings[other_idx];
1821
1822 /* Do interpolation at minimum precision */
1823 unsigned size_main = pan_varying_size(format);
1824 unsigned size_alt = pan_varying_size(alt);
1825 unsigned size = MIN2(size_main, size_alt);
1826
1827 /* If a varying is marked for XFB but not actually captured, we
1828 * should match the format to the format that would otherwise
1829 * be used for XFB, since dEQP checks for invariance here. It's
1830 * unclear if this is required by the spec. */
1831
1832 if (xfb->so_mask & (1ull << loc)) {
1833 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1834 format = pan_xfb_format(format, o->num_components);
1835 size = pan_varying_size(format);
1836 } else if (size == size_alt) {
1837 format = alt;
1838 }
1839
1840 gen_offsets[idx] = *gen_stride;
1841 gen_formats[other_idx] = format;
1842 offset = *gen_stride;
1843 *gen_stride += size;
1844 }
1845
1846 pan_emit_vary(out, present, PAN_VARY_GENERAL, quirks, format, offset);
1847 }
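
/* Watermark example (hypothetical formats, neither captured for XFB): if the
 * first linked varying interpolates as RGBA32F (16 bytes) and the second as
 * RG16F (4 bytes), the allocation above gives them offsets 0 and 16, and
 * *gen_stride ends at 20, which later becomes the stride of the general
 * varying buffer. */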
1848
1849 /* Higher-level wrapper around all of the above, classifying a varying as
1850 * special, XFB-captured, or general and dispatching to the matching emitter */
1851
1852 static void
1853 panfrost_emit_varying(
1854 struct mali_attribute_packed *out,
1855 struct panfrost_shader_state *stage,
1856 struct panfrost_shader_state *other,
1857 struct panfrost_shader_state *xfb,
1858 unsigned present,
1859 unsigned max_xfb,
1860 unsigned *streamout_offsets,
1861 unsigned quirks,
1862 unsigned *gen_offsets,
1863 enum mali_format *gen_formats,
1864 unsigned *gen_stride,
1865 unsigned idx,
1866 bool should_alloc,
1867 bool is_fragment)
1868 {
1869 gl_varying_slot loc = stage->varyings_loc[idx];
1870 enum mali_format format = stage->varyings[idx];
1871
1872 /* Override format to match linkage */
1873 if (!should_alloc && gen_formats[idx])
1874 format = gen_formats[idx];
1875
1876 if (has_point_coord(stage->point_sprite_mask, loc)) {
1877 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1878 } else if (panfrost_xfb_captured(xfb, loc, max_xfb)) {
1879 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1880 pan_emit_vary_xfb(out, present, max_xfb, streamout_offsets, quirks, format, *o);
1881 } else if (loc == VARYING_SLOT_POS) {
1882 if (is_fragment)
1883 pan_emit_vary_special(out, present, PAN_VARY_FRAGCOORD, quirks);
1884 else
1885 pan_emit_vary_special(out, present, PAN_VARY_POSITION, quirks);
1886 } else if (loc == VARYING_SLOT_PSIZ) {
1887 pan_emit_vary_special(out, present, PAN_VARY_PSIZ, quirks);
1888 } else if (loc == VARYING_SLOT_PNTC) {
1889 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1890 } else if (loc == VARYING_SLOT_FACE) {
1891 pan_emit_vary_special(out, present, PAN_VARY_FACE, quirks);
1892 } else {
1893 pan_emit_general_varying(out, other, xfb, loc, format, present,
1894 quirks, gen_offsets, gen_formats, gen_stride,
1895 idx, should_alloc);
1896 }
1897 }
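
/* For instance, gl_PointSize written by the vertex shader resolves to the PSIZ
 * special record above, whereas an ordinary user varying that is neither a
 * point sprite nor captured for XFB falls through to pan_emit_general_varying. */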
1898
1899 static void
1900 pan_emit_special_input(struct mali_attribute_buffer_packed *out,
1901 unsigned present,
1902 enum pan_special_varying v,
1903 unsigned special)
1904 {
1905 if (present & (1 << v)) {
1906 unsigned idx = pan_varying_index(present, v);
1907
1908 pan_pack(out + idx, ATTRIBUTE_BUFFER, cfg) {
1909 cfg.special = special;
1910 cfg.type = 0;
1911 }
1912 }
1913 }
1914
1915 void
1916 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1917 unsigned vertex_count,
1918 struct mali_vertex_tiler_postfix *vertex_postfix,
1919 struct mali_vertex_tiler_postfix *tiler_postfix,
1920 union midgard_primitive_size *primitive_size)
1921 {
1922 /* Load the shaders */
1923 struct panfrost_context *ctx = batch->ctx;
1924 struct panfrost_device *dev = pan_device(ctx->base.screen);
1925 struct panfrost_shader_state *vs, *fs;
1926 size_t vs_size, fs_size;
1927
1928 /* Allocate the varying descriptor */
1929
1930 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1931 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1932 vs_size = MALI_ATTRIBUTE_LENGTH * vs->varying_count;
1933 fs_size = MALI_ATTRIBUTE_LENGTH * fs->varying_count;
1934
1935 struct panfrost_transfer trans = panfrost_pool_alloc(&batch->pool,
1936 vs_size +
1937 fs_size);
1938
1939 struct pipe_stream_output_info *so = &vs->stream_output;
1940 unsigned present = pan_varying_present(vs, fs, dev->quirks);
1941
1942 /* Check if this varying is linked by us. This is the case for
1943 * general-purpose, non-captured varyings. If it is, link it. If it's
1944 * not, use the provided stream out information to determine the
1945 * offset, since it was already linked for us. */
1946
1947 unsigned gen_offsets[32];
1948 enum mali_format gen_formats[32];
1949 memset(gen_offsets, 0, sizeof(gen_offsets));
1950 memset(gen_formats, 0, sizeof(gen_formats));
1951
1952 unsigned gen_stride = 0;
1953 assert(vs->varying_count < ARRAY_SIZE(gen_offsets));
1954 assert(fs->varying_count < ARRAY_SIZE(gen_offsets));
1955
1956 unsigned streamout_offsets[32];
1957
1958 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1959 streamout_offsets[i] = panfrost_streamout_offset(
1960 so->stride[i],
1961 ctx->streamout.offsets[i],
1962 ctx->streamout.targets[i]);
1963 }
1964
1965 struct mali_attribute_packed *ovs = (struct mali_attribute_packed *)trans.cpu;
1966 struct mali_attribute_packed *ofs = ovs + vs->varying_count;
1967
1968 for (unsigned i = 0; i < vs->varying_count; i++) {
1969 panfrost_emit_varying(ovs + i, vs, fs, vs, present,
1970 ctx->streamout.num_targets, streamout_offsets,
1971 dev->quirks,
1972 gen_offsets, gen_formats, &gen_stride, i, true, false);
1973 }
1974
1975 for (unsigned i = 0; i < fs->varying_count; i++) {
1976 panfrost_emit_varying(ofs + i, fs, vs, vs, present,
1977 ctx->streamout.num_targets, streamout_offsets,
1978 dev->quirks,
1979 gen_offsets, gen_formats, &gen_stride, i, false, true);
1980 }
1981
1982 unsigned xfb_base = pan_xfb_base(present);
1983 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool,
1984 MALI_ATTRIBUTE_BUFFER_LENGTH * (xfb_base + ctx->streamout.num_targets));
1985 struct mali_attribute_buffer_packed *varyings =
1986 (struct mali_attribute_buffer_packed *) T.cpu;
1987
1988 /* Emit the stream out buffers */
1989
1990 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1991 ctx->vertex_count);
1992
1993 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1994 panfrost_emit_streamout(batch, &varyings[xfb_base + i],
1995 so->stride[i],
1996 ctx->streamout.offsets[i],
1997 out_count,
1998 ctx->streamout.targets[i]);
1999 }
2000
2001 panfrost_emit_varyings(batch,
2002 &varyings[pan_varying_index(present, PAN_VARY_GENERAL)],
2003 gen_stride, vertex_count);
2004
2005 /* fp32 vec4 gl_Position */
2006 tiler_postfix->position_varying = panfrost_emit_varyings(batch,
2007 &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
2008 sizeof(float) * 4, vertex_count);
2009
2010 if (present & (1 << PAN_VARY_PSIZ)) {
2011 primitive_size->pointer = panfrost_emit_varyings(batch,
2012 &varyings[pan_varying_index(present, PAN_VARY_PSIZ)],
2013 2, vertex_count);
2014 }
2015
2016 pan_emit_special_input(varyings, present, PAN_VARY_PNTCOORD, MALI_ATTRIBUTE_SPECIAL_POINT_COORD);
2017 pan_emit_special_input(varyings, present, PAN_VARY_FACE, MALI_ATTRIBUTE_SPECIAL_FRONT_FACING);
2018 pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD, MALI_ATTRIBUTE_SPECIAL_FRAG_COORD);
2019
2020 vertex_postfix->varyings = T.gpu;
2021 tiler_postfix->varyings = T.gpu;
2022
2023 vertex_postfix->varying_meta = trans.gpu;
2024 tiler_postfix->varying_meta = trans.gpu + vs_size;
2025 }
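
/* Putting the above together for a hypothetical configuration: with
 * present = GENERAL | POSITION and two stream-out targets, the attribute
 * buffer array at T holds [general, position, xfb0, xfb1], while the
 * per-stage varying records live at trans.gpu (vertex) and
 * trans.gpu + vs_size (fragment). */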
2026
2027 void
2028 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
2029 struct mali_vertex_tiler_prefix *vertex_prefix,
2030 struct mali_vertex_tiler_postfix *vertex_postfix,
2031 struct mali_vertex_tiler_prefix *tiler_prefix,
2032 struct mali_vertex_tiler_postfix *tiler_postfix,
2033 union midgard_primitive_size *primitive_size)
2034 {
2035 struct panfrost_context *ctx = batch->ctx;
2036 struct panfrost_device *device = pan_device(ctx->base.screen);
2037 bool wallpapering = ctx->wallpaper_batch && batch->scoreboard.tiler_dep;
2038 struct bifrost_payload_vertex bifrost_vertex = {0,};
2039 struct bifrost_payload_tiler bifrost_tiler = {0,};
2040 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
2041 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
2042 void *vp, *tp;
2043 size_t vp_size, tp_size;
2044
2045 if (device->quirks & IS_BIFROST) {
2046 bifrost_vertex.prefix = *vertex_prefix;
2047 bifrost_vertex.postfix = *vertex_postfix;
2048 vp = &bifrost_vertex;
2049 vp_size = sizeof(bifrost_vertex);
2050
2051 bifrost_tiler.prefix = *tiler_prefix;
2052 bifrost_tiler.tiler.primitive_size = *primitive_size;
2053 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
2054 bifrost_tiler.postfix = *tiler_postfix;
2055 tp = &bifrost_tiler;
2056 tp_size = sizeof(bifrost_tiler);
2057 } else {
2058 midgard_vertex.prefix = *vertex_prefix;
2059 midgard_vertex.postfix = *vertex_postfix;
2060 vp = &midgard_vertex;
2061 vp_size = sizeof(midgard_vertex);
2062
2063 midgard_tiler.prefix = *tiler_prefix;
2064 midgard_tiler.postfix = *tiler_postfix;
2065 midgard_tiler.primitive_size = *primitive_size;
2066 tp = &midgard_tiler;
2067 tp_size = sizeof(midgard_tiler);
2068 }
2069
2070 if (wallpapering) {
2071 /* Inject in reverse order, with "predicted" job indices.
2072 * THIS IS A HACK XXX */
2073 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false,
2074 batch->scoreboard.job_index + 2, tp, tp_size, true);
2075 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2076 vp, vp_size, true);
2077 return;
2078 }
2079
2080 /* If rasterizer discard is enabled, only submit the vertex job */
2081
2082 unsigned vertex = panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2083 vp, vp_size, false);
2084
2085 if (ctx->rasterizer->base.rasterizer_discard)
2086 return;
2087
2088 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false, vertex, tp, tp_size,
2089 false);
2090 }
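
/* In the common path, the vertex job is submitted first and its returned index
 * is passed as the tiler job's dependency, so tiling waits on vertex shading
 * for this draw; with rasterizer discard enabled, no tiler job is created. */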
2091
2092 /* TODO: stop hardcoding this */
2093 mali_ptr
2094 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2095 {
2096 uint16_t locations[] = {
2097 128, 128,
2098 0, 256,
2099 0, 256,
2100 0, 256,
2101 0, 256,
2102 0, 256,
2103 0, 256,
2104 0, 256,
2105 0, 256,
2106 0, 256,
2107 0, 256,
2108 0, 256,
2109 0, 256,
2110 0, 256,
2111 0, 256,
2112 0, 256,
2113 0, 256,
2114 0, 256,
2115 0, 256,
2116 0, 256,
2117 0, 256,
2118 0, 256,
2119 0, 256,
2120 0, 256,
2121 0, 256,
2122 0, 256,
2123 0, 256,
2124 0, 256,
2125 0, 256,
2126 0, 256,
2127 0, 256,
2128 0, 256,
2129 128, 128,
2130 0, 0,
2131 0, 0,
2132 0, 0,
2133 0, 0,
2134 0, 0,
2135 0, 0,
2136 0, 0,
2137 0, 0,
2138 0, 0,
2139 0, 0,
2140 0, 0,
2141 0, 0,
2142 0, 0,
2143 0, 0,
2144 0, 0,
2145 };
2146
2147 return panfrost_pool_upload(&batch->pool, locations, 96 * sizeof(uint16_t));
2148 }