mesa.git (f57d240748a5dd1fbe76cadc17975df0f9ac0010): src/gallium/drivers/panfrost/pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_pool.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 struct mali_shared_memory shared = {
62 .shared_workgroup_count = ~0,
63 };
64
65 if (batch->stack_size) {
66 struct panfrost_bo *stack =
67 panfrost_batch_get_scratchpad(batch, batch->stack_size,
68 dev->thread_tls_alloc,
69 dev->core_count);
70
71 shared.stack_shift = panfrost_get_stack_shift(batch->stack_size);
72 shared.scratchpad = stack->gpu;
73 }
74
75 postfix->shared_memory = panfrost_pool_upload_aligned(&batch->pool, &shared, sizeof(shared), 64);
76 }
77
78 static void
79 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
80 struct mali_vertex_tiler_postfix *postfix)
81 {
82 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
83 postfix->shared_memory = panfrost_batch_reserve_framebuffer(batch);
84 }
85
86 static void
87 panfrost_vt_update_rasterizer(struct panfrost_rasterizer *rasterizer,
88 struct mali_vertex_tiler_prefix *prefix,
89 struct mali_vertex_tiler_postfix *postfix)
90 {
91 postfix->gl_enables |= 0x7;
92 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
93 rasterizer->base.front_ccw);
94 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
95 (rasterizer->base.cull_face & PIPE_FACE_FRONT));
96 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
97 (rasterizer->base.cull_face & PIPE_FACE_BACK));
98 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
99 rasterizer->base.flatshade_first);
100 }
101
102 void
103 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
104 struct mali_vertex_tiler_prefix *prefix,
105 union midgard_primitive_size *primitive_size)
106 {
107 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
108
109 if (!panfrost_writes_point_size(ctx)) {
110 float val = (prefix->draw_mode == MALI_DRAW_MODE_POINTS) ?
111 rasterizer->base.point_size :
112 rasterizer->base.line_width;
113
114 primitive_size->constant = val;
115 }
116 }
117
118 static void
119 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
120 struct mali_vertex_tiler_postfix *postfix)
121 {
122 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
123 if (ctx->occlusion_query) {
124 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
125 panfrost_batch_add_bo(ctx->batch, ctx->occlusion_query->bo,
126 PAN_BO_ACCESS_SHARED |
127 PAN_BO_ACCESS_RW |
128 PAN_BO_ACCESS_FRAGMENT);
129 } else {
130 postfix->occlusion_counter = 0;
131 }
132 }
133
134 void
135 panfrost_vt_init(struct panfrost_context *ctx,
136 enum pipe_shader_type stage,
137 struct mali_vertex_tiler_prefix *prefix,
138 struct mali_vertex_tiler_postfix *postfix)
139 {
140 struct panfrost_device *device = pan_device(ctx->base.screen);
141
142 if (!ctx->shader[stage])
143 return;
144
145 memset(prefix, 0, sizeof(*prefix));
146 memset(postfix, 0, sizeof(*postfix));
147
148 if (device->quirks & IS_BIFROST) {
149 postfix->gl_enables = 0x2;
150 panfrost_vt_emit_shared_memory(ctx, postfix);
151 } else {
152 postfix->gl_enables = 0x6;
153 panfrost_vt_attach_framebuffer(ctx, postfix);
154 }
155
156 if (stage == PIPE_SHADER_FRAGMENT) {
157 panfrost_vt_update_occlusion_query(ctx, postfix);
158 panfrost_vt_update_rasterizer(ctx->rasterizer, prefix, postfix);
159 }
160 }
161
162 static unsigned
163 panfrost_translate_index_size(unsigned size)
164 {
165 switch (size) {
166 case 1:
167 return MALI_DRAW_INDEXED_UINT8;
168
169 case 2:
170 return MALI_DRAW_INDEXED_UINT16;
171
172 case 4:
173 return MALI_DRAW_INDEXED_UINT32;
174
175 default:
176 unreachable("Invalid index size");
177 }
178 }
179
180 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
181  * good for the duration of the draw (transient), though it may last longer. Also get
182 * the bounds on the index buffer for the range accessed by the draw. We do
183 * these operations together because there are natural optimizations which
184 * require them to be together. */
185
186 static mali_ptr
187 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
188 const struct pipe_draw_info *info,
189 unsigned *min_index, unsigned *max_index)
190 {
191 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
192 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
193 off_t offset = info->start * info->index_size;
194 bool needs_indices = true;
195 mali_ptr out = 0;
196
197 if (info->max_index != ~0u) {
198 *min_index = info->min_index;
199 *max_index = info->max_index;
200 needs_indices = false;
201 }
202
203 if (!info->has_user_indices) {
204 /* Only resources can be directly mapped */
205 panfrost_batch_add_bo(batch, rsrc->bo,
206 PAN_BO_ACCESS_SHARED |
207 PAN_BO_ACCESS_READ |
208 PAN_BO_ACCESS_VERTEX_TILER);
209 out = rsrc->bo->gpu + offset;
210
211 /* Check the cache */
212 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
213 info->start,
214 info->count,
215 min_index,
216 max_index);
217 } else {
218 /* Otherwise, we need to upload to transient memory */
219 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
220 struct panfrost_transfer T =
221 panfrost_pool_alloc_aligned(&batch->pool,
222 info->count * info->index_size,
223 info->index_size);
224
225 memcpy(T.cpu, ibuf8 + offset, info->count * info->index_size);
226 out = T.gpu;
227 }
228
229 if (needs_indices) {
230 /* Fallback */
231 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
232
233 if (!info->has_user_indices)
234 panfrost_minmax_cache_add(rsrc->index_cache,
235 info->start, info->count,
236 *min_index, *max_index);
237 }
238
239 return out;
240 }
241
242 void
243 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
244 const struct pipe_draw_info *info,
245 enum mali_draw_mode draw_mode,
246 struct mali_vertex_tiler_postfix *vertex_postfix,
247 struct mali_vertex_tiler_prefix *tiler_prefix,
248 struct mali_vertex_tiler_postfix *tiler_postfix,
249 unsigned *vertex_count,
250 unsigned *padded_count)
251 {
252 tiler_prefix->draw_mode = draw_mode;
253
254 unsigned draw_flags = 0;
255
256 if (panfrost_writes_point_size(ctx))
257 draw_flags |= MALI_DRAW_VARYING_SIZE;
258
259 if (info->primitive_restart)
260 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
261
262 /* These bits don't make much sense */
263
264 draw_flags |= 0x3000;
265
266 if (info->index_size) {
267 unsigned min_index = 0, max_index = 0;
268
269 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
270 info,
271 &min_index,
272 &max_index);
273
274 /* Use the corresponding values */
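/* As far as this code is concerned (inferred from the assignments below,
 * not from hardware documentation): only the [min_index, max_index] slice
 * of vertices is processed, offset_start rebases the draw by min_index
 * plus the index bias, and offset_bias_correction (simply -min_index)
 * compensates so fetched index values still reference the right
 * attributes. */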
275 *vertex_count = max_index - min_index + 1;
276 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
277 tiler_prefix->offset_bias_correction = -min_index;
278 tiler_prefix->index_count = MALI_POSITIVE(info->count);
279 draw_flags |= panfrost_translate_index_size(info->index_size);
280 } else {
281 tiler_prefix->indices = 0;
282 *vertex_count = ctx->vertex_count;
283 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
284 tiler_prefix->offset_bias_correction = 0;
285 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
286 }
287
288 tiler_prefix->unknown_draw = draw_flags;
289
290 /* Encode the padded vertex count */
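/* instance_shift/instance_odd encode the padded vertex count as
 * padded = (2k + 1) << shift, i.e. an odd factor times a power of two,
 * with shift = ctz(padded) and k = padded >> (shift + 1). Worked example:
 * padded = 12 = 0b1100 gives shift = 2, k = 1, and (2*1 + 1) << 2 = 12. */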
291
292 if (info->instance_count > 1) {
293 *padded_count = panfrost_padded_vertex_count(*vertex_count);
294
295 unsigned shift = __builtin_ctz(ctx->padded_count);
296 unsigned k = ctx->padded_count >> (shift + 1);
297
298 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
299 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
300 } else {
301 *padded_count = *vertex_count;
302
303 /* Reset instancing state */
304 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
305 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
306 }
307 }
308
309 static void
310 panfrost_shader_meta_init(struct panfrost_context *ctx,
311 enum pipe_shader_type st,
312 struct mali_shader_meta *meta)
313 {
314 const struct panfrost_device *dev = pan_device(ctx->base.screen);
315 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
316
317 memset(meta, 0, sizeof(*meta));
318 meta->shader = (ss->bo ? ss->bo->gpu : 0) | ss->first_tag;
319 meta->attribute_count = ss->attribute_count;
320 meta->varying_count = ss->varying_count;
321 meta->texture_count = ctx->sampler_view_count[st];
322 meta->sampler_count = ctx->sampler_count[st];
323
324 if (dev->quirks & IS_BIFROST) {
325 if (st == PIPE_SHADER_VERTEX)
326 meta->bifrost1.unk1 = 0x800000;
327 else {
328 /* First clause ATEST |= 0x4000000.
329 * Less than 32 regs |= 0x200 */
330 meta->bifrost1.unk1 = 0x950020;
331 }
332
333 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
334 if (st == PIPE_SHADER_VERTEX)
335 meta->bifrost2.preload_regs = 0xC0;
336 else {
337 meta->bifrost2.preload_regs = 0x1;
338 SET_BIT(meta->bifrost2.preload_regs, 0x10, ss->reads_frag_coord);
339 }
340
341 meta->bifrost2.uniform_count = MIN2(ss->uniform_count,
342 ss->uniform_cutoff);
343 } else {
344 meta->midgard1.uniform_count = MIN2(ss->uniform_count,
345 ss->uniform_cutoff);
346 meta->midgard1.work_count = ss->work_reg_count;
347
348 /* TODO: This is not conformant on ES3 */
349 meta->midgard1.flags_hi = MALI_SUPPRESS_INF_NAN;
350
351 meta->midgard1.flags_lo = 0x20;
352 meta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
353
354 SET_BIT(meta->midgard1.flags_lo, MALI_WRITES_GLOBAL, ss->writes_global);
355 }
356 }
357
358 static unsigned
359 translate_tex_wrap(enum pipe_tex_wrap w)
360 {
361 switch (w) {
362 case PIPE_TEX_WRAP_REPEAT: return MALI_WRAP_MODE_REPEAT;
363 case PIPE_TEX_WRAP_CLAMP: return MALI_WRAP_MODE_CLAMP;
364 case PIPE_TEX_WRAP_CLAMP_TO_EDGE: return MALI_WRAP_MODE_CLAMP_TO_EDGE;
365 case PIPE_TEX_WRAP_CLAMP_TO_BORDER: return MALI_WRAP_MODE_CLAMP_TO_BORDER;
366 case PIPE_TEX_WRAP_MIRROR_REPEAT: return MALI_WRAP_MODE_MIRRORED_REPEAT;
367 case PIPE_TEX_WRAP_MIRROR_CLAMP: return MALI_WRAP_MODE_MIRRORED_CLAMP;
368 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE;
369 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER;
370 default: unreachable("Invalid wrap");
371 }
372 }
373
374 /* The hardware compares in the wrong order, so we have to flip before
375 * encoding. Yes, really. */
376
377 static enum mali_func
378 panfrost_sampler_compare_func(const struct pipe_sampler_state *cso)
379 {
380 if (!cso->compare_mode)
381 return MALI_FUNC_NEVER;
382
383 enum mali_func f = panfrost_translate_compare_func(cso->compare_func);
384 return panfrost_flip_compare_func(f);
385 }
386
387 static enum mali_mipmap_mode
388 pan_pipe_to_mipmode(enum pipe_tex_mipfilter f)
389 {
390 switch (f) {
391 case PIPE_TEX_MIPFILTER_NEAREST: return MALI_MIPMAP_MODE_NEAREST;
392 case PIPE_TEX_MIPFILTER_LINEAR: return MALI_MIPMAP_MODE_TRILINEAR;
393 case PIPE_TEX_MIPFILTER_NONE: return MALI_MIPMAP_MODE_NONE;
394 default: unreachable("Invalid");
395 }
396 }
397
398 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
399 struct mali_midgard_sampler_packed *hw)
400 {
401 pan_pack(hw, MIDGARD_SAMPLER, cfg) {
402 cfg.magnify_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
403 cfg.minify_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
404 cfg.mipmap_mode = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR) ?
405 MALI_MIPMAP_MODE_TRILINEAR : MALI_MIPMAP_MODE_NEAREST;
406 cfg.normalized_coordinates = cso->normalized_coords;
407
408 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
409
410 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
411
412 /* If necessary, we disable mipmapping in the sampler descriptor by
413  * clamping the LOD range as tight as possible (to one step above the
414  * minimum LOD, essentially -- remember these are fixed-point numbers,
415  * so that step is 1/256) */
416
417 cfg.maximum_lod = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) ?
418 cfg.minimum_lod + 1 :
419 FIXED_16(cso->max_lod, false);
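/* Worked example, assuming FIXED_16 uses 8 fractional bits (consistent
 * with the 1/256 epsilon mentioned above): min_lod = 2.0 packs to 512,
 * and with MIPFILTER_NONE the maximum becomes 513, i.e. 2.00390625. */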
420
421 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
422 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
423 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
424
425 cfg.compare_function = panfrost_sampler_compare_func(cso);
426 cfg.seamless_cube_map = cso->seamless_cube_map;
427
428 cfg.border_color_r = cso->border_color.f[0];
429 cfg.border_color_g = cso->border_color.f[1];
430 cfg.border_color_b = cso->border_color.f[2];
431 cfg.border_color_a = cso->border_color.f[3];
432 }
433 }
434
435 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
436 struct mali_bifrost_sampler_packed *hw)
437 {
438 pan_pack(hw, BIFROST_SAMPLER, cfg) {
439 cfg.magnify_linear = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR;
440 cfg.minify_linear = cso->min_img_filter == PIPE_TEX_FILTER_LINEAR;
441 cfg.mipmap_mode = pan_pipe_to_mipmode(cso->min_mip_filter);
442 cfg.normalized_coordinates = cso->normalized_coords;
443
444 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
445 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
446 cfg.maximum_lod = FIXED_16(cso->max_lod, false);
447
448 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
449 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
450 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
451
452 cfg.compare_function = panfrost_sampler_compare_func(cso);
453 cfg.seamless_cube_map = cso->seamless_cube_map;
454 }
455 }
456
457 static void
458 panfrost_frag_meta_rasterizer_update(struct panfrost_context *ctx,
459 struct mali_shader_meta *fragmeta)
460 {
461 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
462
463 bool msaa = rast->multisample;
464
465 /* TODO: Sample size */
466 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
467 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
468
469 struct panfrost_shader_state *fs;
470 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
471
472 /* EXT_shader_framebuffer_fetch requires the shader to be run
473 * per-sample when outputs are read. */
474 bool per_sample = ctx->min_samples > 1 || fs->outputs_read;
475 SET_BIT(fragmeta->unknown2_3, MALI_PER_SAMPLE, msaa && per_sample);
476
477 fragmeta->depth_units = rast->offset_units * 2.0f;
478 fragmeta->depth_factor = rast->offset_scale;
479
480 /* XXX: Which bit is which? Does this maybe allow offsetting not-tri? */
481
482 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, rast->offset_tri);
483 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, rast->offset_tri);
484
485 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_NEAR, rast->depth_clip_near);
486 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_FAR, rast->depth_clip_far);
487 }
488
489 static void
490 panfrost_frag_meta_zsa_update(struct panfrost_context *ctx,
491 struct mali_shader_meta *fragmeta)
492 {
493 const struct panfrost_zsa_state *so = ctx->depth_stencil;
494
495 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
496 so->base.stencil[0].enabled);
497
498 fragmeta->stencil_mask_front = so->stencil_mask_front;
499 fragmeta->stencil_mask_back = so->stencil_mask_back;
500
501 /* Bottom bits for stencil ref, exactly one word */
502 fragmeta->stencil_front.opaque[0] = so->stencil_front.opaque[0] | ctx->stencil_ref.ref_value[0];
503
504 /* If back-stencil is not enabled, use the front values */
505
506 if (so->base.stencil[1].enabled)
507 fragmeta->stencil_back.opaque[0] = so->stencil_back.opaque[0] | ctx->stencil_ref.ref_value[1];
508 else
509 fragmeta->stencil_back = fragmeta->stencil_front;
510
511 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
512 so->base.depth.writemask);
513
514 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
515 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(
516 so->base.depth.enabled ? so->base.depth.func : PIPE_FUNC_ALWAYS));
517 }
518
519 static bool
520 panfrost_fs_required(
521 struct panfrost_shader_state *fs,
522 struct panfrost_blend_final *blend,
523 unsigned rt_count)
524 {
525 /* If the shader has side effects, we must execute it */
526 if (fs->fs_sidefx)
527 return true;
528
529 /* If colour is written we need to execute */
530 for (unsigned i = 0; i < rt_count; ++i) {
531 if (!blend[i].no_colour)
532 return true;
533 }
534
535 /* If depth is written and not implied we need to execute.
536 * TODO: Predicate on Z/S writes being enabled */
537 return (fs->writes_depth || fs->writes_stencil);
538 }
539
540 static void
541 panfrost_frag_meta_blend_update(struct panfrost_context *ctx,
542 struct mali_shader_meta *fragmeta,
543 struct panfrost_blend_final *blend)
544 {
545 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
546 const struct panfrost_device *dev = pan_device(ctx->base.screen);
547 struct panfrost_shader_state *fs;
548 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
549
550 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
551 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
552 !ctx->blend->base.dither);
553
554 SET_BIT(fragmeta->unknown2_4, MALI_ALPHA_TO_COVERAGE,
555 ctx->blend->base.alpha_to_coverage);
556
557 /* Get blending setup */
558 unsigned rt_count = ctx->pipe_framebuffer.nr_cbufs;
559
560 /* Disable shader execution if we can */
561 if (dev->quirks & MIDGARD_SHADERLESS
562 && !panfrost_fs_required(fs, blend, rt_count)) {
563 fragmeta->shader = 0;
564 fragmeta->attribute_count = 0;
565 fragmeta->varying_count = 0;
566 fragmeta->texture_count = 0;
567 fragmeta->sampler_count = 0;
568
569 /* This feature is not known to work on Bifrost */
570 fragmeta->midgard1.work_count = 1;
571 fragmeta->midgard1.uniform_count = 0;
572 fragmeta->midgard1.uniform_buffer_count = 0;
573 }
574
575 /* If there is a blend shader, work registers are shared. We impose 8
576 * work registers as a limit for blend shaders. Should be lower XXX */
577
578 if (!(dev->quirks & IS_BIFROST)) {
579 for (unsigned c = 0; c < rt_count; ++c) {
580 if (blend[c].is_shader) {
581 fragmeta->midgard1.work_count =
582 MAX2(fragmeta->midgard1.work_count, 8);
583 }
584 }
585 }
586
587 /* Even on MFBD, the shader descriptor gets blend shaders. It's *also*
588 * copied to the blend_meta appended (by convention), but this is the
589 * field actually read by the hardware. (Or maybe both are read...?).
590 * Specify the last RTi with a blend shader. */
591
592 fragmeta->blend.shader = 0;
593
594 for (signed rt = ((signed) rt_count - 1); rt >= 0; --rt) {
595 if (!blend[rt].is_shader)
596 continue;
597
598 fragmeta->blend.shader = blend[rt].shader.gpu |
599 blend[rt].shader.first_tag;
600 break;
601 }
602
603 if (dev->quirks & MIDGARD_SFBD) {
604 /* On single render target (SFBD) platforms, the blend
605 * information is inside the shader meta itself. We additionally
606 * need to signal CAN_DISCARD for nontrivial blend modes (so
607 * we're able to read back the destination buffer) */
608
609 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
610 blend[0].is_shader);
611
612 if (!blend[0].is_shader) {
613 fragmeta->blend.equation = *blend[0].equation.equation;
614 fragmeta->blend.constant = blend[0].equation.constant;
615 }
616
617 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
618 !blend[0].no_blending || fs->can_discard);
619
620 batch->draws |= PIPE_CLEAR_COLOR0;
621 return;
622 }
623
624 if (dev->quirks & IS_BIFROST) {
625 bool no_blend = true;
626
627 for (unsigned i = 0; i < rt_count; ++i)
628 no_blend &= (blend[i].no_blending | blend[i].no_colour);
629
630 SET_BIT(fragmeta->bifrost1.unk1, MALI_BIFROST_EARLY_Z,
631 !fs->can_discard && !fs->writes_depth && no_blend);
632 }
633 }
634
635 static void
636 panfrost_emit_blend(struct panfrost_batch *batch, void *rts,
637 struct panfrost_blend_final *blend)
638 {
639 const struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
640 struct panfrost_shader_state *fs = panfrost_get_shader_state(batch->ctx, PIPE_SHADER_FRAGMENT);
641 unsigned rt_count = batch->key.nr_cbufs;
642
643 struct bifrost_blend_rt *brts = rts;
644 struct midgard_blend_rt *mrts = rts;
645
646 /* Disable blending for depth-only on Bifrost */
647
648 if (rt_count == 0 && dev->quirks & IS_BIFROST)
649 brts[0].unk2 = 0x3;
650
651 for (unsigned i = 0; i < rt_count; ++i) {
652 unsigned flags = 0;
653
654 pan_pack(&flags, BLEND_FLAGS, cfg) {
655 if (blend[i].no_colour)
656 break;
657
658 batch->draws |= (PIPE_CLEAR_COLOR0 << i);
659
660 cfg.srgb = util_format_is_srgb(batch->key.cbufs[i]->format);
661 cfg.load_destination = !blend[i].no_blending; /* XXX */
662 cfg.dither_disable = !batch->ctx->blend->base.dither;
663
664 if (!(dev->quirks & IS_BIFROST))
665 cfg.midgard_blend_shader = blend[i].is_shader;
666 }
667
668 if (dev->quirks & IS_BIFROST) {
669 brts[i].flags = flags;
670
671 if (blend[i].is_shader) {
672 /* The blend shader's address needs to be at
673 * the same top 32 bit as the fragment shader.
674 * TODO: Ensure that's always the case.
675 */
676 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
677 (fs->bo->gpu & (0xffffffffull << 32)));
678 brts[i].shader = blend[i].shader.gpu;
679 brts[i].unk2 = 0x0;
680 } else {
681 enum pipe_format format = batch->key.cbufs[i]->format;
682 const struct util_format_description *format_desc;
683 format_desc = util_format_description(format);
684
685 brts[i].equation = *blend[i].equation.equation;
686
687 /* TODO: this is a bit more complicated */
688 brts[i].constant = blend[i].equation.constant;
689
690 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
691
692 /* 0x19 disables blending and forces REPLACE
693 * mode (equivalent to rgb_mode = alpha_mode =
694  * 0x122, colour mask = 0xF). 0x1a allows
695 * blending. */
696 brts[i].unk2 = blend[i].no_blending ? 0x19 : 0x1a;
697
698 brts[i].shader_type = fs->blend_types[i];
699 }
700 } else {
701 memcpy(&mrts[i].flags, &flags, sizeof(flags));
702
703 if (blend[i].is_shader) {
704 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
705 } else {
706 mrts[i].blend.equation = *blend[i].equation.equation;
707 mrts[i].blend.constant = blend[i].equation.constant;
708 }
709 }
710 }
711 }
712
713 static void
714 panfrost_frag_shader_meta_init(struct panfrost_context *ctx,
715 struct mali_shader_meta *fragmeta,
716 struct panfrost_blend_final *blend)
717 {
718 const struct panfrost_device *dev = pan_device(ctx->base.screen);
719 struct panfrost_shader_state *fs;
720
721 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
722
723 bool msaa = ctx->rasterizer->base.multisample;
724 fragmeta->coverage_mask = msaa ? ctx->sample_mask : ~0;
725
726 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x10;
727 fragmeta->unknown2_4 = 0x4e0;
728
729 /* unknown2_4 has 0x10 bit set on T6XX and T720. We don't know why this
730 * is required (independent of 32-bit/64-bit descriptors), or why it's
731 * not used on later GPU revisions. Otherwise, all shader jobs fault on
732 * these earlier chips (perhaps this is a chicken bit of some kind).
733 * More investigation is needed. */
734
735 SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
736
737 if (dev->quirks & IS_BIFROST) {
738 /* TODO */
739 } else {
740 /* Depending on whether it's legal in the given shader, we try to
741 * enable early-z testing. TODO: respect e-z force */
742
743 SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
744 !fs->can_discard && !fs->writes_global &&
745 !fs->writes_depth && !fs->writes_stencil &&
746 !ctx->blend->base.alpha_to_coverage);
747
748 /* Add the writes Z/S flags if needed. */
749 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
750 SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
751
752 /* Any time texturing is used, derivatives are implicitly calculated,
753 * so we need to enable helper invocations */
754
755 SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
756 fs->helper_invocations);
757
758 /* If discard is enabled, which bit we set to convey this
759 * depends on if depth/stencil is used for the draw or not.
760 * Just one of depth OR stencil is enough to trigger this. */
761
762 const struct pipe_depth_stencil_alpha_state *zsa = &ctx->depth_stencil->base;
763 bool zs_enabled =
764 fs->writes_depth || fs->writes_stencil ||
765 (zsa->depth.enabled && zsa->depth.func != PIPE_FUNC_ALWAYS) ||
766 zsa->stencil[0].enabled;
767
768 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_TILEBUFFER,
769 fs->outputs_read || (!zs_enabled && fs->can_discard));
770 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_ZS, zs_enabled && fs->can_discard);
771 }
772
773 panfrost_frag_meta_rasterizer_update(ctx, fragmeta);
774 panfrost_frag_meta_zsa_update(ctx, fragmeta);
775 panfrost_frag_meta_blend_update(ctx, fragmeta, blend);
776 }
777
778 void
779 panfrost_emit_shader_meta(struct panfrost_batch *batch,
780 enum pipe_shader_type st,
781 struct mali_vertex_tiler_postfix *postfix)
782 {
783 struct panfrost_context *ctx = batch->ctx;
784 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
785
786 if (!ss) {
787 postfix->shader = 0;
788 return;
789 }
790
791 struct mali_shader_meta meta;
792
793 panfrost_shader_meta_init(ctx, st, &meta);
794
795 /* Add the shader BO to the batch. */
796 panfrost_batch_add_bo(batch, ss->bo,
797 PAN_BO_ACCESS_PRIVATE |
798 PAN_BO_ACCESS_READ |
799 panfrost_bo_access_for_stage(st));
800
801 mali_ptr shader_ptr;
802
803 if (st == PIPE_SHADER_FRAGMENT) {
804 struct panfrost_device *dev = pan_device(ctx->base.screen);
805 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
806 size_t desc_size = sizeof(meta);
807 void *rts = NULL;
808 struct panfrost_transfer xfer;
809 unsigned rt_size;
810
811 if (dev->quirks & MIDGARD_SFBD)
812 rt_size = 0;
813 else if (dev->quirks & IS_BIFROST)
814 rt_size = sizeof(struct bifrost_blend_rt);
815 else
816 rt_size = sizeof(struct midgard_blend_rt);
817
818 desc_size += rt_size * rt_count;
819
820 if (rt_size)
821 rts = rzalloc_size(ctx, rt_size * rt_count);
822
823 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
824
825 for (unsigned c = 0; c < ctx->pipe_framebuffer.nr_cbufs; ++c)
826 blend[c] = panfrost_get_blend_for_context(ctx, c);
827
828 panfrost_frag_shader_meta_init(ctx, &meta, blend);
829
830 if (!(dev->quirks & MIDGARD_SFBD))
831 panfrost_emit_blend(batch, rts, blend);
832
833 xfer = panfrost_pool_alloc_aligned(&batch->pool, desc_size, sizeof(meta));
834
835 memcpy(xfer.cpu, &meta, sizeof(meta));
836 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
837
838 if (rt_size)
839 ralloc_free(rts);
840
841 shader_ptr = xfer.gpu;
842 } else {
843 shader_ptr = panfrost_pool_upload(&batch->pool, &meta,
844 sizeof(meta));
845 }
846
847 postfix->shader = shader_ptr;
848 }
849
850 void
851 panfrost_emit_viewport(struct panfrost_batch *batch,
852 struct mali_vertex_tiler_postfix *tiler_postfix)
853 {
854 struct panfrost_context *ctx = batch->ctx;
855 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
856 const struct pipe_scissor_state *ss = &ctx->scissor;
857 const struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
858 const struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer;
859
860 /* Derive min/max from translate/scale. Note since |x| >= 0 by
861 * definition, we have that -|x| <= |x| hence translate - |scale| <=
862 * translate + |scale|, so the ordering is correct here. */
863 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
864 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
865 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
866 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
867 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
868 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
869
870 /* Scissor to the intersection of the viewport and the scissor, clamped
871 * to the framebuffer */
872
873 unsigned minx = MIN2(fb->width, vp_minx);
874 unsigned maxx = MIN2(fb->width, vp_maxx);
875 unsigned miny = MIN2(fb->height, vp_miny);
876 unsigned maxy = MIN2(fb->height, vp_maxy);
877
878 if (ss && rast->scissor) {
879 minx = MAX2(ss->minx, minx);
880 miny = MAX2(ss->miny, miny);
881 maxx = MIN2(ss->maxx, maxx);
882 maxy = MIN2(ss->maxy, maxy);
883 }
884
885 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool, MALI_VIEWPORT_LENGTH);
886
887 pan_pack(T.cpu, VIEWPORT, cfg) {
888 cfg.scissor_minimum_x = minx;
889 cfg.scissor_minimum_y = miny;
890 cfg.scissor_maximum_x = maxx - 1;
891 cfg.scissor_maximum_y = maxy - 1;
892
893 cfg.minimum_z = rast->depth_clip_near ? minz : -INFINITY;
894 cfg.maximum_z = rast->depth_clip_far ? maxz : INFINITY;
895 }
896
897 tiler_postfix->viewport = T.gpu;
898 panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy);
899 }
900
901 static mali_ptr
902 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
903 enum pipe_shader_type st,
904 struct panfrost_constant_buffer *buf,
905 unsigned index)
906 {
907 struct pipe_constant_buffer *cb = &buf->cb[index];
908 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
909
910 if (rsrc) {
911 panfrost_batch_add_bo(batch, rsrc->bo,
912 PAN_BO_ACCESS_SHARED |
913 PAN_BO_ACCESS_READ |
914 panfrost_bo_access_for_stage(st));
915
916 /* Alignment guaranteed by
917 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
918 return rsrc->bo->gpu + cb->buffer_offset;
919 } else if (cb->user_buffer) {
920 return panfrost_pool_upload_aligned(&batch->pool,
921 cb->user_buffer +
922 cb->buffer_offset,
923 cb->buffer_size, 16);
924 } else {
925 unreachable("No constant buffer");
926 }
927 }
928
929 struct sysval_uniform {
930 union {
931 float f[4];
932 int32_t i[4];
933 uint32_t u[4];
934 uint64_t du[2];
935 };
936 };
937
938 static void
939 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
940 struct sysval_uniform *uniform)
941 {
942 struct panfrost_context *ctx = batch->ctx;
943 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
944
945 uniform->f[0] = vp->scale[0];
946 uniform->f[1] = vp->scale[1];
947 uniform->f[2] = vp->scale[2];
948 }
949
950 static void
951 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
952 struct sysval_uniform *uniform)
953 {
954 struct panfrost_context *ctx = batch->ctx;
955 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
956
957 uniform->f[0] = vp->translate[0];
958 uniform->f[1] = vp->translate[1];
959 uniform->f[2] = vp->translate[2];
960 }
961
962 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
963 enum pipe_shader_type st,
964 unsigned int sysvalid,
965 struct sysval_uniform *uniform)
966 {
967 struct panfrost_context *ctx = batch->ctx;
968 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
969 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
970 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
971 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
972
973 assert(dim);
974 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
975
976 if (dim > 1)
977 uniform->i[1] = u_minify(tex->texture->height0,
978 tex->u.tex.first_level);
979
980 if (dim > 2)
981 uniform->i[2] = u_minify(tex->texture->depth0,
982 tex->u.tex.first_level);
983
984 if (is_array)
985 uniform->i[dim] = tex->texture->array_size;
986 }
987
988 static void
989 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
990 enum pipe_shader_type st,
991 unsigned ssbo_id,
992 struct sysval_uniform *uniform)
993 {
994 struct panfrost_context *ctx = batch->ctx;
995
996 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
997 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
998
999 /* Compute address */
1000 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
1001
1002 panfrost_batch_add_bo(batch, bo,
1003 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
1004 panfrost_bo_access_for_stage(st));
1005
1006 /* Upload address and size as sysval */
1007 uniform->du[0] = bo->gpu + sb.buffer_offset;
1008 uniform->u[2] = sb.buffer_size;
1009 }
1010
1011 static void
1012 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1013 enum pipe_shader_type st,
1014 unsigned samp_idx,
1015 struct sysval_uniform *uniform)
1016 {
1017 struct panfrost_context *ctx = batch->ctx;
1018 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1019
1020 uniform->f[0] = sampl->min_lod;
1021 uniform->f[1] = sampl->max_lod;
1022 uniform->f[2] = sampl->lod_bias;
1023
1024 /* Even without any errata, Midgard represents "no mipmapping" as
1025 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1026 * panfrost_create_sampler_state which also explains our choice of
1027 * epsilon value (again to keep behaviour consistent) */
1028
1029 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1030 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1031 }
1032
1033 static void
1034 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1035 struct sysval_uniform *uniform)
1036 {
1037 struct panfrost_context *ctx = batch->ctx;
1038
1039 uniform->u[0] = ctx->compute_grid->grid[0];
1040 uniform->u[1] = ctx->compute_grid->grid[1];
1041 uniform->u[2] = ctx->compute_grid->grid[2];
1042 }
1043
1044 static void
1045 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1046 struct panfrost_shader_state *ss,
1047 enum pipe_shader_type st)
1048 {
1049 struct sysval_uniform *uniforms = (void *)buf;
1050
1051 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1052 int sysval = ss->sysval[i];
1053
1054 switch (PAN_SYSVAL_TYPE(sysval)) {
1055 case PAN_SYSVAL_VIEWPORT_SCALE:
1056 panfrost_upload_viewport_scale_sysval(batch,
1057 &uniforms[i]);
1058 break;
1059 case PAN_SYSVAL_VIEWPORT_OFFSET:
1060 panfrost_upload_viewport_offset_sysval(batch,
1061 &uniforms[i]);
1062 break;
1063 case PAN_SYSVAL_TEXTURE_SIZE:
1064 panfrost_upload_txs_sysval(batch, st,
1065 PAN_SYSVAL_ID(sysval),
1066 &uniforms[i]);
1067 break;
1068 case PAN_SYSVAL_SSBO:
1069 panfrost_upload_ssbo_sysval(batch, st,
1070 PAN_SYSVAL_ID(sysval),
1071 &uniforms[i]);
1072 break;
1073 case PAN_SYSVAL_NUM_WORK_GROUPS:
1074 panfrost_upload_num_work_groups_sysval(batch,
1075 &uniforms[i]);
1076 break;
1077 case PAN_SYSVAL_SAMPLER:
1078 panfrost_upload_sampler_sysval(batch, st,
1079 PAN_SYSVAL_ID(sysval),
1080 &uniforms[i]);
1081 break;
1082 default:
1083 assert(0);
1084 }
1085 }
1086 }
1087
1088 static const void *
1089 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1090 unsigned index)
1091 {
1092 struct pipe_constant_buffer *cb = &buf->cb[index];
1093 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1094
1095 if (rsrc)
1096 return rsrc->bo->cpu;
1097 else if (cb->user_buffer)
1098 return cb->user_buffer;
1099 else
1100 unreachable("No constant buffer");
1101 }
1102
1103 void
1104 panfrost_emit_const_buf(struct panfrost_batch *batch,
1105 enum pipe_shader_type stage,
1106 struct mali_vertex_tiler_postfix *postfix)
1107 {
1108 struct panfrost_context *ctx = batch->ctx;
1109 struct panfrost_shader_variants *all = ctx->shader[stage];
1110
1111 if (!all)
1112 return;
1113
1114 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1115
1116 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1117
1118 /* Uniforms are implicitly UBO #0 */
1119 bool has_uniforms = buf->enabled_mask & (1 << 0);
1120
1121 /* Allocate room for the sysval and the uniforms */
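/* Layout of the resulting buffer is [ sysvals | user uniforms ]: sysvals
 * are written at offset 0 and the uniforms are memcpy'd in at sys_size
 * below. */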
1122 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1123 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1124 size_t size = sys_size + uniform_size;
1125 struct panfrost_transfer transfer =
1126 panfrost_pool_alloc_aligned(&batch->pool, size, 16);
1127
1128 /* Upload sysvals requested by the shader */
1129 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1130
1131 /* Upload uniforms */
1132 if (has_uniforms && uniform_size) {
1133 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1134 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1135 }
1136
1137 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1138 * uploaded */
1139
1140 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1141 assert(ubo_count >= 1);
1142
1143 size_t sz = MALI_UNIFORM_BUFFER_LENGTH * ubo_count;
1144 struct panfrost_transfer ubos =
1145 panfrost_pool_alloc_aligned(&batch->pool, sz,
1146 MALI_UNIFORM_BUFFER_LENGTH);
1147
1148 uint64_t *ubo_ptr = (uint64_t *) ubos.cpu;
1149
1150 /* Upload uniforms as a UBO */
1151
1152 if (ss->uniform_count) {
1153 pan_pack(ubo_ptr, UNIFORM_BUFFER, cfg) {
1154 cfg.entries = ss->uniform_count;
1155 cfg.pointer = transfer.gpu;
1156 }
1157 } else {
1158 *ubo_ptr = 0;
1159 }
1160
1161 /* The rest are honest-to-goodness UBOs */
1162
1163 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1164 size_t usz = buf->cb[ubo].buffer_size;
1165 bool enabled = buf->enabled_mask & (1 << ubo);
1166 bool empty = usz == 0;
1167
1168 if (!enabled || empty) {
1169 ubo_ptr[ubo] = 0;
1170 continue;
1171 }
1172
1173 pan_pack(ubo_ptr + ubo, UNIFORM_BUFFER, cfg) {
1174 cfg.entries = DIV_ROUND_UP(usz, 16);
1175 cfg.pointer = panfrost_map_constant_buffer_gpu(batch,
1176 stage, buf, ubo);
1177 }
1178 }
1179
1180 postfix->uniforms = transfer.gpu;
1181 postfix->uniform_buffers = ubos.gpu;
1182
1183 buf->dirty_mask = 0;
1184 }
1185
1186 void
1187 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1188 const struct pipe_grid_info *info,
1189 struct midgard_payload_vertex_tiler *vtp)
1190 {
1191 struct panfrost_context *ctx = batch->ctx;
1192 struct panfrost_device *dev = pan_device(ctx->base.screen);
1193 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1194 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1195 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1196 128));
1197
1198 unsigned log2_instances =
1199 util_logbase2_ceil(info->grid[0]) +
1200 util_logbase2_ceil(info->grid[1]) +
1201 util_logbase2_ceil(info->grid[2]);
1202
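/* Shared memory is sized as (per-workgroup size rounded up to a power of
 * two, minimum 128 bytes) * (a power-of-two upper bound on the number of
 * workgroups in the grid) * (core count). The power-of-two rounding is
 * what the code does; the hardware-side rationale is not documented here. */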
1203 unsigned shared_size = single_size * (1 << log2_instances) * dev->core_count;
1204 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1205 shared_size,
1206 1);
1207
1208 struct mali_shared_memory shared = {
1209 .shared_memory = bo->gpu,
1210 .shared_workgroup_count = log2_instances,
1211 .shared_shift = util_logbase2(single_size) + 1
1212 };
1213
1214 vtp->postfix.shared_memory = panfrost_pool_upload_aligned(&batch->pool, &shared,
1215 sizeof(shared), 64);
1216 }
1217
1218 static mali_ptr
1219 panfrost_get_tex_desc(struct panfrost_batch *batch,
1220 enum pipe_shader_type st,
1221 struct panfrost_sampler_view *view)
1222 {
1223 if (!view)
1224 return (mali_ptr) 0;
1225
1226 struct pipe_sampler_view *pview = &view->base;
1227 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1228
1229 /* Add the BO to the job so it's retained until the job is done. */
1230
1231 panfrost_batch_add_bo(batch, rsrc->bo,
1232 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1233 panfrost_bo_access_for_stage(st));
1234
1235 panfrost_batch_add_bo(batch, view->bo,
1236 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1237 panfrost_bo_access_for_stage(st));
1238
1239 return view->bo->gpu;
1240 }
1241
1242 static void
1243 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
1244 struct pipe_context *pctx)
1245 {
1246 struct panfrost_resource *rsrc = pan_resource(view->base.texture);
1247 if (view->texture_bo != rsrc->bo->gpu ||
1248 view->modifier != rsrc->modifier) {
1249 panfrost_bo_unreference(view->bo);
1250 panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
1251 }
1252 }
1253
1254 void
1255 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1256 enum pipe_shader_type stage,
1257 struct mali_vertex_tiler_postfix *postfix)
1258 {
1259 struct panfrost_context *ctx = batch->ctx;
1260 struct panfrost_device *device = pan_device(ctx->base.screen);
1261
1262 if (!ctx->sampler_view_count[stage])
1263 return;
1264
1265 if (device->quirks & IS_BIFROST) {
1266 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1267 MALI_BIFROST_TEXTURE_LENGTH *
1268 ctx->sampler_view_count[stage],
1269 MALI_BIFROST_TEXTURE_LENGTH);
1270
1271 struct mali_bifrost_texture_packed *out =
1272 (struct mali_bifrost_texture_packed *) T.cpu;
1273
1274 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1275 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1276 struct pipe_sampler_view *pview = &view->base;
1277 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1278
1279 panfrost_update_sampler_view(view, &ctx->base);
1280 out[i] = view->bifrost_descriptor;
1281
1282 /* Add the BOs to the job so they are retained until the job is done. */
1283
1284 panfrost_batch_add_bo(batch, rsrc->bo,
1285 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1286 panfrost_bo_access_for_stage(stage));
1287
1288 panfrost_batch_add_bo(batch, view->bo,
1289 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1290 panfrost_bo_access_for_stage(stage));
1291 }
1292
1293 postfix->textures = T.gpu;
1294 } else {
1295 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1296
1297 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1298 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1299
1300 panfrost_update_sampler_view(view, &ctx->base);
1301
1302 trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1303 }
1304
1305 postfix->textures = panfrost_pool_upload_aligned(&batch->pool,
1306 trampolines,
1307 sizeof(uint64_t) *
1308 ctx->sampler_view_count[stage],
1309 sizeof(uint64_t));
1310 }
1311 }
1312
1313 void
1314 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1315 enum pipe_shader_type stage,
1316 struct mali_vertex_tiler_postfix *postfix)
1317 {
1318 struct panfrost_context *ctx = batch->ctx;
1319
1320 if (!ctx->sampler_count[stage])
1321 return;
1322
1323 size_t desc_size = MALI_BIFROST_SAMPLER_LENGTH;
1324 assert(MALI_BIFROST_SAMPLER_LENGTH == MALI_MIDGARD_SAMPLER_LENGTH);
1325
1326 size_t sz = desc_size * ctx->sampler_count[stage];
1327 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool, sz, desc_size);
1328 struct mali_midgard_sampler_packed *out = (struct mali_midgard_sampler_packed *) T.cpu;
1329
1330 for (unsigned i = 0; i < ctx->sampler_count[stage]; ++i)
1331 out[i] = ctx->samplers[stage][i]->hw;
1332
1333 postfix->sampler_descriptor = T.gpu;
1334 }
1335
1336 void
1337 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1338 struct mali_vertex_tiler_postfix *vertex_postfix)
1339 {
1340 struct panfrost_context *ctx = batch->ctx;
1341 struct panfrost_vertex_state *so = ctx->vertex;
1342 struct panfrost_shader_state *vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1343
1344 unsigned instance_shift = vertex_postfix->instance_shift;
1345 unsigned instance_odd = vertex_postfix->instance_odd;
1346
1347 /* Worst case: everything is NPOT, which is only possible if instancing
1348  * is enabled. Otherwise a single record is guaranteed */
1349 bool could_npot = instance_shift || instance_odd;
1350
1351 struct panfrost_transfer S = panfrost_pool_alloc_aligned(&batch->pool,
1352 MALI_ATTRIBUTE_BUFFER_LENGTH * vs->attribute_count *
1353 (could_npot ? 2 : 1),
1354 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
1355
1356 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1357 MALI_ATTRIBUTE_LENGTH * vs->attribute_count,
1358 MALI_ATTRIBUTE_LENGTH);
1359
1360 struct mali_attribute_buffer_packed *bufs =
1361 (struct mali_attribute_buffer_packed *) S.cpu;
1362
1363 struct mali_attribute_packed *out =
1364 (struct mali_attribute_packed *) T.cpu;
1365
1366 unsigned attrib_to_buffer[PIPE_MAX_ATTRIBS] = { 0 };
1367 unsigned k = 0;
1368
1369 for (unsigned i = 0; i < so->num_elements; ++i) {
1370 /* We map buffers 1:1 with the attributes, which
1371 * means duplicating some vertex buffers (who cares? aside from
1372 * maybe some caching implications but I somehow doubt that
1373 * matters) */
1374
1375 struct pipe_vertex_element *elem = &so->pipe[i];
1376 unsigned vbi = elem->vertex_buffer_index;
1377 attrib_to_buffer[i] = k;
1378
1379 if (!(ctx->vb_mask & (1 << vbi)))
1380 continue;
1381
1382 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1383 struct panfrost_resource *rsrc;
1384
1385 rsrc = pan_resource(buf->buffer.resource);
1386 if (!rsrc)
1387 continue;
1388
1389 /* Add a dependency of the batch on the vertex buffer */
1390 panfrost_batch_add_bo(batch, rsrc->bo,
1391 PAN_BO_ACCESS_SHARED |
1392 PAN_BO_ACCESS_READ |
1393 PAN_BO_ACCESS_VERTEX_TILER);
1394
1395 /* Mask off lower bits, see offset fixup below */
1396 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1397 mali_ptr addr = raw_addr & ~63;
1398
1399 /* Since we rounded the base pointer down, grow the size by the masked-off
1400  * bytes; buffer_offset is subtracted since the buffer starts there */
1401 unsigned size = rsrc->base.width0 + (raw_addr - addr)
1402 - buf->buffer_offset;
1403
1404 /* When there is a divisor, the hardware-level divisor is
1405 * the product of the instance divisor and the padded count */
1406 unsigned divisor = elem->instance_divisor;
1407 unsigned hw_divisor = ctx->padded_count * divisor;
1408 unsigned stride = buf->stride;
1409
1410 /* If there's a divisor(=1) but no instancing, we want every
1411 * attribute to be the same */
1412
1413 if (divisor && ctx->instance_count == 1)
1414 stride = 0;
1415
1416 if (!divisor || ctx->instance_count <= 1) {
1417 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1418 if (ctx->instance_count > 1)
1419 cfg.type = MALI_ATTRIBUTE_TYPE_1D_MODULUS;
1420
1421 cfg.pointer = addr;
1422 cfg.stride = stride;
1423 cfg.size = size;
1424 cfg.divisor_r = instance_shift;
1425 cfg.divisor_p = instance_odd;
1426 }
1427 } else if (util_is_power_of_two_or_zero(hw_divisor)) {
1428 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1429 cfg.type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
1430 cfg.pointer = addr;
1431 cfg.stride = stride;
1432 cfg.size = size;
1433 cfg.divisor_r = __builtin_ctz(hw_divisor);
1434 }
1435
1436 } else {
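/* Non-power-of-two divisor: the descriptor carries a magic numerator,
 * a shift and an extra flag (plus the raw divisor in the continuation
 * record), in the spirit of division-by-invariant-integer reciprocals.
 * panfrost_compute_magic_divisor derives these from hw_divisor. */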
1437 unsigned shift = 0, extra_flags = 0;
1438
1439 unsigned magic_divisor =
1440 panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags);
1441
1442 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1443 cfg.type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
1444 cfg.pointer = addr;
1445 cfg.stride = stride;
1446 cfg.size = size;
1447
1448 cfg.divisor_r = shift;
1449 cfg.divisor_e = extra_flags;
1450 }
1451
1452 pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) {
1453 cfg.divisor_numerator = magic_divisor;
1454 cfg.divisor = divisor;
1455 }
1456
1457 ++k;
1458 }
1459
1460 ++k;
1461 }
1462
1463 /* Add special gl_VertexID/gl_InstanceID buffers */
1464
1465 if (unlikely(vs->attribute_count >= PAN_VERTEX_ID)) {
1466 panfrost_vertex_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1467
1468 pan_pack(out + PAN_VERTEX_ID, ATTRIBUTE, cfg) {
1469 cfg.buffer_index = k++;
1470 cfg.format = so->formats[PAN_VERTEX_ID];
1471 }
1472
1473 panfrost_instance_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1474
1475 pan_pack(out + PAN_INSTANCE_ID, ATTRIBUTE, cfg) {
1476 cfg.buffer_index = k++;
1477 cfg.format = so->formats[PAN_INSTANCE_ID];
1478 }
1479 }
1480
1481 /* Attribute addresses require 64-byte alignment, so let:
1482 *
1483 * base' = base & ~63 = base - (base & 63)
1484 * offset' = offset + (base & 63)
1485 *
1486 * Since base' + offset' = base + offset, these are equivalent
1487 * addressing modes and now base is 64 aligned.
1488 */
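/* Worked example with a made-up address: base = 0x10047 has
 * base & 63 = 0x7, so base' = 0x10040 and the element's src_offset
 * grows by 7. */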
1489
1490 unsigned start = vertex_postfix->offset_start;
1491
1492 for (unsigned i = 0; i < so->num_elements; ++i) {
1493 unsigned vbi = so->pipe[i].vertex_buffer_index;
1494 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1495
1496 /* Adjust by the masked off bits of the offset. Make sure we
1497 * read src_offset from so->hw (which is not GPU visible)
1498 * rather than target (which is) due to caching effects */
1499
1500 unsigned src_offset = so->pipe[i].src_offset;
1501
1502 /* BOs aligned to 4k so guaranteed aligned to 64 */
1503 src_offset += (buf->buffer_offset & 63);
1504
1505 /* Also, somewhat obscurely, per-instance data needs to be
1506 * offset in response to a delayed start in an indexed draw */
1507
1508 if (so->pipe[i].instance_divisor && ctx->instance_count > 1 && start)
1509 src_offset -= buf->stride * start;
1510
1511 pan_pack(out + i, ATTRIBUTE, cfg) {
1512 cfg.buffer_index = attrib_to_buffer[i];
1513 cfg.format = so->formats[i];
1514 cfg.offset = src_offset;
1515 }
1516 }
1517
1518 vertex_postfix->attributes = S.gpu;
1519 vertex_postfix->attribute_meta = T.gpu;
1520 }
1521
1522 static mali_ptr
1523 panfrost_emit_varyings(struct panfrost_batch *batch,
1524 struct mali_attribute_buffer_packed *slot,
1525 unsigned stride, unsigned count)
1526 {
1527 unsigned size = stride * count;
1528 mali_ptr ptr = panfrost_pool_alloc_aligned(&batch->invisible_pool, size, 64).gpu;
1529
1530 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1531 cfg.stride = stride;
1532 cfg.size = size;
1533 cfg.pointer = ptr;
1534 }
1535
1536 return ptr;
1537 }
1538
1539 static unsigned
1540 panfrost_streamout_offset(unsigned stride, unsigned offset,
1541 struct pipe_stream_output_target *target)
1542 {
1543 return (target->buffer_offset + (offset * stride * 4)) & 63;
1544 }
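/* The value returned above is the sub-64-byte residue of the streamout
 * address: panfrost_emit_streamout aligns the buffer pointer down to 64
 * bytes, and pan_emit_vary_xfb adds this residue back into the varying
 * record's offset (via the streamout_offsets array). */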
1545
1546 static void
1547 panfrost_emit_streamout(struct panfrost_batch *batch,
1548 struct mali_attribute_buffer_packed *slot,
1549 unsigned stride_words, unsigned offset, unsigned count,
1550 struct pipe_stream_output_target *target)
1551 {
1552 unsigned stride = stride_words * 4;
1553 unsigned max_size = target->buffer_size;
1554 unsigned expected_size = stride * count;
1555
1556 /* Grab the BO and bind it to the batch */
1557 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1558
1559 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1560 * the perspective of the TILER and FRAGMENT.
1561 */
1562 panfrost_batch_add_bo(batch, bo,
1563 PAN_BO_ACCESS_SHARED |
1564 PAN_BO_ACCESS_RW |
1565 PAN_BO_ACCESS_VERTEX_TILER |
1566 PAN_BO_ACCESS_FRAGMENT);
1567
1568 /* We will have an offset applied to get alignment */
1569 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * stride);
1570
1571 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1572 cfg.pointer = (addr & ~63);
1573 cfg.stride = stride;
1574 cfg.size = MIN2(max_size, expected_size) + (addr & 63);
1575 }
1576 }
1577
1578 static bool
1579 has_point_coord(unsigned mask, gl_varying_slot loc)
1580 {
1581 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1582 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1583 else if (loc == VARYING_SLOT_PNTC)
1584 return (mask & (1 << 8));
1585 else
1586 return false;
1587 }
1588
1589 /* Helpers for manipulating stream out information so we can pack varyings
1590 * accordingly. Compute the src_offset for a given captured varying */
1591
1592 static struct pipe_stream_output *
1593 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1594 {
1595 for (unsigned i = 0; i < info->num_outputs; ++i) {
1596 if (info->output[i].register_index == loc)
1597 return &info->output[i];
1598 }
1599
1600 unreachable("Varying not captured");
1601 }
1602
1603 static unsigned
1604 pan_varying_size(enum mali_format fmt)
1605 {
1606 unsigned type = MALI_EXTRACT_TYPE(fmt);
1607 unsigned chan = MALI_EXTRACT_CHANNELS(fmt);
1608 unsigned bits = MALI_EXTRACT_BITS(fmt);
1609 unsigned bpc = 0;
1610
1611 if (bits == MALI_CHANNEL_FLOAT) {
1612 /* No doubles */
1613 bool fp16 = (type == MALI_FORMAT_SINT);
1614 assert(fp16 || (type == MALI_FORMAT_UNORM));
1615
1616 bpc = fp16 ? 2 : 4;
1617 } else {
1618 assert(type >= MALI_FORMAT_SNORM && type <= MALI_FORMAT_SINT);
1619
1620 /* See the enums */
1621 bits = 1 << bits;
1622 assert(bits >= 8);
1623 bpc = bits / 8;
1624 }
1625
1626 return bpc * chan;
1627 }
1628
1629 /* Indices for named (non-XFB) varyings that are present. These are packed
1630 * tightly so they correspond to a bitfield present (P) indexed by (1 <<
1631 * PAN_VARY_*). This has the nice property that you can lookup the buffer index
1632 * of a given special field given a shift S by:
1633 *
1634 * idx = popcount(P & ((1 << S) - 1))
1635 *
1636  * That is, look at all of the varyings that come earlier and count them;
1637  * that count is the index of the new one. Likewise, the total number of special
1638 * buffers required is simply popcount(P)
1639 */
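/* Worked example: with P = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION)
 * | (1 << PAN_VARY_PSIZ) = 0b111, the PSIZ buffer (S = 2) lands at index
 * popcount(0b111 & 0b011) = 2, and popcount(P) = 3 buffers are needed. */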
1640
1641 enum pan_special_varying {
1642 PAN_VARY_GENERAL = 0,
1643 PAN_VARY_POSITION = 1,
1644 PAN_VARY_PSIZ = 2,
1645 PAN_VARY_PNTCOORD = 3,
1646 PAN_VARY_FACE = 4,
1647 PAN_VARY_FRAGCOORD = 5,
1648
1649 /* Keep last */
1650 PAN_VARY_MAX,
1651 };
1652
1653 /* Given a varying, figure out which index it corresponds to */
1654
1655 static inline unsigned
1656 pan_varying_index(unsigned present, enum pan_special_varying v)
1657 {
1658 unsigned mask = (1 << v) - 1;
1659 return util_bitcount(present & mask);
1660 }
1661
1662 /* Get the base offset for XFB buffers, which by convention come after
1663 * everything else. Wrapper function for semantic reasons; by construction this
1664 * is just popcount. */
1665
1666 static inline unsigned
1667 pan_xfb_base(unsigned present)
1668 {
1669 return util_bitcount(present);
1670 }
1671
1672 /* Computes the present mask for varyings so we can start emitting varying records */
1673
1674 static inline unsigned
1675 pan_varying_present(
1676 struct panfrost_shader_state *vs,
1677 struct panfrost_shader_state *fs,
1678 unsigned quirks)
1679 {
1680 /* At the moment we always emit general and position buffers. Not
1681 * strictly necessary but usually harmless */
1682
1683 unsigned present = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION);
1684
1685 /* Enable special buffers by the shader info */
1686
1687 if (vs->writes_point_size)
1688 present |= (1 << PAN_VARY_PSIZ);
1689
1690 if (fs->reads_point_coord)
1691 present |= (1 << PAN_VARY_PNTCOORD);
1692
1693 if (fs->reads_face)
1694 present |= (1 << PAN_VARY_FACE);
1695
1696 if (fs->reads_frag_coord && !(quirks & IS_BIFROST))
1697 present |= (1 << PAN_VARY_FRAGCOORD);
1698
1699 /* Also, if we have a point sprite, we need a point coord buffer */
1700
1701 for (unsigned i = 0; i < fs->varying_count; i++) {
1702 gl_varying_slot loc = fs->varyings_loc[i];
1703
1704 if (has_point_coord(fs->point_sprite_mask, loc))
1705 present |= (1 << PAN_VARY_PNTCOORD);
1706 }
1707
1708 return present;
1709 }
1710
1711 /* Emitters for varying records */
1712
1713 static void
1714 pan_emit_vary(struct mali_attribute_packed *out,
1715 unsigned present, enum pan_special_varying buf,
1716 unsigned quirks, enum mali_format format,
1717 unsigned offset)
1718 {
1719 unsigned nr_channels = MALI_EXTRACT_CHANNELS(format);
1720 unsigned swizzle = quirks & HAS_SWIZZLES ?
1721 panfrost_get_default_swizzle(nr_channels) :
1722 panfrost_bifrost_swizzle(nr_channels);
1723
1724 pan_pack(out, ATTRIBUTE, cfg) {
1725 cfg.buffer_index = pan_varying_index(present, buf);
1726 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1727 cfg.format = (format << 12) | swizzle;
1728 cfg.offset = offset;
1729 }
1730 }
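/* Note on the packing above: cfg.format carries the swizzle in its low 12
 * bits with the mali_format enum shifted above it, which is why the XFB
 * emitter below applies the same << 12 shift to its overridden format. */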
1731
1732 /* General varying that is unused: emit a discard record */
1733
1734 static void
1735 pan_emit_vary_only(struct mali_attribute_packed *out,
1736 unsigned present, unsigned quirks)
1737 {
1738 pan_emit_vary(out, present, 0, quirks, MALI_VARYING_DISCARD, 0);
1739 }
1740
1741 /* Special records */
1742
1743 static const enum mali_format pan_varying_formats[PAN_VARY_MAX] = {
1744 [PAN_VARY_POSITION] = MALI_VARYING_POS,
1745 [PAN_VARY_PSIZ] = MALI_R16F,
1746 [PAN_VARY_PNTCOORD] = MALI_R16F,
1747 [PAN_VARY_FACE] = MALI_R32I,
1748 [PAN_VARY_FRAGCOORD] = MALI_RGBA32F
1749 };
1750
1751 static void
1752 pan_emit_vary_special(struct mali_attribute_packed *out,
1753 unsigned present, enum pan_special_varying buf,
1754 unsigned quirks)
1755 {
1756 assert(buf < PAN_VARY_MAX);
1757 pan_emit_vary(out, present, buf, quirks, pan_varying_formats[buf], 0);
1758 }
1759
1760 static enum mali_format
1761 pan_xfb_format(enum mali_format format, unsigned nr)
1762 {
1763 if (MALI_EXTRACT_BITS(format) == MALI_CHANNEL_FLOAT)
1764 return MALI_R32F | MALI_NR_CHANNELS(nr);
1765 else
1766 return MALI_EXTRACT_TYPE(format) | MALI_NR_CHANNELS(nr) | MALI_CHANNEL_32;
1767 }
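/* For example, a half-float varying captured with 3 components widens to
 * MALI_R32F | MALI_NR_CHANNELS(3): XFB always stores 32 bits per channel,
 * which is why the capture offsets below are measured in dwords. */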
1768
1769 /* Transform feedback records. Note struct pipe_stream_output is (if packed as
1770 * a bitfield) 32-bit, smaller than a 64-bit pointer, so may as well pass by
1771 * value. */
1772
1773 static void
1774 pan_emit_vary_xfb(struct mali_attribute_packed *out,
1775 unsigned present,
1776 unsigned max_xfb,
1777 unsigned *streamout_offsets,
1778 unsigned quirks,
1779 enum mali_format format,
1780 struct pipe_stream_output o)
1781 {
1782 unsigned swizzle = quirks & HAS_SWIZZLES ?
1783 panfrost_get_default_swizzle(o.num_components) :
1784 panfrost_bifrost_swizzle(o.num_components);
1785
1786 pan_pack(out, ATTRIBUTE, cfg) {
1787 /* XFB buffers come after everything else */
1788 cfg.buffer_index = pan_xfb_base(present) + o.output_buffer;
1789 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1790
1791 /* Override number of channels and precision to highp */
1792 cfg.format = (pan_xfb_format(format, o.num_components) << 12) | swizzle;
1793
1794 		/* Combine the capture's dword offset with the buffer's streamout offset */
1795 cfg.offset = (o.dst_offset * 4) /* dwords */
1796 + streamout_offsets[o.output_buffer];
1797 }
1798 }
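/* Worked example with made-up numbers: a capture with o.dst_offset = 2
 * (dwords) into output_buffer 1, where streamout_offsets[1] = 64, lands at
 * byte offset 2 * 4 + 64 = 72 within that streamout buffer. */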
1799
1800 /* Determine if we should capture a varying for XFB. This requires actually
1801  * having a buffer for it. If we don't capture it, we'll fall back to a general
1802 * varying path (linked or unlinked, possibly discarding the write) */
1803
1804 static bool
1805 panfrost_xfb_captured(struct panfrost_shader_state *xfb,
1806 unsigned loc, unsigned max_xfb)
1807 {
1808 if (!(xfb->so_mask & (1ll << loc)))
1809 return false;
1810
1811 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1812 return o->output_buffer < max_xfb;
1813 }
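/* E.g. a varying whose so_mask bit is set but whose pipe_stream_output
 * targets output_buffer 4 when only 4 streamout targets (indices 0-3) are
 * bound is not captured; it falls back to the general varying path. */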
1814
1815 static void
1816 pan_emit_general_varying(struct mali_attribute_packed *out,
1817 struct panfrost_shader_state *other,
1818 struct panfrost_shader_state *xfb,
1819 gl_varying_slot loc,
1820 enum mali_format format,
1821 unsigned present,
1822 unsigned quirks,
1823 unsigned *gen_offsets,
1824 enum mali_format *gen_formats,
1825 unsigned *gen_stride,
1826 unsigned idx,
1827 bool should_alloc)
1828 {
1829 /* Check if we're linked */
1830 signed other_idx = -1;
1831
1832 for (unsigned j = 0; j < other->varying_count; ++j) {
1833 if (other->varyings_loc[j] == loc) {
1834 other_idx = j;
1835 break;
1836 }
1837 }
1838
1839 if (other_idx < 0) {
1840 pan_emit_vary_only(out, present, quirks);
1841 return;
1842 }
1843
1844 unsigned offset = gen_offsets[other_idx];
1845
1846 if (should_alloc) {
1847 		/* We're linked, so allocate space via a watermark allocation */
1848 enum mali_format alt = other->varyings[other_idx];
1849
1850 /* Do interpolation at minimum precision */
1851 unsigned size_main = pan_varying_size(format);
1852 unsigned size_alt = pan_varying_size(alt);
1853 unsigned size = MIN2(size_main, size_alt);
1854
1855 /* If a varying is marked for XFB but not actually captured, we
1856 * should match the format to the format that would otherwise
1857 * be used for XFB, since dEQP checks for invariance here. It's
1858 * unclear if this is required by the spec. */
1859
1860 if (xfb->so_mask & (1ull << loc)) {
1861 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1862 format = pan_xfb_format(format, o->num_components);
1863 size = pan_varying_size(format);
1864 } else if (size == size_alt) {
1865 format = alt;
1866 }
1867
1868 gen_offsets[idx] = *gen_stride;
1869 gen_formats[other_idx] = format;
1870 offset = *gen_stride;
1871 *gen_stride += size;
1872 }
1873
1874 pan_emit_vary(out, present, PAN_VARY_GENERAL, quirks, format, offset);
1875 }
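/* Illustrative numbers: if one side of the link uses an fp32 vec4 (16 bytes)
 * and the other an fp16 vec4 (8 bytes), the watermark allocation reserves
 * MIN2(16, 8) = 8 bytes: the record's offset is the old *gen_stride, which
 * then advances by 8. Varyings marked for XFB instead take the 32-bit XFB
 * format and its size, for invariance with the captured data. */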
1876
1877 /* Higher-level wrapper around all of the above, classifying a varying into one
1878 * of the above types */
1879
1880 static void
1881 panfrost_emit_varying(
1882 struct mali_attribute_packed *out,
1883 struct panfrost_shader_state *stage,
1884 struct panfrost_shader_state *other,
1885 struct panfrost_shader_state *xfb,
1886 unsigned present,
1887 unsigned max_xfb,
1888 unsigned *streamout_offsets,
1889 unsigned quirks,
1890 unsigned *gen_offsets,
1891 enum mali_format *gen_formats,
1892 unsigned *gen_stride,
1893 unsigned idx,
1894 bool should_alloc,
1895 bool is_fragment)
1896 {
1897 gl_varying_slot loc = stage->varyings_loc[idx];
1898 enum mali_format format = stage->varyings[idx];
1899
1900 /* Override format to match linkage */
1901 if (!should_alloc && gen_formats[idx])
1902 format = gen_formats[idx];
1903
1904 if (has_point_coord(stage->point_sprite_mask, loc)) {
1905 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1906 } else if (panfrost_xfb_captured(xfb, loc, max_xfb)) {
1907 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1908 pan_emit_vary_xfb(out, present, max_xfb, streamout_offsets, quirks, format, *o);
1909 } else if (loc == VARYING_SLOT_POS) {
1910 if (is_fragment)
1911 pan_emit_vary_special(out, present, PAN_VARY_FRAGCOORD, quirks);
1912 else
1913 pan_emit_vary_special(out, present, PAN_VARY_POSITION, quirks);
1914 } else if (loc == VARYING_SLOT_PSIZ) {
1915 pan_emit_vary_special(out, present, PAN_VARY_PSIZ, quirks);
1916 } else if (loc == VARYING_SLOT_PNTC) {
1917 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1918 } else if (loc == VARYING_SLOT_FACE) {
1919 pan_emit_vary_special(out, present, PAN_VARY_FACE, quirks);
1920 } else {
1921 pan_emit_general_varying(out, other, xfb, loc, format, present,
1922 quirks, gen_offsets, gen_formats, gen_stride,
1923 idx, should_alloc);
1924 }
1925 }
1926
1927 static void
1928 pan_emit_special_input(struct mali_attribute_buffer_packed *out,
1929 unsigned present,
1930 enum pan_special_varying v,
1931 unsigned special)
1932 {
1933 if (present & (1 << v)) {
1934 unsigned idx = pan_varying_index(present, v);
1935
1936 pan_pack(out + idx, ATTRIBUTE_BUFFER, cfg) {
1937 cfg.special = special;
1938 cfg.type = 0;
1939 }
1940 }
1941 }
1942
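/* The attribute buffer array built below is laid out as:
 *
 *    [0, popcount(present))              special/general buffers, ordered
 *                                        by pan_varying_index()
 *    [xfb_base, xfb_base + num_targets)  one buffer per streamout target
 *
 * where xfb_base = pan_xfb_base(present). */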
1943 void
1944 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1945 unsigned vertex_count,
1946 struct mali_vertex_tiler_postfix *vertex_postfix,
1947 struct mali_vertex_tiler_postfix *tiler_postfix,
1948 union midgard_primitive_size *primitive_size)
1949 {
1950 /* Load the shaders */
1951 struct panfrost_context *ctx = batch->ctx;
1952 struct panfrost_device *dev = pan_device(ctx->base.screen);
1953 struct panfrost_shader_state *vs, *fs;
1954 size_t vs_size, fs_size;
1955
1956 /* Allocate the varying descriptor */
1957
1958 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1959 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1960 vs_size = MALI_ATTRIBUTE_LENGTH * vs->varying_count;
1961 fs_size = MALI_ATTRIBUTE_LENGTH * fs->varying_count;
1962
1963 struct panfrost_transfer trans = panfrost_pool_alloc_aligned(
1964 &batch->pool, vs_size + fs_size, MALI_ATTRIBUTE_LENGTH);
1965
1966 struct pipe_stream_output_info *so = &vs->stream_output;
1967 unsigned present = pan_varying_present(vs, fs, dev->quirks);
1968
1969 /* Check if this varying is linked by us. This is the case for
1970 * general-purpose, non-captured varyings. If it is, link it. If it's
1971 * not, use the provided stream out information to determine the
1972 * offset, since it was already linked for us. */
1973
1974 unsigned gen_offsets[32];
1975 enum mali_format gen_formats[32];
1976 memset(gen_offsets, 0, sizeof(gen_offsets));
1977 memset(gen_formats, 0, sizeof(gen_formats));
1978
1979 unsigned gen_stride = 0;
1980 assert(vs->varying_count < ARRAY_SIZE(gen_offsets));
1981 assert(fs->varying_count < ARRAY_SIZE(gen_offsets));
1982
1983 unsigned streamout_offsets[32];
1984
1985 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1986 streamout_offsets[i] = panfrost_streamout_offset(
1987 so->stride[i],
1988 ctx->streamout.offsets[i],
1989 ctx->streamout.targets[i]);
1990 }
1991
1992 struct mali_attribute_packed *ovs = (struct mali_attribute_packed *)trans.cpu;
1993 struct mali_attribute_packed *ofs = ovs + vs->varying_count;
1994
1995 for (unsigned i = 0; i < vs->varying_count; i++) {
1996 panfrost_emit_varying(ovs + i, vs, fs, vs, present,
1997 ctx->streamout.num_targets, streamout_offsets,
1998 dev->quirks,
1999 gen_offsets, gen_formats, &gen_stride, i, true, false);
2000 }
2001
2002 for (unsigned i = 0; i < fs->varying_count; i++) {
2003 panfrost_emit_varying(ofs + i, fs, vs, vs, present,
2004 ctx->streamout.num_targets, streamout_offsets,
2005 dev->quirks,
2006 gen_offsets, gen_formats, &gen_stride, i, false, true);
2007 }
2008
2009 unsigned xfb_base = pan_xfb_base(present);
2010 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
2011 MALI_ATTRIBUTE_BUFFER_LENGTH * (xfb_base + ctx->streamout.num_targets),
2012 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
2013 struct mali_attribute_buffer_packed *varyings =
2014 (struct mali_attribute_buffer_packed *) T.cpu;
2015
2016 /* Emit the stream out buffers */
2017
2018 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
2019 ctx->vertex_count);
2020
2021 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
2022 panfrost_emit_streamout(batch, &varyings[xfb_base + i],
2023 so->stride[i],
2024 ctx->streamout.offsets[i],
2025 out_count,
2026 ctx->streamout.targets[i]);
2027 }
2028
2029 panfrost_emit_varyings(batch,
2030 &varyings[pan_varying_index(present, PAN_VARY_GENERAL)],
2031 gen_stride, vertex_count);
2032
2033 /* fp32 vec4 gl_Position */
2034 tiler_postfix->position_varying = panfrost_emit_varyings(batch,
2035 &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
2036 sizeof(float) * 4, vertex_count);
2037
2038 if (present & (1 << PAN_VARY_PSIZ)) {
2039 primitive_size->pointer = panfrost_emit_varyings(batch,
2040 &varyings[pan_varying_index(present, PAN_VARY_PSIZ)],
2041 2, vertex_count);
2042 }
2043
2044 pan_emit_special_input(varyings, present, PAN_VARY_PNTCOORD, MALI_ATTRIBUTE_SPECIAL_POINT_COORD);
2045 pan_emit_special_input(varyings, present, PAN_VARY_FACE, MALI_ATTRIBUTE_SPECIAL_FRONT_FACING);
2046 pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD, MALI_ATTRIBUTE_SPECIAL_FRAG_COORD);
2047
2048 vertex_postfix->varyings = T.gpu;
2049 tiler_postfix->varyings = T.gpu;
2050
2051 vertex_postfix->varying_meta = trans.gpu;
2052 tiler_postfix->varying_meta = trans.gpu + vs_size;
2053 }
2054
2055 void
2056 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
2057 struct mali_vertex_tiler_prefix *vertex_prefix,
2058 struct mali_vertex_tiler_postfix *vertex_postfix,
2059 struct mali_vertex_tiler_prefix *tiler_prefix,
2060 struct mali_vertex_tiler_postfix *tiler_postfix,
2061 union midgard_primitive_size *primitive_size)
2062 {
2063 struct panfrost_context *ctx = batch->ctx;
2064 struct panfrost_device *device = pan_device(ctx->base.screen);
2065 bool wallpapering = ctx->wallpaper_batch && batch->scoreboard.tiler_dep;
2066 struct bifrost_payload_vertex bifrost_vertex = {0,};
2067 struct bifrost_payload_tiler bifrost_tiler = {0,};
2068 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
2069 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
2070 void *vp, *tp;
2071 size_t vp_size, tp_size;
2072
2073 if (device->quirks & IS_BIFROST) {
2074 bifrost_vertex.prefix = *vertex_prefix;
2075 bifrost_vertex.postfix = *vertex_postfix;
2076 vp = &bifrost_vertex;
2077 vp_size = sizeof(bifrost_vertex);
2078
2079 bifrost_tiler.prefix = *tiler_prefix;
2080 bifrost_tiler.tiler.primitive_size = *primitive_size;
2081 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
2082 bifrost_tiler.postfix = *tiler_postfix;
2083 tp = &bifrost_tiler;
2084 tp_size = sizeof(bifrost_tiler);
2085 } else {
2086 midgard_vertex.prefix = *vertex_prefix;
2087 midgard_vertex.postfix = *vertex_postfix;
2088 vp = &midgard_vertex;
2089 vp_size = sizeof(midgard_vertex);
2090
2091 midgard_tiler.prefix = *tiler_prefix;
2092 midgard_tiler.postfix = *tiler_postfix;
2093 midgard_tiler.primitive_size = *primitive_size;
2094 tp = &midgard_tiler;
2095 tp_size = sizeof(midgard_tiler);
2096 }
2097
2098 if (wallpapering) {
2099 /* Inject in reverse order, with "predicted" job indices.
2100 * THIS IS A HACK XXX */
2101 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false,
2102 batch->scoreboard.job_index + 2, tp, tp_size, true);
2103 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2104 vp, vp_size, true);
2105 return;
2106 }
2107
2108 	/* If rasterizer discard is enabled, only submit the vertex job */
2109
2110 unsigned vertex = panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2111 vp, vp_size, false);
2112
2113 if (ctx->rasterizer->base.rasterizer_discard)
2114 return;
2115
2116 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false, vertex, tp, tp_size,
2117 false);
2118 }
2119
2120 /* TODO: stop hardcoding this */
2121 mali_ptr
2122 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2123 {
2124 uint16_t locations[] = {
2125 128, 128,
2126 0, 256,
2127 0, 256,
2128 0, 256,
2129 0, 256,
2130 0, 256,
2131 0, 256,
2132 0, 256,
2133 0, 256,
2134 0, 256,
2135 0, 256,
2136 0, 256,
2137 0, 256,
2138 0, 256,
2139 0, 256,
2140 0, 256,
2141 0, 256,
2142 0, 256,
2143 0, 256,
2144 0, 256,
2145 0, 256,
2146 0, 256,
2147 0, 256,
2148 0, 256,
2149 0, 256,
2150 0, 256,
2151 0, 256,
2152 0, 256,
2153 0, 256,
2154 0, 256,
2155 0, 256,
2156 0, 256,
2157 128, 128,
2158 0, 0,
2159 0, 0,
2160 0, 0,
2161 0, 0,
2162 0, 0,
2163 0, 0,
2164 0, 0,
2165 0, 0,
2166 0, 0,
2167 0, 0,
2168 0, 0,
2169 0, 0,
2170 0, 0,
2171 0, 0,
2172 0, 0,
2173 };
2174
2175 return panfrost_pool_upload_aligned(&batch->pool, locations, 96 * sizeof(uint16_t), 64);
2176 }