mesa.git: src/gallium/drivers/panfrost/pan_cmdstream.c (commit 2efd512ddb6a819ec06e84814221e34543239af2)
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_pool.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 struct mali_shared_memory shared = {
62 .shared_workgroup_count = ~0,
63 };
64
65 if (batch->stack_size) {
66 struct panfrost_bo *stack =
67 panfrost_batch_get_scratchpad(batch, batch->stack_size,
68 dev->thread_tls_alloc,
69 dev->core_count);
70
71 shared.stack_shift = panfrost_get_stack_shift(batch->stack_size);
72 shared.scratchpad = stack->gpu;
73 }
74
75 postfix->shared_memory = panfrost_pool_upload_aligned(&batch->pool, &shared, sizeof(shared), 64);
76 }
77
78 static void
79 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
80 struct mali_vertex_tiler_postfix *postfix)
81 {
82 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
83 postfix->shared_memory = panfrost_batch_reserve_framebuffer(batch);
84 }
85
86 static void
87 panfrost_vt_update_rasterizer(struct panfrost_rasterizer *rasterizer,
88 struct mali_vertex_tiler_prefix *prefix,
89 struct mali_vertex_tiler_postfix *postfix)
90 {
91 postfix->gl_enables |= 0x7;
92 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
93 rasterizer->base.front_ccw);
94 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
95 (rasterizer->base.cull_face & PIPE_FACE_FRONT));
96 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
97 (rasterizer->base.cull_face & PIPE_FACE_BACK));
98 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
99 rasterizer->base.flatshade_first);
100 }
101
102 void
103 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
104 struct mali_vertex_tiler_prefix *prefix,
105 union midgard_primitive_size *primitive_size)
106 {
107 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
108
109 if (!panfrost_writes_point_size(ctx)) {
110 float val = (prefix->draw_mode == MALI_DRAW_MODE_POINTS) ?
111 rasterizer->base.point_size :
112 rasterizer->base.line_width;
113
114 primitive_size->constant = val;
115 }
116 }
117
118 static void
119 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
120 struct mali_vertex_tiler_postfix *postfix)
121 {
122 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
123 if (ctx->occlusion_query) {
124 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
125 panfrost_batch_add_bo(ctx->batch, ctx->occlusion_query->bo,
126 PAN_BO_ACCESS_SHARED |
127 PAN_BO_ACCESS_RW |
128 PAN_BO_ACCESS_FRAGMENT);
129 } else {
130 postfix->occlusion_counter = 0;
131 }
132 }
133
134 void
135 panfrost_vt_init(struct panfrost_context *ctx,
136 enum pipe_shader_type stage,
137 struct mali_vertex_tiler_prefix *prefix,
138 struct mali_vertex_tiler_postfix *postfix)
139 {
140 struct panfrost_device *device = pan_device(ctx->base.screen);
141
142 if (!ctx->shader[stage])
143 return;
144
145 memset(prefix, 0, sizeof(*prefix));
146 memset(postfix, 0, sizeof(*postfix));
147
148 if (device->quirks & IS_BIFROST) {
149 postfix->gl_enables = 0x2;
150 panfrost_vt_emit_shared_memory(ctx, postfix);
151 } else {
152 postfix->gl_enables = 0x6;
153 panfrost_vt_attach_framebuffer(ctx, postfix);
154 }
155
156 if (stage == PIPE_SHADER_FRAGMENT) {
157 panfrost_vt_update_occlusion_query(ctx, postfix);
158 panfrost_vt_update_rasterizer(ctx->rasterizer, prefix, postfix);
159 }
160 }
161
162 static unsigned
163 panfrost_translate_index_size(unsigned size)
164 {
165 switch (size) {
166 case 1:
167 return MALI_DRAW_INDEXED_UINT8;
168
169 case 2:
170 return MALI_DRAW_INDEXED_UINT16;
171
172 case 4:
173 return MALI_DRAW_INDEXED_UINT32;
174
175 default:
176 unreachable("Invalid index size");
177 }
178 }
179
180 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
181 * good for the duration of the draw (transient), though it could last longer. Also gets
182 * the bounds on the index buffer for the range accessed by the draw. We do
183 * these operations together because there are natural optimizations which
184 * require them to be together. */
185
186 static mali_ptr
187 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
188 const struct pipe_draw_info *info,
189 unsigned *min_index, unsigned *max_index)
190 {
191 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
192 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
193 off_t offset = info->start * info->index_size;
194 bool needs_indices = true;
195 mali_ptr out = 0;
196
197 if (info->max_index != ~0u) {
198 *min_index = info->min_index;
199 *max_index = info->max_index;
200 needs_indices = false;
201 }
202
203 if (!info->has_user_indices) {
204 /* Only resources can be directly mapped */
205 panfrost_batch_add_bo(batch, rsrc->bo,
206 PAN_BO_ACCESS_SHARED |
207 PAN_BO_ACCESS_READ |
208 PAN_BO_ACCESS_VERTEX_TILER);
209 out = rsrc->bo->gpu + offset;
210
211 /* Check the cache */
212 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
213 info->start,
214 info->count,
215 min_index,
216 max_index);
217 } else {
218 /* Otherwise, we need to upload to transient memory */
219 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
220 struct panfrost_transfer T =
221 panfrost_pool_alloc_aligned(&batch->pool,
222 info->count * info->index_size,
223 info->index_size);
224
225 memcpy(T.cpu, ibuf8 + offset, info->count * info->index_size);
226 out = T.gpu;
227 }
228
229 if (needs_indices) {
230 /* Fallback */
231 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
232
233 if (!info->has_user_indices)
234 panfrost_minmax_cache_add(rsrc->index_cache,
235 info->start, info->count,
236 *min_index, *max_index);
237 }
238
239 return out;
240 }
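/* Illustrative example (not upstream code): for an indexed draw whose index
 * buffer contains { 5, 2, 9 }, the helper above reports min_index = 2 and
 * max_index = 9. The caller then derives vertex_count = 9 - 2 + 1 = 8 and
 * offset_bias_correction = -2, so the hardware only touches the vertex range
 * actually referenced by the draw. */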
241
242 void
243 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
244 const struct pipe_draw_info *info,
245 enum mali_draw_mode draw_mode,
246 struct mali_vertex_tiler_postfix *vertex_postfix,
247 struct mali_vertex_tiler_prefix *tiler_prefix,
248 struct mali_vertex_tiler_postfix *tiler_postfix,
249 unsigned *vertex_count,
250 unsigned *padded_count)
251 {
252 tiler_prefix->draw_mode = draw_mode;
253
254 unsigned draw_flags = 0;
255
256 if (panfrost_writes_point_size(ctx))
257 draw_flags |= MALI_DRAW_VARYING_SIZE;
258
259 if (info->primitive_restart)
260 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
261
262 /* These don't make much sense */
263
264 draw_flags |= 0x3000;
265
266 if (info->index_size) {
267 unsigned min_index = 0, max_index = 0;
268
269 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
270 info,
271 &min_index,
272 &max_index);
273
274 /* Use the corresponding values */
275 *vertex_count = max_index - min_index + 1;
276 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
277 tiler_prefix->offset_bias_correction = -min_index;
278 tiler_prefix->index_count = MALI_POSITIVE(info->count);
279 draw_flags |= panfrost_translate_index_size(info->index_size);
280 } else {
281 tiler_prefix->indices = 0;
282 *vertex_count = ctx->vertex_count;
283 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
284 tiler_prefix->offset_bias_correction = 0;
285 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
286 }
287
288 tiler_prefix->unknown_draw = draw_flags;
289
290 /* Encode the padded vertex count */
291
292 if (info->instance_count > 1) {
293 *padded_count = panfrost_padded_vertex_count(*vertex_count);
294
295 unsigned shift = __builtin_ctz(ctx->padded_count);
296 unsigned k = ctx->padded_count >> (shift + 1);
297
298 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
299 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
300 } else {
301 *padded_count = *vertex_count;
302
303 /* Reset instancing state */
304 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
305 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
306 }
307 }
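/* Sketch of the instancing encoding used above (illustrative, not upstream
 * code): the padded count is an odd number times a power of two, and the
 * hardware receives it split into instance_shift/instance_odd such that
 * padded_count == (2 * instance_odd + 1) << instance_shift. For example,
 * assuming padded_count = 24 = 3 * 8: shift = ctz(24) = 3 and
 * odd = 24 >> (3 + 1) = 1, so (2 * 1 + 1) << 3 recovers 24. */

static inline unsigned
pan_example_decode_padded_count(unsigned instance_shift, unsigned instance_odd)
{
        return (2 * instance_odd + 1) << instance_shift;
}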
308
309 static void
310 panfrost_emit_compute_shader(struct panfrost_context *ctx,
311 enum pipe_shader_type st,
312 struct mali_shader_meta *meta)
313 {
314 const struct panfrost_device *dev = pan_device(ctx->base.screen);
315 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
316
317 memset(meta, 0, sizeof(*meta));
318 meta->shader = ss->shader;
319 meta->attribute_count = ss->attribute_count;
320 meta->varying_count = ss->varying_count;
321 meta->texture_count = ctx->sampler_view_count[st];
322 meta->sampler_count = ctx->sampler_count[st];
323
324 if (dev->quirks & IS_BIFROST) {
325 meta->bifrost1.unk1 = 0x800000;
326 meta->bifrost2.preload_regs = 0xC0;
327 meta->bifrost2.uniform_count = ss->uniform_count;
328 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
329 } else {
330 meta->midgard1.uniform_count = ss->uniform_count;
331 meta->midgard1.work_count = ss->work_reg_count;
332
333 /* TODO: This is not conformant on ES3 */
334 meta->midgard1.flags_hi = MALI_SUPPRESS_INF_NAN;
335
336 meta->midgard1.flags_lo = 0x20;
337 meta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
338
339 SET_BIT(meta->midgard1.flags_lo, MALI_WRITES_GLOBAL, ss->writes_global);
340 }
341 }
342
343 static unsigned
344 translate_tex_wrap(enum pipe_tex_wrap w)
345 {
346 switch (w) {
347 case PIPE_TEX_WRAP_REPEAT: return MALI_WRAP_MODE_REPEAT;
348 case PIPE_TEX_WRAP_CLAMP: return MALI_WRAP_MODE_CLAMP;
349 case PIPE_TEX_WRAP_CLAMP_TO_EDGE: return MALI_WRAP_MODE_CLAMP_TO_EDGE;
350 case PIPE_TEX_WRAP_CLAMP_TO_BORDER: return MALI_WRAP_MODE_CLAMP_TO_BORDER;
351 case PIPE_TEX_WRAP_MIRROR_REPEAT: return MALI_WRAP_MODE_MIRRORED_REPEAT;
352 case PIPE_TEX_WRAP_MIRROR_CLAMP: return MALI_WRAP_MODE_MIRRORED_CLAMP;
353 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE;
354 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER;
355 default: unreachable("Invalid wrap");
356 }
357 }
358
359 /* The hardware compares in the wrong order, so we have to flip before
360 * encoding. Yes, really. */
361
362 static enum mali_func
363 panfrost_sampler_compare_func(const struct pipe_sampler_state *cso)
364 {
365 if (!cso->compare_mode)
366 return MALI_FUNC_NEVER;
367
368 enum mali_func f = panfrost_translate_compare_func(cso->compare_func);
369 return panfrost_flip_compare_func(f);
370 }
371
372 static enum mali_mipmap_mode
373 pan_pipe_to_mipmode(enum pipe_tex_mipfilter f)
374 {
375 switch (f) {
376 case PIPE_TEX_MIPFILTER_NEAREST: return MALI_MIPMAP_MODE_NEAREST;
377 case PIPE_TEX_MIPFILTER_LINEAR: return MALI_MIPMAP_MODE_TRILINEAR;
378 case PIPE_TEX_MIPFILTER_NONE: return MALI_MIPMAP_MODE_NONE;
379 default: unreachable("Invalid");
380 }
381 }
382
383 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
384 struct mali_midgard_sampler_packed *hw)
385 {
386 pan_pack(hw, MIDGARD_SAMPLER, cfg) {
387 cfg.magnify_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
388 cfg.minify_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
389 cfg.mipmap_mode = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR) ?
390 MALI_MIPMAP_MODE_TRILINEAR : MALI_MIPMAP_MODE_NEAREST;
391 cfg.normalized_coordinates = cso->normalized_coords;
392
393 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
394
395 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
396
397 /* If necessary, we disable mipmapping in the sampler descriptor by
398 * clamping the LOD as tight as possible (from 0 to epsilon,
399 * essentially -- remember these are fixed point numbers, so
400 * epsilon=1/256) */
401
402 cfg.maximum_lod = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) ?
403 cfg.minimum_lod + 1 :
404 FIXED_16(cso->max_lod, false);
405
406 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
407 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
408 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
409
410 cfg.compare_function = panfrost_sampler_compare_func(cso);
411 cfg.seamless_cube_map = cso->seamless_cube_map;
412
413 cfg.border_color_r = cso->border_color.f[0];
414 cfg.border_color_g = cso->border_color.f[1];
415 cfg.border_color_b = cso->border_color.f[2];
416 cfg.border_color_a = cso->border_color.f[3];
417 }
418 }
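/* Minimal sketch of the fixed-point LOD encoding relied on above (an
 * assumption for illustration, not the driver's FIXED_16 macro): the 1/256
 * epsilon mentioned in the comment implies an 8-bit fractional part, so
 * adding 1 to minimum_lod clamps the usable LOD range to [min, min + 1/256],
 * which effectively disables mipmapping. */

static inline uint16_t
pan_example_lod_to_fixed_8_8(float lod)
{
        /* Round to the nearest 1/256 step within an unsigned 8.8 range */
        float clamped = CLAMP(lod, 0.0f, 255.0f + 255.0f / 256.0f);
        return (uint16_t) (clamped * 256.0f + 0.5f);
}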
419
420 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
421 struct mali_bifrost_sampler_packed *hw)
422 {
423 pan_pack(hw, BIFROST_SAMPLER, cfg) {
424 cfg.magnify_linear = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR;
425 cfg.minify_linear = cso->min_img_filter == PIPE_TEX_FILTER_LINEAR;
426 cfg.mipmap_mode = pan_pipe_to_mipmode(cso->min_mip_filter);
427 cfg.normalized_coordinates = cso->normalized_coords;
428
429 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
430 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
431 cfg.maximum_lod = FIXED_16(cso->max_lod, false);
432
433 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
434 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
435 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
436
437 cfg.compare_function = panfrost_sampler_compare_func(cso);
438 cfg.seamless_cube_map = cso->seamless_cube_map;
439 }
440 }
441
442 static bool
443 panfrost_fs_required(
444 struct panfrost_shader_state *fs,
445 struct panfrost_blend_final *blend,
446 unsigned rt_count)
447 {
448 /* If we generally have side effects */
449 if (fs->fs_sidefx)
450 return true;
451
452 /* If colour is written we need to execute */
453 for (unsigned i = 0; i < rt_count; ++i) {
454 if (!blend[i].no_colour)
455 return true;
456 }
457
458 /* If depth is written and not implied we need to execute.
459 * TODO: Predicate on Z/S writes being enabled */
460 return (fs->writes_depth || fs->writes_stencil);
461 }
462
463 static void
464 panfrost_emit_blend(struct panfrost_batch *batch, void *rts,
465 struct panfrost_blend_final *blend)
466 {
467 const struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
468 struct panfrost_shader_state *fs = panfrost_get_shader_state(batch->ctx, PIPE_SHADER_FRAGMENT);
469 unsigned rt_count = batch->key.nr_cbufs;
470
471 struct bifrost_blend_rt *brts = rts;
472 struct midgard_blend_rt *mrts = rts;
473
474 /* Disable blending for depth-only on Bifrost */
475
476 if (rt_count == 0 && dev->quirks & IS_BIFROST)
477 brts[0].unk2 = 0x3;
478
479 for (unsigned i = 0; i < rt_count; ++i) {
480 unsigned flags = 0;
481
482 pan_pack(&flags, BLEND_FLAGS, cfg) {
483 if (blend[i].no_colour) {
484 cfg.enable = false;
485 break;
486 }
487
488 batch->draws |= (PIPE_CLEAR_COLOR0 << i);
489
490 cfg.srgb = util_format_is_srgb(batch->key.cbufs[i]->format);
491 cfg.load_destination = blend[i].load_dest;
492 cfg.dither_disable = !batch->ctx->blend->base.dither;
493
494 if (!(dev->quirks & IS_BIFROST))
495 cfg.midgard_blend_shader = blend[i].is_shader;
496 }
497
498 if (dev->quirks & IS_BIFROST) {
499 brts[i].flags = flags;
500
501 if (blend[i].is_shader) {
502 /* The blend shader's address needs to share
503 * the same top 32 bits as the fragment shader.
504 * TODO: Ensure that's always the case.
505 */
506 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
507 (fs->bo->gpu & (0xffffffffull << 32)));
508 brts[i].shader = blend[i].shader.gpu;
509 brts[i].unk2 = 0x0;
510 } else {
511 enum pipe_format format = batch->key.cbufs[i]->format;
512 const struct util_format_description *format_desc;
513 format_desc = util_format_description(format);
514
515 brts[i].equation = blend[i].equation.equation;
516
517 /* TODO: this is a bit more complicated */
518 brts[i].constant = blend[i].equation.constant;
519
520 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
521
522 /* 0x19 disables blending and forces REPLACE
523 * mode (equivalent to rgb_mode = alpha_mode =
524 * 0x122, colour mask = 0xF). 0x1a allows
525 * blending. */
526 brts[i].unk2 = blend[i].opaque ? 0x19 : 0x1a;
527
528 brts[i].shader_type = fs->blend_types[i];
529 }
530 } else {
531 memcpy(&mrts[i].flags, &flags, sizeof(flags));
532
533 if (blend[i].is_shader) {
534 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
535 } else {
536 mrts[i].blend.equation = blend[i].equation.equation;
537 mrts[i].blend.constant = blend[i].equation.constant;
538 }
539 }
540 }
541 }
542
543 static void
544 panfrost_emit_frag_shader(struct panfrost_context *ctx,
545 struct mali_shader_meta *fragmeta,
546 struct panfrost_blend_final *blend)
547 {
548 const struct panfrost_device *dev = pan_device(ctx->base.screen);
549 struct panfrost_shader_state *fs;
550
551 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
552
553 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
554 const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
555
556 memset(fragmeta, 0, sizeof(*fragmeta));
557
558 fragmeta->shader = fs->shader;
559 fragmeta->attribute_count = fs->attribute_count;
560 fragmeta->varying_count = fs->varying_count;
561 fragmeta->texture_count = ctx->sampler_view_count[PIPE_SHADER_FRAGMENT];
562 fragmeta->sampler_count = ctx->sampler_count[PIPE_SHADER_FRAGMENT];
563
564 if (dev->quirks & IS_BIFROST) {
565 /* First clause ATEST |= 0x4000000.
566 * Less than 32 regs |= 0x200 */
567 fragmeta->bifrost1.unk1 = 0x950020;
568
569 fragmeta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, PIPE_SHADER_FRAGMENT);
570 fragmeta->bifrost2.preload_regs = 0x1;
571 SET_BIT(fragmeta->bifrost2.preload_regs, 0x10, fs->reads_frag_coord);
572
573 fragmeta->bifrost2.uniform_count = fs->uniform_count;
574 } else {
575 fragmeta->midgard1.uniform_count = fs->uniform_count;
576 fragmeta->midgard1.work_count = fs->work_reg_count;
577
578 /* TODO: This is not conformant on ES3 */
579 fragmeta->midgard1.flags_hi = MALI_SUPPRESS_INF_NAN;
580
581 fragmeta->midgard1.flags_lo = 0x20;
582 fragmeta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, PIPE_SHADER_FRAGMENT);
583
584 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_GLOBAL, fs->writes_global);
585 }
586
587 bool msaa = rast->multisample;
588 fragmeta->coverage_mask = msaa ? ctx->sample_mask : ~0;
589
590 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x10;
591 fragmeta->unknown2_4 = 0x4e0;
592
593 if (dev->quirks & IS_BIFROST) {
594 /* TODO */
595 } else {
596 /* Depending on whether it's legal in the given shader, we try to
597 * enable early-z testing. TODO: respect e-z force */
598
599 SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
600 !fs->can_discard && !fs->writes_global &&
601 !fs->writes_depth && !fs->writes_stencil &&
602 !ctx->blend->base.alpha_to_coverage);
603
604 /* Add the writes Z/S flags if needed. */
605 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
606 SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
607
608 /* Any time texturing is used, derivatives are implicitly calculated,
609 * so we need to enable helper invocations */
610
611 SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
612 fs->helper_invocations);
613
614 /* If discard is enabled, which bit we set to convey this
615 * depends on if depth/stencil is used for the draw or not.
616 * Just one of depth OR stencil is enough to trigger this. */
617
618 bool zs_enabled =
619 fs->writes_depth || fs->writes_stencil ||
620 (zsa->base.depth.enabled && zsa->base.depth.func != PIPE_FUNC_ALWAYS) ||
621 zsa->base.stencil[0].enabled;
622
623 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_TILEBUFFER,
624 fs->outputs_read || (!zs_enabled && fs->can_discard));
625 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_ZS, zs_enabled && fs->can_discard);
626 }
627
628 /* TODO: Sample size */
629 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
630 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
631
632 /* EXT_shader_framebuffer_fetch requires the shader to be run
633 * per-sample when outputs are read. */
634 bool per_sample = ctx->min_samples > 1 || fs->outputs_read;
635 SET_BIT(fragmeta->unknown2_3, MALI_PER_SAMPLE, msaa && per_sample);
636
637 fragmeta->depth_units = rast->offset_units * 2.0f;
638 fragmeta->depth_factor = rast->offset_scale;
639
640 /* XXX: Which bit is which? Does this maybe allow offsetting non-tri primitives? */
641
642 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, rast->offset_tri);
643 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, rast->offset_tri);
644
645 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_NEAR, rast->depth_clip_near);
646 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_FAR, rast->depth_clip_far);
647
648 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
649 zsa->base.stencil[0].enabled);
650
651 fragmeta->stencil_mask_front = zsa->stencil_mask_front;
652 fragmeta->stencil_mask_back = zsa->stencil_mask_back;
653
654 /* Bottom bits for stencil ref, exactly one word */
655 fragmeta->stencil_front.opaque[0] = zsa->stencil_front.opaque[0] | ctx->stencil_ref.ref_value[0];
656
657 /* If back-stencil is not enabled, use the front values */
658
659 if (zsa->base.stencil[1].enabled)
660 fragmeta->stencil_back.opaque[0] = zsa->stencil_back.opaque[0] | ctx->stencil_ref.ref_value[1];
661 else
662 fragmeta->stencil_back = fragmeta->stencil_front;
663
664 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
665 zsa->base.depth.writemask);
666
667 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
668 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(
669 zsa->base.depth.enabled ? zsa->base.depth.func : PIPE_FUNC_ALWAYS));
670
671 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
672 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
673 !ctx->blend->base.dither);
674
675 SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
676
677 SET_BIT(fragmeta->unknown2_4, MALI_ALPHA_TO_COVERAGE,
678 ctx->blend->base.alpha_to_coverage);
679
680 /* Get blending setup */
681 unsigned rt_count = ctx->pipe_framebuffer.nr_cbufs;
682
683 /* Disable shader execution if we can */
684 if (dev->quirks & MIDGARD_SHADERLESS
685 && !panfrost_fs_required(fs, blend, rt_count)) {
686 fragmeta->shader = 0;
687 fragmeta->attribute_count = 0;
688 fragmeta->varying_count = 0;
689 fragmeta->texture_count = 0;
690 fragmeta->sampler_count = 0;
691
692 /* This feature is not known to work on Bifrost */
693 fragmeta->midgard1.work_count = 1;
694 fragmeta->midgard1.uniform_count = 0;
695 fragmeta->midgard1.uniform_buffer_count = 0;
696 }
697
698 /* If there is a blend shader, work registers are shared. We impose 8
699 * work registers as a limit for blend shaders. Should be lower XXX */
700
701 if (!(dev->quirks & IS_BIFROST)) {
702 for (unsigned c = 0; c < rt_count; ++c) {
703 if (blend[c].is_shader) {
704 fragmeta->midgard1.work_count =
705 MAX2(fragmeta->midgard1.work_count, 8);
706 }
707 }
708 }
709
710 if (dev->quirks & MIDGARD_SFBD) {
711 /* On platforms with only a single render target (SFBD), the blend
712 * information is inside the shader meta itself. We additionally
713 * need to signal CAN_DISCARD for nontrivial blend modes (so
714 * we're able to read back the destination buffer) */
715
716 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
717 blend[0].is_shader);
718
719 if (blend[0].is_shader) {
720 fragmeta->blend.shader = blend[0].shader.gpu |
721 blend[0].shader.first_tag;
722 } else {
723 fragmeta->blend.equation = blend[0].equation.equation;
724 fragmeta->blend.constant = blend[0].equation.constant;
725 }
726
727 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
728 blend[0].load_dest);
729 } else if (!(dev->quirks & IS_BIFROST)) {
730 /* Bug where MRT-capable hw apparently reads the last blend
731 * shader from here instead of the usual location? */
732
733 for (signed rt = ((signed) rt_count - 1); rt >= 0; --rt) {
734 if (!blend[rt].is_shader)
735 continue;
736
737 fragmeta->blend.shader = blend[rt].shader.gpu |
738 blend[rt].shader.first_tag;
739 break;
740 }
741 }
742
743 if (dev->quirks & IS_BIFROST) {
744 bool no_blend = true;
745
746 for (unsigned i = 0; i < rt_count; ++i)
747 no_blend &= (!blend[i].load_dest | blend[i].no_colour);
748
749 SET_BIT(fragmeta->bifrost1.unk1, MALI_BIFROST_EARLY_Z,
750 !fs->can_discard && !fs->writes_depth && no_blend);
751 }
752 }
753
754 void
755 panfrost_emit_shader_meta(struct panfrost_batch *batch,
756 enum pipe_shader_type st,
757 struct mali_vertex_tiler_postfix *postfix)
758 {
759 struct panfrost_context *ctx = batch->ctx;
760 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
761
762 if (!ss) {
763 postfix->shader = 0;
764 return;
765 }
766
767 struct mali_shader_meta meta;
768
769 /* Add the shader BO to the batch. */
770 panfrost_batch_add_bo(batch, ss->bo,
771 PAN_BO_ACCESS_PRIVATE |
772 PAN_BO_ACCESS_READ |
773 panfrost_bo_access_for_stage(st));
774
775 mali_ptr shader_ptr;
776
777 if (st == PIPE_SHADER_FRAGMENT) {
778 struct panfrost_device *dev = pan_device(ctx->base.screen);
779 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
780 size_t desc_size = sizeof(meta);
781 void *rts = NULL;
782 struct panfrost_transfer xfer;
783 unsigned rt_size;
784
785 if (dev->quirks & MIDGARD_SFBD)
786 rt_size = 0;
787 else if (dev->quirks & IS_BIFROST)
788 rt_size = sizeof(struct bifrost_blend_rt);
789 else
790 rt_size = sizeof(struct midgard_blend_rt);
791
792 desc_size += rt_size * rt_count;
793
794 if (rt_size)
795 rts = rzalloc_size(ctx, rt_size * rt_count);
796
797 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
798
799 for (unsigned c = 0; c < ctx->pipe_framebuffer.nr_cbufs; ++c)
800 blend[c] = panfrost_get_blend_for_context(ctx, c);
801
802 panfrost_emit_frag_shader(ctx, &meta, blend);
803
804 if (!(dev->quirks & MIDGARD_SFBD))
805 panfrost_emit_blend(batch, rts, blend);
806 else
807 batch->draws |= PIPE_CLEAR_COLOR0;
808
809 xfer = panfrost_pool_alloc_aligned(&batch->pool, desc_size, sizeof(meta));
810
811 memcpy(xfer.cpu, &meta, sizeof(meta));
812 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
813
814 if (rt_size)
815 ralloc_free(rts);
816
817 shader_ptr = xfer.gpu;
818 } else {
819 panfrost_emit_compute_shader(ctx, st, &meta);
820
821 shader_ptr = panfrost_pool_upload(&batch->pool, &meta,
822 sizeof(meta));
823 }
824
825 postfix->shader = shader_ptr;
826 }
827
828 void
829 panfrost_emit_viewport(struct panfrost_batch *batch,
830 struct mali_vertex_tiler_postfix *tiler_postfix)
831 {
832 struct panfrost_context *ctx = batch->ctx;
833 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
834 const struct pipe_scissor_state *ss = &ctx->scissor;
835 const struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
836 const struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer;
837
838 /* Derive min/max from translate/scale. Note since |x| >= 0 by
839 * definition, we have that -|x| <= |x| hence translate - |scale| <=
840 * translate + |scale|, so the ordering is correct here. */
841 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
842 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
843 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
844 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
845 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
846 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
847
848 /* Scissor to the intersection of viewport and to the scissor, clamped
849 * to the framebuffer */
850
851 unsigned minx = MIN2(fb->width, vp_minx);
852 unsigned maxx = MIN2(fb->width, vp_maxx);
853 unsigned miny = MIN2(fb->height, vp_miny);
854 unsigned maxy = MIN2(fb->height, vp_maxy);
855
856 if (ss && rast->scissor) {
857 minx = MAX2(ss->minx, minx);
858 miny = MAX2(ss->miny, miny);
859 maxx = MIN2(ss->maxx, maxx);
860 maxy = MIN2(ss->maxy, maxy);
861 }
862
863 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool, MALI_VIEWPORT_LENGTH);
864
865 pan_pack(T.cpu, VIEWPORT, cfg) {
866 cfg.scissor_minimum_x = minx;
867 cfg.scissor_minimum_y = miny;
868 cfg.scissor_maximum_x = maxx - 1;
869 cfg.scissor_maximum_y = maxy - 1;
870
871 cfg.minimum_z = rast->depth_clip_near ? minz : -INFINITY;
872 cfg.maximum_z = rast->depth_clip_far ? maxz : INFINITY;
873 }
874
875 tiler_postfix->viewport = T.gpu;
876 panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy);
877 }
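/* Worked example for the viewport math above (illustrative numbers): a
 * standard 800x600 viewport anchored at (0, 0) has scale = (400, 300, ...)
 * and translate = (400, 300, ...), so vp_minx = 400 - 400 = 0,
 * vp_maxx = 800, vp_miny = 0 and vp_maxy = 600, which is then intersected
 * with the scissor and clamped to the framebuffer size. */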
878
879 static mali_ptr
880 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
881 enum pipe_shader_type st,
882 struct panfrost_constant_buffer *buf,
883 unsigned index)
884 {
885 struct pipe_constant_buffer *cb = &buf->cb[index];
886 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
887
888 if (rsrc) {
889 panfrost_batch_add_bo(batch, rsrc->bo,
890 PAN_BO_ACCESS_SHARED |
891 PAN_BO_ACCESS_READ |
892 panfrost_bo_access_for_stage(st));
893
894 /* Alignment guaranteed by
895 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
896 return rsrc->bo->gpu + cb->buffer_offset;
897 } else if (cb->user_buffer) {
898 return panfrost_pool_upload_aligned(&batch->pool,
899 cb->user_buffer +
900 cb->buffer_offset,
901 cb->buffer_size, 16);
902 } else {
903 unreachable("No constant buffer");
904 }
905 }
906
907 struct sysval_uniform {
908 union {
909 float f[4];
910 int32_t i[4];
911 uint32_t u[4];
912 uint64_t du[2];
913 };
914 };
915
916 static void
917 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
918 struct sysval_uniform *uniform)
919 {
920 struct panfrost_context *ctx = batch->ctx;
921 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
922
923 uniform->f[0] = vp->scale[0];
924 uniform->f[1] = vp->scale[1];
925 uniform->f[2] = vp->scale[2];
926 }
927
928 static void
929 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
930 struct sysval_uniform *uniform)
931 {
932 struct panfrost_context *ctx = batch->ctx;
933 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
934
935 uniform->f[0] = vp->translate[0];
936 uniform->f[1] = vp->translate[1];
937 uniform->f[2] = vp->translate[2];
938 }
939
940 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
941 enum pipe_shader_type st,
942 unsigned int sysvalid,
943 struct sysval_uniform *uniform)
944 {
945 struct panfrost_context *ctx = batch->ctx;
946 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
947 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
948 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
949 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
950
951 assert(dim);
952 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
953
954 if (dim > 1)
955 uniform->i[1] = u_minify(tex->texture->height0,
956 tex->u.tex.first_level);
957
958 if (dim > 2)
959 uniform->i[2] = u_minify(tex->texture->depth0,
960 tex->u.tex.first_level);
961
962 if (is_array)
963 uniform->i[dim] = tex->texture->array_size;
964 }
965
966 static void
967 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
968 enum pipe_shader_type st,
969 unsigned ssbo_id,
970 struct sysval_uniform *uniform)
971 {
972 struct panfrost_context *ctx = batch->ctx;
973
974 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
975 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
976
977 /* Compute address */
978 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
979
980 panfrost_batch_add_bo(batch, bo,
981 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
982 panfrost_bo_access_for_stage(st));
983
984 /* Upload address and size as sysval */
985 uniform->du[0] = bo->gpu + sb.buffer_offset;
986 uniform->u[2] = sb.buffer_size;
987 }
988
989 static void
990 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
991 enum pipe_shader_type st,
992 unsigned samp_idx,
993 struct sysval_uniform *uniform)
994 {
995 struct panfrost_context *ctx = batch->ctx;
996 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
997
998 uniform->f[0] = sampl->min_lod;
999 uniform->f[1] = sampl->max_lod;
1000 uniform->f[2] = sampl->lod_bias;
1001
1002 /* Even without any errata, Midgard represents "no mipmapping" as
1003 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1004 * panfrost_create_sampler_state which also explains our choice of
1005 * epsilon value (again to keep behaviour consistent) */
1006
1007 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1008 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1009 }
1010
1011 static void
1012 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1013 struct sysval_uniform *uniform)
1014 {
1015 struct panfrost_context *ctx = batch->ctx;
1016
1017 uniform->u[0] = ctx->compute_grid->grid[0];
1018 uniform->u[1] = ctx->compute_grid->grid[1];
1019 uniform->u[2] = ctx->compute_grid->grid[2];
1020 }
1021
1022 static void
1023 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1024 struct panfrost_shader_state *ss,
1025 enum pipe_shader_type st)
1026 {
1027 struct sysval_uniform *uniforms = (void *)buf;
1028
1029 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1030 int sysval = ss->sysval[i];
1031
1032 switch (PAN_SYSVAL_TYPE(sysval)) {
1033 case PAN_SYSVAL_VIEWPORT_SCALE:
1034 panfrost_upload_viewport_scale_sysval(batch,
1035 &uniforms[i]);
1036 break;
1037 case PAN_SYSVAL_VIEWPORT_OFFSET:
1038 panfrost_upload_viewport_offset_sysval(batch,
1039 &uniforms[i]);
1040 break;
1041 case PAN_SYSVAL_TEXTURE_SIZE:
1042 panfrost_upload_txs_sysval(batch, st,
1043 PAN_SYSVAL_ID(sysval),
1044 &uniforms[i]);
1045 break;
1046 case PAN_SYSVAL_SSBO:
1047 panfrost_upload_ssbo_sysval(batch, st,
1048 PAN_SYSVAL_ID(sysval),
1049 &uniforms[i]);
1050 break;
1051 case PAN_SYSVAL_NUM_WORK_GROUPS:
1052 panfrost_upload_num_work_groups_sysval(batch,
1053 &uniforms[i]);
1054 break;
1055 case PAN_SYSVAL_SAMPLER:
1056 panfrost_upload_sampler_sysval(batch, st,
1057 PAN_SYSVAL_ID(sysval),
1058 &uniforms[i]);
1059 break;
1060 default:
1061 assert(0);
1062 }
1063 }
1064 }
1065
1066 static const void *
1067 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1068 unsigned index)
1069 {
1070 struct pipe_constant_buffer *cb = &buf->cb[index];
1071 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1072
1073 if (rsrc)
1074 return rsrc->bo->cpu;
1075 else if (cb->user_buffer)
1076 return cb->user_buffer;
1077 else
1078 unreachable("No constant buffer");
1079 }
1080
1081 void
1082 panfrost_emit_const_buf(struct panfrost_batch *batch,
1083 enum pipe_shader_type stage,
1084 struct mali_vertex_tiler_postfix *postfix)
1085 {
1086 struct panfrost_context *ctx = batch->ctx;
1087 struct panfrost_shader_variants *all = ctx->shader[stage];
1088
1089 if (!all)
1090 return;
1091
1092 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1093
1094 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1095
1096 /* Uniforms are implicitly UBO #0 */
1097 bool has_uniforms = buf->enabled_mask & (1 << 0);
1098
1099 /* Allocate room for the sysvals and the uniforms */
1100 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1101 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1102 size_t size = sys_size + uniform_size;
1103 struct panfrost_transfer transfer =
1104 panfrost_pool_alloc_aligned(&batch->pool, size, 16);
1105
1106 /* Upload sysvals requested by the shader */
1107 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1108
1109 /* Upload uniforms */
1110 if (has_uniforms && uniform_size) {
1111 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1112 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1113 }
1114
1115 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1116 * uploaded */
1117
1118 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1119 assert(ubo_count >= 1);
1120
1121 size_t sz = MALI_UNIFORM_BUFFER_LENGTH * ubo_count;
1122 struct panfrost_transfer ubos =
1123 panfrost_pool_alloc_aligned(&batch->pool, sz,
1124 MALI_UNIFORM_BUFFER_LENGTH);
1125
1126 uint64_t *ubo_ptr = (uint64_t *) ubos.cpu;
1127
1128 /* Upload uniforms as a UBO */
1129
1130 if (size) {
1131 pan_pack(ubo_ptr, UNIFORM_BUFFER, cfg) {
1132 cfg.entries = DIV_ROUND_UP(size, 16);
1133 cfg.pointer = transfer.gpu;
1134 }
1135 } else {
1136 *ubo_ptr = 0;
1137 }
1138
1139 /* The rest are honest-to-goodness UBOs */
1140
1141 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1142 size_t usz = buf->cb[ubo].buffer_size;
1143 bool enabled = buf->enabled_mask & (1 << ubo);
1144 bool empty = usz == 0;
1145
1146 if (!enabled || empty) {
1147 ubo_ptr[ubo] = 0;
1148 continue;
1149 }
1150
1151 pan_pack(ubo_ptr + ubo, UNIFORM_BUFFER, cfg) {
1152 cfg.entries = DIV_ROUND_UP(usz, 16);
1153 cfg.pointer = panfrost_map_constant_buffer_gpu(batch,
1154 stage, buf, ubo);
1155 }
1156 }
1157
1158 postfix->uniforms = transfer.gpu;
1159 postfix->uniform_buffers = ubos.gpu;
1160
1161 buf->dirty_mask = 0;
1162 }
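/* Illustrative layout of the UBO #0 built above (not upstream code): with,
 * say, three vec4 sysvals (48 bytes) followed by 100 bytes of user uniforms,
 * the combined allocation is 148 bytes and the descriptor advertises
 * DIV_ROUND_UP(148, 16) = 10 sixteen-byte entries; the user-visible uniforms
 * therefore start 48 bytes into UBO #0. */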
1163
1164 void
1165 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1166 const struct pipe_grid_info *info,
1167 struct midgard_payload_vertex_tiler *vtp)
1168 {
1169 struct panfrost_context *ctx = batch->ctx;
1170 struct panfrost_device *dev = pan_device(ctx->base.screen);
1171 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1172 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1173 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1174 128));
1175
1176 unsigned log2_instances =
1177 util_logbase2_ceil(info->grid[0]) +
1178 util_logbase2_ceil(info->grid[1]) +
1179 util_logbase2_ceil(info->grid[2]);
1180
1181 unsigned shared_size = single_size * (1 << log2_instances) * dev->core_count;
1182 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1183 shared_size,
1184 1);
1185
1186 struct mali_shared_memory shared = {
1187 .shared_memory = bo->gpu,
1188 .shared_workgroup_count = log2_instances,
1189 .shared_shift = util_logbase2(single_size) + 1
1190 };
1191
1192 vtp->postfix.shared_memory = panfrost_pool_upload_aligned(&batch->pool, &shared,
1193 sizeof(shared), 64);
1194 }
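/* Worked example for the shared memory sizing above (illustrative numbers):
 * for a grid of (3, 4, 1) workgroups, log2_instances = 2 + 2 + 0 = 4, i.e.
 * room for 16 concurrent workgroups per core. If the shader needs 200 bytes
 * of shared storage, single_size rounds up to 256, so a hypothetical 4-core
 * GPU allocates 256 * 16 * 4 = 16384 bytes, and shared_shift becomes
 * log2(256) + 1 = 9. */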
1195
1196 static mali_ptr
1197 panfrost_get_tex_desc(struct panfrost_batch *batch,
1198 enum pipe_shader_type st,
1199 struct panfrost_sampler_view *view)
1200 {
1201 if (!view)
1202 return (mali_ptr) 0;
1203
1204 struct pipe_sampler_view *pview = &view->base;
1205 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1206
1207 /* Add the BO to the job so it's retained until the job is done. */
1208
1209 panfrost_batch_add_bo(batch, rsrc->bo,
1210 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1211 panfrost_bo_access_for_stage(st));
1212
1213 panfrost_batch_add_bo(batch, view->bo,
1214 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1215 panfrost_bo_access_for_stage(st));
1216
1217 return view->bo->gpu;
1218 }
1219
1220 static void
1221 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
1222 struct pipe_context *pctx)
1223 {
1224 struct panfrost_resource *rsrc = pan_resource(view->base.texture);
1225 if (view->texture_bo != rsrc->bo->gpu ||
1226 view->modifier != rsrc->modifier) {
1227 panfrost_bo_unreference(view->bo);
1228 panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
1229 }
1230 }
1231
1232 void
1233 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1234 enum pipe_shader_type stage,
1235 struct mali_vertex_tiler_postfix *postfix)
1236 {
1237 struct panfrost_context *ctx = batch->ctx;
1238 struct panfrost_device *device = pan_device(ctx->base.screen);
1239
1240 if (!ctx->sampler_view_count[stage])
1241 return;
1242
1243 if (device->quirks & IS_BIFROST) {
1244 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1245 MALI_BIFROST_TEXTURE_LENGTH *
1246 ctx->sampler_view_count[stage],
1247 MALI_BIFROST_TEXTURE_LENGTH);
1248
1249 struct mali_bifrost_texture_packed *out =
1250 (struct mali_bifrost_texture_packed *) T.cpu;
1251
1252 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1253 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1254 struct pipe_sampler_view *pview = &view->base;
1255 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1256
1257 panfrost_update_sampler_view(view, &ctx->base);
1258 out[i] = view->bifrost_descriptor;
1259
1260 /* Add the BOs to the job so they are retained until the job is done. */
1261
1262 panfrost_batch_add_bo(batch, rsrc->bo,
1263 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1264 panfrost_bo_access_for_stage(stage));
1265
1266 panfrost_batch_add_bo(batch, view->bo,
1267 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1268 panfrost_bo_access_for_stage(stage));
1269 }
1270
1271 postfix->textures = T.gpu;
1272 } else {
1273 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1274
1275 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1276 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1277
1278 panfrost_update_sampler_view(view, &ctx->base);
1279
1280 trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1281 }
1282
1283 postfix->textures = panfrost_pool_upload_aligned(&batch->pool,
1284 trampolines,
1285 sizeof(uint64_t) *
1286 ctx->sampler_view_count[stage],
1287 sizeof(uint64_t));
1288 }
1289 }
1290
1291 void
1292 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1293 enum pipe_shader_type stage,
1294 struct mali_vertex_tiler_postfix *postfix)
1295 {
1296 struct panfrost_context *ctx = batch->ctx;
1297
1298 if (!ctx->sampler_count[stage])
1299 return;
1300
1301 size_t desc_size = MALI_BIFROST_SAMPLER_LENGTH;
1302 assert(MALI_BIFROST_SAMPLER_LENGTH == MALI_MIDGARD_SAMPLER_LENGTH);
1303
1304 size_t sz = desc_size * ctx->sampler_count[stage];
1305 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool, sz, desc_size);
1306 struct mali_midgard_sampler_packed *out = (struct mali_midgard_sampler_packed *) T.cpu;
1307
1308 for (unsigned i = 0; i < ctx->sampler_count[stage]; ++i)
1309 out[i] = ctx->samplers[stage][i]->hw;
1310
1311 postfix->sampler_descriptor = T.gpu;
1312 }
1313
1314 void
1315 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1316 struct mali_vertex_tiler_postfix *vertex_postfix)
1317 {
1318 struct panfrost_context *ctx = batch->ctx;
1319 struct panfrost_vertex_state *so = ctx->vertex;
1320 struct panfrost_shader_state *vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1321
1322 unsigned instance_shift = vertex_postfix->instance_shift;
1323 unsigned instance_odd = vertex_postfix->instance_odd;
1324
1325 /* Worst case: everything is NPOT, which is only possible if instancing
1326 * is enabled. Otherwise a single record is guaranteed */
1327 bool could_npot = instance_shift || instance_odd;
1328
1329 struct panfrost_transfer S = panfrost_pool_alloc_aligned(&batch->pool,
1330 MALI_ATTRIBUTE_BUFFER_LENGTH * vs->attribute_count *
1331 (could_npot ? 2 : 1),
1332 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
1333
1334 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1335 MALI_ATTRIBUTE_LENGTH * vs->attribute_count,
1336 MALI_ATTRIBUTE_LENGTH);
1337
1338 struct mali_attribute_buffer_packed *bufs =
1339 (struct mali_attribute_buffer_packed *) S.cpu;
1340
1341 struct mali_attribute_packed *out =
1342 (struct mali_attribute_packed *) T.cpu;
1343
1344 unsigned attrib_to_buffer[PIPE_MAX_ATTRIBS] = { 0 };
1345 unsigned k = 0;
1346
1347 for (unsigned i = 0; i < so->num_elements; ++i) {
1348 /* We map buffers 1:1 with the attributes, which
1349 * means duplicating some vertex buffers (who cares? aside from
1350 * maybe some caching implications but I somehow doubt that
1351 * matters) */
1352
1353 struct pipe_vertex_element *elem = &so->pipe[i];
1354 unsigned vbi = elem->vertex_buffer_index;
1355 attrib_to_buffer[i] = k;
1356
1357 if (!(ctx->vb_mask & (1 << vbi)))
1358 continue;
1359
1360 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1361 struct panfrost_resource *rsrc;
1362
1363 rsrc = pan_resource(buf->buffer.resource);
1364 if (!rsrc)
1365 continue;
1366
1367 /* Add a dependency of the batch on the vertex buffer */
1368 panfrost_batch_add_bo(batch, rsrc->bo,
1369 PAN_BO_ACCESS_SHARED |
1370 PAN_BO_ACCESS_READ |
1371 PAN_BO_ACCESS_VERTEX_TILER);
1372
1373 /* Mask off lower bits, see offset fixup below */
1374 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1375 mali_ptr addr = raw_addr & ~63;
1376
1377 /* Since we advanced the base pointer, we shrink the buffer
1378 * size, but add the offset we subtracted */
1379 unsigned size = rsrc->base.width0 + (raw_addr - addr)
1380 - buf->buffer_offset;
1381
1382 /* When there is a divisor, the hardware-level divisor is
1383 * the product of the instance divisor and the padded count */
1384 unsigned divisor = elem->instance_divisor;
1385 unsigned hw_divisor = ctx->padded_count * divisor;
1386 unsigned stride = buf->stride;
1387
1388 /* If there's a divisor(=1) but no instancing, we want every
1389 * attribute to be the same */
1390
1391 if (divisor && ctx->instance_count == 1)
1392 stride = 0;
1393
1394 if (!divisor || ctx->instance_count <= 1) {
1395 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1396 if (ctx->instance_count > 1)
1397 cfg.type = MALI_ATTRIBUTE_TYPE_1D_MODULUS;
1398
1399 cfg.pointer = addr;
1400 cfg.stride = stride;
1401 cfg.size = size;
1402 cfg.divisor_r = instance_shift;
1403 cfg.divisor_p = instance_odd;
1404 }
1405 } else if (util_is_power_of_two_or_zero(hw_divisor)) {
1406 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1407 cfg.type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
1408 cfg.pointer = addr;
1409 cfg.stride = stride;
1410 cfg.size = size;
1411 cfg.divisor_r = __builtin_ctz(hw_divisor);
1412 }
1413
1414 } else {
1415 unsigned shift = 0, extra_flags = 0;
1416
1417 unsigned magic_divisor =
1418 panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags);
1419
1420 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1421 cfg.type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
1422 cfg.pointer = addr;
1423 cfg.stride = stride;
1424 cfg.size = size;
1425
1426 cfg.divisor_r = shift;
1427 cfg.divisor_e = extra_flags;
1428 }
1429
1430 pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) {
1431 cfg.divisor_numerator = magic_divisor;
1432 cfg.divisor = divisor;
1433 }
1434
1435 ++k;
1436 }
1437
1438 ++k;
1439 }
1440
1441 /* Add special gl_VertexID/gl_InstanceID buffers */
1442
1443 if (unlikely(vs->attribute_count >= PAN_VERTEX_ID)) {
1444 panfrost_vertex_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1445
1446 pan_pack(out + PAN_VERTEX_ID, ATTRIBUTE, cfg) {
1447 cfg.buffer_index = k++;
1448 cfg.format = so->formats[PAN_VERTEX_ID];
1449 }
1450
1451 panfrost_instance_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1452
1453 pan_pack(out + PAN_INSTANCE_ID, ATTRIBUTE, cfg) {
1454 cfg.buffer_index = k++;
1455 cfg.format = so->formats[PAN_INSTANCE_ID];
1456 }
1457 }
1458
1459 /* Attribute addresses require 64-byte alignment, so let:
1460 *
1461 * base' = base & ~63 = base - (base & 63)
1462 * offset' = offset + (base & 63)
1463 *
1464 * Since base' + offset' = base + offset, these are equivalent
1465 * addressing modes and now base is 64 aligned.
1466 */
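/* For instance (illustrative numbers): with base = 0x10070 and offset = 8,
 * base & 63 = 0x30, so base' = 0x10040 and offset' = 0x38; the sum is still
 * 0x10078, but the buffer pointer is now 64-byte aligned. */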
1467
1468 unsigned start = vertex_postfix->offset_start;
1469
1470 for (unsigned i = 0; i < so->num_elements; ++i) {
1471 unsigned vbi = so->pipe[i].vertex_buffer_index;
1472 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1473
1474 /* Adjust by the masked off bits of the offset. Make sure we
1475 * read src_offset from so->hw (which is not GPU visible)
1476 * rather than target (which is) due to caching effects */
1477
1478 unsigned src_offset = so->pipe[i].src_offset;
1479
1480 /* BOs aligned to 4k so guaranteed aligned to 64 */
1481 src_offset += (buf->buffer_offset & 63);
1482
1483 /* Also, somewhat obscurely, per-instance data needs to be
1484 * offset in response to a delayed start in an indexed draw */
1485
1486 if (so->pipe[i].instance_divisor && ctx->instance_count > 1 && start)
1487 src_offset -= buf->stride * start;
1488
1489 pan_pack(out + i, ATTRIBUTE, cfg) {
1490 cfg.buffer_index = attrib_to_buffer[i];
1491 cfg.format = so->formats[i];
1492 cfg.offset = src_offset;
1493 }
1494 }
1495
1496 vertex_postfix->attributes = S.gpu;
1497 vertex_postfix->attribute_meta = T.gpu;
1498 }
1499
1500 static mali_ptr
1501 panfrost_emit_varyings(struct panfrost_batch *batch,
1502 struct mali_attribute_buffer_packed *slot,
1503 unsigned stride, unsigned count)
1504 {
1505 unsigned size = stride * count;
1506 mali_ptr ptr = panfrost_pool_alloc_aligned(&batch->invisible_pool, size, 64).gpu;
1507
1508 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1509 cfg.stride = stride;
1510 cfg.size = size;
1511 cfg.pointer = ptr;
1512 }
1513
1514 return ptr;
1515 }
1516
1517 static unsigned
1518 panfrost_streamout_offset(unsigned stride, unsigned offset,
1519 struct pipe_stream_output_target *target)
1520 {
1521 return (target->buffer_offset + (offset * stride * 4)) & 63;
1522 }
1523
1524 static void
1525 panfrost_emit_streamout(struct panfrost_batch *batch,
1526 struct mali_attribute_buffer_packed *slot,
1527 unsigned stride_words, unsigned offset, unsigned count,
1528 struct pipe_stream_output_target *target)
1529 {
1530 unsigned stride = stride_words * 4;
1531 unsigned max_size = target->buffer_size;
1532 unsigned expected_size = stride * count;
1533
1534 /* Grab the BO and bind it to the batch */
1535 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1536
1537 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1538 * the perspective of the TILER and FRAGMENT.
1539 */
1540 panfrost_batch_add_bo(batch, bo,
1541 PAN_BO_ACCESS_SHARED |
1542 PAN_BO_ACCESS_RW |
1543 PAN_BO_ACCESS_VERTEX_TILER |
1544 PAN_BO_ACCESS_FRAGMENT);
1545
1546 /* We will have an offset applied to get alignment */
1547 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * stride);
1548
1549 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1550 cfg.pointer = (addr & ~63);
1551 cfg.stride = stride;
1552 cfg.size = MIN2(max_size, expected_size) + (addr & 63);
1553 }
1554 }
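/* Illustrative example tying the two streamout helpers together (not
 * upstream code): with buffer_offset = 100, stride_words = 4 (stride = 16
 * bytes) and 3 vertices already emitted, the write address is
 * bo->gpu + 100 + 48; since BOs are page aligned, its low bits are
 * (100 + 48) & 63 = 20. The record above therefore points at the
 * 64-byte-aligned address 20 bytes earlier and grows the size by 20, while
 * panfrost_streamout_offset() returns the same 20 so the varying records can
 * add it back as their offset. */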
1555
1556 static bool
1557 has_point_coord(unsigned mask, gl_varying_slot loc)
1558 {
1559 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1560 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1561 else if (loc == VARYING_SLOT_PNTC)
1562 return (mask & (1 << 8));
1563 else
1564 return false;
1565 }
1566
1567 /* Helpers for manipulating stream out information so we can pack varyings
1568 * accordingly. Compute the src_offset for a given captured varying */
1569
1570 static struct pipe_stream_output *
1571 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1572 {
1573 for (unsigned i = 0; i < info->num_outputs; ++i) {
1574 if (info->output[i].register_index == loc)
1575 return &info->output[i];
1576 }
1577
1578 unreachable("Varying not captured");
1579 }
1580
1581 static unsigned
1582 pan_varying_size(enum mali_format fmt)
1583 {
1584 unsigned type = MALI_EXTRACT_TYPE(fmt);
1585 unsigned chan = MALI_EXTRACT_CHANNELS(fmt);
1586 unsigned bits = MALI_EXTRACT_BITS(fmt);
1587 unsigned bpc = 0;
1588
1589 if (bits == MALI_CHANNEL_FLOAT) {
1590 /* No doubles */
1591 bool fp16 = (type == MALI_FORMAT_SINT);
1592 assert(fp16 || (type == MALI_FORMAT_UNORM));
1593
1594 bpc = fp16 ? 2 : 4;
1595 } else {
1596 assert(type >= MALI_FORMAT_SNORM && type <= MALI_FORMAT_SINT);
1597
1598 /* See the enums */
1599 bits = 1 << bits;
1600 assert(bits >= 8);
1601 bpc = bits / 8;
1602 }
1603
1604 return bpc * chan;
1605 }
1606
1607 /* Indices for named (non-XFB) varyings that are present. These are packed
1608 * tightly so they correspond to a bitfield present (P) indexed by (1 <<
1609 * PAN_VARY_*). This has the nice property that you can lookup the buffer index
1610 * of a given special field given a shift S by:
1611 *
1612 * idx = popcount(P & ((1 << S) - 1))
1613 *
1614 * That is, look at all of the varyings that come earlier and count them; that
1615 * count is the index of the new one. Likewise, the total number of special
1616 * buffers required is simply popcount(P)
1617 */
1618
1619 enum pan_special_varying {
1620 PAN_VARY_GENERAL = 0,
1621 PAN_VARY_POSITION = 1,
1622 PAN_VARY_PSIZ = 2,
1623 PAN_VARY_PNTCOORD = 3,
1624 PAN_VARY_FACE = 4,
1625 PAN_VARY_FRAGCOORD = 5,
1626
1627 /* Keep last */
1628 PAN_VARY_MAX,
1629 };
1630
1631 /* Given a varying, figure out which index it corresponds to */
1632
1633 static inline unsigned
1634 pan_varying_index(unsigned present, enum pan_special_varying v)
1635 {
1636 unsigned mask = (1 << v) - 1;
1637 return util_bitcount(present & mask);
1638 }
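/* Worked example (illustrative): if only the general, position and point
 * coordinate buffers are present, present = 0b1011. Looking up
 * PAN_VARY_PNTCOORD (bit 3) masks with (1 << 3) - 1 = 0b0111 and counts
 * popcount(0b0011) = 2, so the point coordinate buffer is record #2, right
 * after the general and position buffers. */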
1639
1640 /* Get the base offset for XFB buffers, which by convention come after
1641 * everything else. Wrapper function for semantic reasons; by construction this
1642 * is just popcount. */
1643
1644 static inline unsigned
1645 pan_xfb_base(unsigned present)
1646 {
1647 return util_bitcount(present);
1648 }
1649
1650 /* Computes the present mask for varyings so we can start emitting varying records */
1651
1652 static inline unsigned
1653 pan_varying_present(
1654 struct panfrost_shader_state *vs,
1655 struct panfrost_shader_state *fs,
1656 unsigned quirks)
1657 {
1658 /* At the moment we always emit general and position buffers. Not
1659 * strictly necessary but usually harmless */
1660
1661 unsigned present = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION);
1662
1663 /* Enable special buffers by the shader info */
1664
1665 if (vs->writes_point_size)
1666 present |= (1 << PAN_VARY_PSIZ);
1667
1668 if (fs->reads_point_coord)
1669 present |= (1 << PAN_VARY_PNTCOORD);
1670
1671 if (fs->reads_face)
1672 present |= (1 << PAN_VARY_FACE);
1673
1674 if (fs->reads_frag_coord && !(quirks & IS_BIFROST))
1675 present |= (1 << PAN_VARY_FRAGCOORD);
1676
1677 /* Also, if we have a point sprite, we need a point coord buffer */
1678
1679 for (unsigned i = 0; i < fs->varying_count; i++) {
1680 gl_varying_slot loc = fs->varyings_loc[i];
1681
1682 if (has_point_coord(fs->point_sprite_mask, loc))
1683 present |= (1 << PAN_VARY_PNTCOORD);
1684 }
1685
1686 return present;
1687 }
1688
1689 /* Emitters for varying records */
1690
1691 static void
1692 pan_emit_vary(struct mali_attribute_packed *out,
1693 unsigned present, enum pan_special_varying buf,
1694 unsigned quirks, enum mali_format format,
1695 unsigned offset)
1696 {
1697 unsigned nr_channels = MALI_EXTRACT_CHANNELS(format);
1698 unsigned swizzle = quirks & HAS_SWIZZLES ?
1699 panfrost_get_default_swizzle(nr_channels) :
1700 panfrost_bifrost_swizzle(nr_channels);
1701
1702 pan_pack(out, ATTRIBUTE, cfg) {
1703 cfg.buffer_index = pan_varying_index(present, buf);
1704 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1705 cfg.format = (format << 12) | swizzle;
1706 cfg.offset = offset;
1707 }
1708 }
1709
1710 /* General varying that is unused */
1711
1712 static void
1713 pan_emit_vary_only(struct mali_attribute_packed *out,
1714 unsigned present, unsigned quirks)
1715 {
1716 pan_emit_vary(out, present, 0, quirks, MALI_VARYING_DISCARD, 0);
1717 }
1718
1719 /* Special records */
1720
1721 static const enum mali_format pan_varying_formats[PAN_VARY_MAX] = {
1722 [PAN_VARY_POSITION] = MALI_VARYING_POS,
1723 [PAN_VARY_PSIZ] = MALI_R16F,
1724 [PAN_VARY_PNTCOORD] = MALI_R16F,
1725 [PAN_VARY_FACE] = MALI_R32I,
1726 [PAN_VARY_FRAGCOORD] = MALI_RGBA32F
1727 };
1728
1729 static void
1730 pan_emit_vary_special(struct mali_attribute_packed *out,
1731 unsigned present, enum pan_special_varying buf,
1732 unsigned quirks)
1733 {
1734 assert(buf < PAN_VARY_MAX);
1735 pan_emit_vary(out, present, buf, quirks, pan_varying_formats[buf], 0);
1736 }
1737
1738 static enum mali_format
1739 pan_xfb_format(enum mali_format format, unsigned nr)
1740 {
1741 if (MALI_EXTRACT_BITS(format) == MALI_CHANNEL_FLOAT)
1742 return MALI_R32F | MALI_NR_CHANNELS(nr);
1743 else
1744 return MALI_EXTRACT_TYPE(format) | MALI_NR_CHANNELS(nr) | MALI_CHANNEL_32;
1745 }
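/* Intent of the above (assuming MALI_EXTRACT_BITS / MALI_EXTRACT_TYPE behave
 * as their names suggest): a float varying captured with nr components is
 * promoted to an nr-channel 32-bit float format (e.g. an fp16 vec2 captured
 * with 2 components becomes a 2-channel fp32 format), while a non-float
 * varying keeps its base type but is widened to 32 bits per channel. */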
1746
1747 /* Transform feedback records. Note struct pipe_stream_output is (if packed as
1748 * a bitfield) 32-bit, smaller than a 64-bit pointer, so may as well pass by
1749 * value. */
1750
1751 static void
1752 pan_emit_vary_xfb(struct mali_attribute_packed *out,
1753 unsigned present,
1754 unsigned max_xfb,
1755 unsigned *streamout_offsets,
1756 unsigned quirks,
1757 enum mali_format format,
1758 struct pipe_stream_output o)
1759 {
1760 unsigned swizzle = quirks & HAS_SWIZZLES ?
1761 panfrost_get_default_swizzle(o.num_components) :
1762 panfrost_bifrost_swizzle(o.num_components);
1763
1764 pan_pack(out, ATTRIBUTE, cfg) {
1765 /* XFB buffers come after everything else */
1766 cfg.buffer_index = pan_xfb_base(present) + o.output_buffer;
1767 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1768
1769 /* Override number of channels and precision to highp */
1770 cfg.format = (pan_xfb_format(format, o.num_components) << 12) | swizzle;
1771
1772                 /* Combine the capture's dword offset with the buffer's streamout offset */
1773 cfg.offset = (o.dst_offset * 4) /* dwords */
1774 + streamout_offsets[o.output_buffer];
1775 }
1776 }
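/* Worked offset example (values made up for illustration): a capture declared
 * at dst_offset 3 into XFB buffer 1, with that buffer's running streamout
 * offset at 64 bytes, lands at byte offset 3 * 4 + 64 = 76 within attribute
 * buffer (pan_xfb_base(present) + 1). */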
1777
1778 /* Determine if we should capture a varying for XFB. This requires actually
1779  * having a buffer for it. If we don't capture it, we fall back to a general
1780  * varying path (linked or unlinked, possibly discarding the write) */
1781
1782 static bool
1783 panfrost_xfb_captured(struct panfrost_shader_state *xfb,
1784 unsigned loc, unsigned max_xfb)
1785 {
1786 if (!(xfb->so_mask & (1ll << loc)))
1787 return false;
1788
1789 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1790 return o->output_buffer < max_xfb;
1791 }
1792
1793 static void
1794 pan_emit_general_varying(struct mali_attribute_packed *out,
1795 struct panfrost_shader_state *other,
1796 struct panfrost_shader_state *xfb,
1797 gl_varying_slot loc,
1798 enum mali_format format,
1799 unsigned present,
1800 unsigned quirks,
1801 unsigned *gen_offsets,
1802 enum mali_format *gen_formats,
1803 unsigned *gen_stride,
1804 unsigned idx,
1805 bool should_alloc)
1806 {
1807 /* Check if we're linked */
1808 signed other_idx = -1;
1809
1810 for (unsigned j = 0; j < other->varying_count; ++j) {
1811 if (other->varyings_loc[j] == loc) {
1812 other_idx = j;
1813 break;
1814 }
1815 }
1816
1817 if (other_idx < 0) {
1818 pan_emit_vary_only(out, present, quirks);
1819 return;
1820 }
1821
1822 unsigned offset = gen_offsets[other_idx];
1823
1824 if (should_alloc) {
1825                 /* We're linked, so allocate space via a watermark allocation */
1826 enum mali_format alt = other->varyings[other_idx];
1827
1828 /* Do interpolation at minimum precision */
1829 unsigned size_main = pan_varying_size(format);
1830 unsigned size_alt = pan_varying_size(alt);
1831 unsigned size = MIN2(size_main, size_alt);
1832
1833 /* If a varying is marked for XFB but not actually captured, we
1834 * should match the format to the format that would otherwise
1835 * be used for XFB, since dEQP checks for invariance here. It's
1836 * unclear if this is required by the spec. */
1837
1838 if (xfb->so_mask & (1ull << loc)) {
1839 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1840 format = pan_xfb_format(format, o->num_components);
1841 size = pan_varying_size(format);
1842 } else if (size == size_alt) {
1843 format = alt;
1844 }
1845
1846 gen_offsets[idx] = *gen_stride;
1847 gen_formats[other_idx] = format;
1848 offset = *gen_stride;
1849 *gen_stride += size;
1850 }
1851
1852 pan_emit_vary(out, present, PAN_VARY_GENERAL, quirks, format, offset);
1853 }
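/* Worked example of the watermark allocation above (sizes assumed to follow
 * pan_varying_size): if this stage declares a slot as an fp32 vec4 (16 bytes)
 * but the linked stage declares it as an fp16 vec4 (8 bytes), interpolation
 * happens at the smaller size, so the slot takes 8 bytes at the current
 * gen_stride and gen_stride advances by 8. If the slot is also marked for
 * XFB, the XFB-widened format (and its size) is used instead so captured and
 * non-captured runs stay invariant. */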
1854
1855 /* Higher-level wrapper around all of the above, classifying a varying into one
1856 * of the above types */
1857
1858 static void
1859 panfrost_emit_varying(
1860 struct mali_attribute_packed *out,
1861 struct panfrost_shader_state *stage,
1862 struct panfrost_shader_state *other,
1863 struct panfrost_shader_state *xfb,
1864 unsigned present,
1865 unsigned max_xfb,
1866 unsigned *streamout_offsets,
1867 unsigned quirks,
1868 unsigned *gen_offsets,
1869 enum mali_format *gen_formats,
1870 unsigned *gen_stride,
1871 unsigned idx,
1872 bool should_alloc,
1873 bool is_fragment)
1874 {
1875 gl_varying_slot loc = stage->varyings_loc[idx];
1876 enum mali_format format = stage->varyings[idx];
1877
1878 /* Override format to match linkage */
1879 if (!should_alloc && gen_formats[idx])
1880 format = gen_formats[idx];
1881
1882 if (has_point_coord(stage->point_sprite_mask, loc)) {
1883 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1884 } else if (panfrost_xfb_captured(xfb, loc, max_xfb)) {
1885 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1886 pan_emit_vary_xfb(out, present, max_xfb, streamout_offsets, quirks, format, *o);
1887 } else if (loc == VARYING_SLOT_POS) {
1888 if (is_fragment)
1889 pan_emit_vary_special(out, present, PAN_VARY_FRAGCOORD, quirks);
1890 else
1891 pan_emit_vary_special(out, present, PAN_VARY_POSITION, quirks);
1892 } else if (loc == VARYING_SLOT_PSIZ) {
1893 pan_emit_vary_special(out, present, PAN_VARY_PSIZ, quirks);
1894 } else if (loc == VARYING_SLOT_PNTC) {
1895 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1896 } else if (loc == VARYING_SLOT_FACE) {
1897 pan_emit_vary_special(out, present, PAN_VARY_FACE, quirks);
1898 } else {
1899 pan_emit_general_varying(out, other, xfb, loc, format, present,
1900 quirks, gen_offsets, gen_formats, gen_stride,
1901 idx, should_alloc);
1902 }
1903 }
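/* In other words, the classification above resolves, in priority order:
 * point-sprite-replaced slots, XFB-captured slots, gl_Position (the position
 * buffer when emitted for the vertex stage, the fragment coordinate buffer
 * for the fragment stage), point size, point coord, front face, and finally
 * the general linked/unlinked path. */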
1904
1905 static void
1906 pan_emit_special_input(struct mali_attribute_buffer_packed *out,
1907 unsigned present,
1908 enum pan_special_varying v,
1909 unsigned special)
1910 {
1911 if (present & (1 << v)) {
1912 unsigned idx = pan_varying_index(present, v);
1913
1914 pan_pack(out + idx, ATTRIBUTE_BUFFER, cfg) {
1915 cfg.special = special;
1916 cfg.type = 0;
1917 }
1918 }
1919 }
1920
1921 void
1922 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1923 unsigned vertex_count,
1924 struct mali_vertex_tiler_postfix *vertex_postfix,
1925 struct mali_vertex_tiler_postfix *tiler_postfix,
1926 union midgard_primitive_size *primitive_size)
1927 {
1928 /* Load the shaders */
1929 struct panfrost_context *ctx = batch->ctx;
1930 struct panfrost_device *dev = pan_device(ctx->base.screen);
1931 struct panfrost_shader_state *vs, *fs;
1932 size_t vs_size, fs_size;
1933
1934 /* Allocate the varying descriptor */
1935
1936 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1937 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1938 vs_size = MALI_ATTRIBUTE_LENGTH * vs->varying_count;
1939 fs_size = MALI_ATTRIBUTE_LENGTH * fs->varying_count;
1940
1941 struct panfrost_transfer trans = panfrost_pool_alloc_aligned(
1942 &batch->pool, vs_size + fs_size, MALI_ATTRIBUTE_LENGTH);
1943
1944 struct pipe_stream_output_info *so = &vs->stream_output;
1945 unsigned present = pan_varying_present(vs, fs, dev->quirks);
1946
1947         /* Check whether each varying is linked by us; that is the case for
1948          * general-purpose, non-captured varyings, and if so we link it here.
1949          * Otherwise, use the provided stream-out information to determine the
1950          * offset, since the linking was already done for us. */
1951
1952 unsigned gen_offsets[32];
1953 enum mali_format gen_formats[32];
1954 memset(gen_offsets, 0, sizeof(gen_offsets));
1955 memset(gen_formats, 0, sizeof(gen_formats));
1956
1957 unsigned gen_stride = 0;
1958 assert(vs->varying_count < ARRAY_SIZE(gen_offsets));
1959 assert(fs->varying_count < ARRAY_SIZE(gen_offsets));
1960
1961 unsigned streamout_offsets[32];
1962
1963 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1964 streamout_offsets[i] = panfrost_streamout_offset(
1965 so->stride[i],
1966 ctx->streamout.offsets[i],
1967 ctx->streamout.targets[i]);
1968 }
1969
1970 struct mali_attribute_packed *ovs = (struct mali_attribute_packed *)trans.cpu;
1971 struct mali_attribute_packed *ofs = ovs + vs->varying_count;
1972
1973 for (unsigned i = 0; i < vs->varying_count; i++) {
1974 panfrost_emit_varying(ovs + i, vs, fs, vs, present,
1975 ctx->streamout.num_targets, streamout_offsets,
1976 dev->quirks,
1977 gen_offsets, gen_formats, &gen_stride, i, true, false);
1978 }
1979
1980 for (unsigned i = 0; i < fs->varying_count; i++) {
1981 panfrost_emit_varying(ofs + i, fs, vs, vs, present,
1982 ctx->streamout.num_targets, streamout_offsets,
1983 dev->quirks,
1984 gen_offsets, gen_formats, &gen_stride, i, false, true);
1985 }
1986
1987 unsigned xfb_base = pan_xfb_base(present);
1988 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1989 MALI_ATTRIBUTE_BUFFER_LENGTH * (xfb_base + ctx->streamout.num_targets),
1990 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
1991 struct mali_attribute_buffer_packed *varyings =
1992 (struct mali_attribute_buffer_packed *) T.cpu;
1993
1994 /* Emit the stream out buffers */
1995
1996 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1997 ctx->vertex_count);
1998
1999 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
2000 panfrost_emit_streamout(batch, &varyings[xfb_base + i],
2001 so->stride[i],
2002 ctx->streamout.offsets[i],
2003 out_count,
2004 ctx->streamout.targets[i]);
2005 }
2006
2007 panfrost_emit_varyings(batch,
2008 &varyings[pan_varying_index(present, PAN_VARY_GENERAL)],
2009 gen_stride, vertex_count);
2010
2011 /* fp32 vec4 gl_Position */
2012 tiler_postfix->position_varying = panfrost_emit_varyings(batch,
2013 &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
2014 sizeof(float) * 4, vertex_count);
2015
2016 if (present & (1 << PAN_VARY_PSIZ)) {
2017 primitive_size->pointer = panfrost_emit_varyings(batch,
2018 &varyings[pan_varying_index(present, PAN_VARY_PSIZ)],
2019 2, vertex_count);
2020 }
2021
2022 pan_emit_special_input(varyings, present, PAN_VARY_PNTCOORD, MALI_ATTRIBUTE_SPECIAL_POINT_COORD);
2023 pan_emit_special_input(varyings, present, PAN_VARY_FACE, MALI_ATTRIBUTE_SPECIAL_FRONT_FACING);
2024 pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD, MALI_ATTRIBUTE_SPECIAL_FRAG_COORD);
2025
2026 vertex_postfix->varyings = T.gpu;
2027 tiler_postfix->varyings = T.gpu;
2028
2029 vertex_postfix->varying_meta = trans.gpu;
2030 tiler_postfix->varying_meta = trans.gpu + vs_size;
2031 }
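/* To make the resulting layout concrete (a sketch for one plausible
 * configuration): with general, position and point-size buffers present and
 * two streamout targets bound, the attribute buffer array uploaded in T is
 *
 *      [0] general varyings  (gen_stride bytes per vertex)
 *      [1] gl_Position       (fp32 vec4 per vertex)
 *      [2] gl_PointSize      (fp16 per vertex)
 *      [3] XFB target 0
 *      [4] XFB target 1
 *
 * and the per-varying ATTRIBUTE records in trans index into it. */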
2032
2033 void
2034 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
2035 struct mali_vertex_tiler_prefix *vertex_prefix,
2036 struct mali_vertex_tiler_postfix *vertex_postfix,
2037 struct mali_vertex_tiler_prefix *tiler_prefix,
2038 struct mali_vertex_tiler_postfix *tiler_postfix,
2039 union midgard_primitive_size *primitive_size)
2040 {
2041 struct panfrost_context *ctx = batch->ctx;
2042 struct panfrost_device *device = pan_device(ctx->base.screen);
2043 bool wallpapering = ctx->wallpaper_batch && batch->scoreboard.tiler_dep;
2044 struct bifrost_payload_vertex bifrost_vertex = {0,};
2045 struct bifrost_payload_tiler bifrost_tiler = {0,};
2046 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
2047 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
2048 void *vp, *tp;
2049 size_t vp_size, tp_size;
2050
2051 if (device->quirks & IS_BIFROST) {
2052 bifrost_vertex.prefix = *vertex_prefix;
2053 bifrost_vertex.postfix = *vertex_postfix;
2054 vp = &bifrost_vertex;
2055 vp_size = sizeof(bifrost_vertex);
2056
2057 bifrost_tiler.prefix = *tiler_prefix;
2058 bifrost_tiler.tiler.primitive_size = *primitive_size;
2059 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
2060 bifrost_tiler.postfix = *tiler_postfix;
2061 tp = &bifrost_tiler;
2062 tp_size = sizeof(bifrost_tiler);
2063 } else {
2064 midgard_vertex.prefix = *vertex_prefix;
2065 midgard_vertex.postfix = *vertex_postfix;
2066 vp = &midgard_vertex;
2067 vp_size = sizeof(midgard_vertex);
2068
2069 midgard_tiler.prefix = *tiler_prefix;
2070 midgard_tiler.postfix = *tiler_postfix;
2071 midgard_tiler.primitive_size = *primitive_size;
2072 tp = &midgard_tiler;
2073 tp_size = sizeof(midgard_tiler);
2074 }
2075
2076 if (wallpapering) {
2077 /* Inject in reverse order, with "predicted" job indices.
2078 * THIS IS A HACK XXX */
2079 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false,
2080 batch->scoreboard.job_index + 2, tp, tp_size, true);
2081 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2082 vp, vp_size, true);
2083 return;
2084 }
2085
2086         /* If rasterizer discard is enabled, only submit the vertex job */
2087
2088 unsigned vertex = panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2089 vp, vp_size, false);
2090
2091 if (ctx->rasterizer->base.rasterizer_discard)
2092 return;
2093
2094 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false, vertex, tp, tp_size,
2095 false);
2096 }
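/* So the usual (non-wallpaper) flow is: submit the vertex job, take its
 * scoreboard index, and submit the tiler job with that index as a dependency,
 * ensuring shading for the draw completes before tiling begins. Under
 * rasterizer discard the tiler job is skipped entirely. */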
2097
2098 /* TODO: stop hardcoding this */
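/* The table below holds 48 (x, y) pairs (96 uint16_t, matching the upload
 * size). The interpretation is an assumption rather than something the code
 * confirms: the coordinates appear to be fixed point with 256 spanning one
 * pixel, making (128, 128) the pixel centre used for the single-sample
 * position. */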
2099 mali_ptr
2100 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2101 {
2102 uint16_t locations[] = {
2103 128, 128,
2104 0, 256,
2105 0, 256,
2106 0, 256,
2107 0, 256,
2108 0, 256,
2109 0, 256,
2110 0, 256,
2111 0, 256,
2112 0, 256,
2113 0, 256,
2114 0, 256,
2115 0, 256,
2116 0, 256,
2117 0, 256,
2118 0, 256,
2119 0, 256,
2120 0, 256,
2121 0, 256,
2122 0, 256,
2123 0, 256,
2124 0, 256,
2125 0, 256,
2126 0, 256,
2127 0, 256,
2128 0, 256,
2129 0, 256,
2130 0, 256,
2131 0, 256,
2132 0, 256,
2133 0, 256,
2134 0, 256,
2135 128, 128,
2136 0, 0,
2137 0, 0,
2138 0, 0,
2139 0, 0,
2140 0, 0,
2141 0, 0,
2142 0, 0,
2143 0, 0,
2144 0, 0,
2145 0, 0,
2146 0, 0,
2147 0, 0,
2148 0, 0,
2149 0, 0,
2150 0, 0,
2151 };
2152
2153 return panfrost_pool_upload_aligned(&batch->pool, locations, 96 * sizeof(uint16_t), 64);
2154 }