panfrost: Use pack for fragment shaders
[mesa.git] / src / gallium / drivers / panfrost / pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_pool.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 struct mali_shared_memory shared = {
62 .shared_workgroup_count = ~0,
63 };
64
65 if (batch->stack_size) {
66 struct panfrost_bo *stack =
67 panfrost_batch_get_scratchpad(batch, batch->stack_size,
68 dev->thread_tls_alloc,
69 dev->core_count);
70
71 shared.stack_shift = panfrost_get_stack_shift(batch->stack_size);
72 shared.scratchpad = stack->gpu;
73 }
74
75 postfix->shared_memory = panfrost_pool_upload_aligned(&batch->pool, &shared, sizeof(shared), 64);
76 }
77
78 static void
79 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
80 struct mali_vertex_tiler_postfix *postfix)
81 {
82 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
83 postfix->shared_memory = panfrost_batch_reserve_framebuffer(batch);
84 }
85
86 static void
87 panfrost_vt_update_rasterizer(struct panfrost_rasterizer *rasterizer,
88 struct mali_vertex_tiler_prefix *prefix,
89 struct mali_vertex_tiler_postfix *postfix)
90 {
91 postfix->gl_enables |= 0x7;
92 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
93 rasterizer->base.front_ccw);
94 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
95 (rasterizer->base.cull_face & PIPE_FACE_FRONT));
96 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
97 (rasterizer->base.cull_face & PIPE_FACE_BACK));
98 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
99 rasterizer->base.flatshade_first);
100 }
101
102 void
103 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
104 struct mali_vertex_tiler_prefix *prefix,
105 union midgard_primitive_size *primitive_size)
106 {
107 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
108
109 if (!panfrost_writes_point_size(ctx)) {
110 float val = (prefix->draw_mode == MALI_DRAW_MODE_POINTS) ?
111 rasterizer->base.point_size :
112 rasterizer->base.line_width;
113
114 primitive_size->constant = val;
115 }
116 }
117
118 static void
119 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
120 struct mali_vertex_tiler_postfix *postfix)
121 {
122 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
123 if (ctx->occlusion_query) {
124 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
125 panfrost_batch_add_bo(ctx->batch, ctx->occlusion_query->bo,
126 PAN_BO_ACCESS_SHARED |
127 PAN_BO_ACCESS_RW |
128 PAN_BO_ACCESS_FRAGMENT);
129 } else {
130 postfix->occlusion_counter = 0;
131 }
132 }
133
134 void
135 panfrost_vt_init(struct panfrost_context *ctx,
136 enum pipe_shader_type stage,
137 struct mali_vertex_tiler_prefix *prefix,
138 struct mali_vertex_tiler_postfix *postfix)
139 {
140 struct panfrost_device *device = pan_device(ctx->base.screen);
141
142 if (!ctx->shader[stage])
143 return;
144
145 memset(prefix, 0, sizeof(*prefix));
146 memset(postfix, 0, sizeof(*postfix));
147
148 if (device->quirks & IS_BIFROST) {
149 postfix->gl_enables = 0x2;
150 panfrost_vt_emit_shared_memory(ctx, postfix);
151 } else {
152 postfix->gl_enables = 0x6;
153 panfrost_vt_attach_framebuffer(ctx, postfix);
154 }
155
156 if (stage == PIPE_SHADER_FRAGMENT) {
157 panfrost_vt_update_occlusion_query(ctx, postfix);
158 panfrost_vt_update_rasterizer(ctx->rasterizer, prefix, postfix);
159 }
160 }
161
162 static unsigned
163 panfrost_translate_index_size(unsigned size)
164 {
165 switch (size) {
166 case 1:
167 return MALI_DRAW_INDEXED_UINT8;
168
169 case 2:
170 return MALI_DRAW_INDEXED_UINT16;
171
172 case 4:
173 return MALI_DRAW_INDEXED_UINT32;
174
175 default:
176 unreachable("Invalid index size");
177 }
178 }
179
180 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
181 * good for the duration of the draw (transient), though it could last longer. Also get
182 * the bounds on the index buffer for the range accessed by the draw. We do
183 * these operations together because there are natural optimizations which
184 * require them to be together. */
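/* In particular, the min/max bounds for a resource-backed index buffer are
 * cached on the resource (panfrost_minmax_cache_*), so repeated draws over
 * the same range can skip the CPU-side scan in u_vbuf_get_minmax_index. */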
185
186 static mali_ptr
187 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
188 const struct pipe_draw_info *info,
189 unsigned *min_index, unsigned *max_index)
190 {
191 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
192 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
193 off_t offset = info->start * info->index_size;
194 bool needs_indices = true;
195 mali_ptr out = 0;
196
197 if (info->max_index != ~0u) {
198 *min_index = info->min_index;
199 *max_index = info->max_index;
200 needs_indices = false;
201 }
202
203 if (!info->has_user_indices) {
204 /* Only resources can be directly mapped */
205 panfrost_batch_add_bo(batch, rsrc->bo,
206 PAN_BO_ACCESS_SHARED |
207 PAN_BO_ACCESS_READ |
208 PAN_BO_ACCESS_VERTEX_TILER);
209 out = rsrc->bo->gpu + offset;
210
211 /* Check the cache */
212 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
213 info->start,
214 info->count,
215 min_index,
216 max_index);
217 } else {
218 /* Otherwise, we need to upload to transient memory */
219 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
220 struct panfrost_transfer T =
221 panfrost_pool_alloc_aligned(&batch->pool,
222 info->count * info->index_size,
223 info->index_size);
224
225 memcpy(T.cpu, ibuf8 + offset, info->count * info->index_size);
226 out = T.gpu;
227 }
228
229 if (needs_indices) {
230 /* Fallback */
231 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
232
233 if (!info->has_user_indices)
234 panfrost_minmax_cache_add(rsrc->index_cache,
235 info->start, info->count,
236 *min_index, *max_index);
237 }
238
239 return out;
240 }
241
242 void
243 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
244 const struct pipe_draw_info *info,
245 enum mali_draw_mode draw_mode,
246 struct mali_vertex_tiler_postfix *vertex_postfix,
247 struct mali_vertex_tiler_prefix *tiler_prefix,
248 struct mali_vertex_tiler_postfix *tiler_postfix,
249 unsigned *vertex_count,
250 unsigned *padded_count)
251 {
252 tiler_prefix->draw_mode = draw_mode;
253
254 unsigned draw_flags = 0;
255
256 if (panfrost_writes_point_size(ctx))
257 draw_flags |= MALI_DRAW_VARYING_SIZE;
258
259 if (info->primitive_restart)
260 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
261
262 /* These don't make much sense */
263
264 draw_flags |= 0x3000;
265
266 if (info->index_size) {
267 unsigned min_index = 0, max_index = 0;
268
269 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
270 info,
271 &min_index,
272 &max_index);
273
274 /* Use the corresponding values */
275 *vertex_count = max_index - min_index + 1;
276 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
277 tiler_prefix->offset_bias_correction = -min_index;
278 tiler_prefix->index_count = MALI_POSITIVE(info->count);
279 draw_flags |= panfrost_translate_index_size(info->index_size);
280 } else {
281 tiler_prefix->indices = 0;
282 *vertex_count = ctx->vertex_count;
283 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
284 tiler_prefix->offset_bias_correction = 0;
285 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
286 }
287
288 tiler_prefix->unknown_draw = draw_flags;
289
290 /* Encode the padded vertex count */
291
292 if (info->instance_count > 1) {
293 *padded_count = panfrost_padded_vertex_count(*vertex_count);
294
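/* Any padded count decomposes as (2k + 1) << shift; e.g. padded_count =
 * 20 = 0b10100 gives shift = ctz(20) = 2, k = 20 >> 3 = 2, and
 * (2*2 + 1) << 2 = 20 recovers the original value. */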
295 unsigned shift = __builtin_ctz(ctx->padded_count);
296 unsigned k = ctx->padded_count >> (shift + 1);
297
298 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
299 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
300 } else {
301 *padded_count = *vertex_count;
302
303 /* Reset instancing state */
304 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
305 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
306 }
307 }
308
309 static unsigned
310 translate_tex_wrap(enum pipe_tex_wrap w)
311 {
312 switch (w) {
313 case PIPE_TEX_WRAP_REPEAT: return MALI_WRAP_MODE_REPEAT;
314 case PIPE_TEX_WRAP_CLAMP: return MALI_WRAP_MODE_CLAMP;
315 case PIPE_TEX_WRAP_CLAMP_TO_EDGE: return MALI_WRAP_MODE_CLAMP_TO_EDGE;
316 case PIPE_TEX_WRAP_CLAMP_TO_BORDER: return MALI_WRAP_MODE_CLAMP_TO_BORDER;
317 case PIPE_TEX_WRAP_MIRROR_REPEAT: return MALI_WRAP_MODE_MIRRORED_REPEAT;
318 case PIPE_TEX_WRAP_MIRROR_CLAMP: return MALI_WRAP_MODE_MIRRORED_CLAMP;
319 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE;
320 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER;
321 default: unreachable("Invalid wrap");
322 }
323 }
324
325 /* The hardware compares in the wrong order, so we have to flip before
326 * encoding. Yes, really. */
327
328 static enum mali_func
329 panfrost_sampler_compare_func(const struct pipe_sampler_state *cso)
330 {
331 if (!cso->compare_mode)
332 return MALI_FUNC_NEVER;
333
334 enum mali_func f = panfrost_translate_compare_func(cso->compare_func);
335 return panfrost_flip_compare_func(f);
336 }
337
338 static enum mali_mipmap_mode
339 pan_pipe_to_mipmode(enum pipe_tex_mipfilter f)
340 {
341 switch (f) {
342 case PIPE_TEX_MIPFILTER_NEAREST: return MALI_MIPMAP_MODE_NEAREST;
343 case PIPE_TEX_MIPFILTER_LINEAR: return MALI_MIPMAP_MODE_TRILINEAR;
344 case PIPE_TEX_MIPFILTER_NONE: return MALI_MIPMAP_MODE_NONE;
345 default: unreachable("Invalid");
346 }
347 }
348
349 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
350 struct mali_midgard_sampler_packed *hw)
351 {
352 pan_pack(hw, MIDGARD_SAMPLER, cfg) {
353 cfg.magnify_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
354 cfg.minify_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
355 cfg.mipmap_mode = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR) ?
356 MALI_MIPMAP_MODE_TRILINEAR : MALI_MIPMAP_MODE_NEAREST;
357 cfg.normalized_coordinates = cso->normalized_coords;
358
359 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
360
361 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
362
363 /* If necessary, we disable mipmapping in the sampler descriptor by
364 * clamping the LOD as tightly as possible (from 0 to epsilon,
365 * essentially -- remember these are fixed point numbers, so
366 * epsilon=1/256) */
367
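/* e.g. with min_lod = 2.0 and the 1/256 granularity noted above (8.8
 * fixed point), minimum_lod packs to 512 and maximum_lod to 513, pinning
 * the LOD to [2, 2 + 1/256]. */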
368 cfg.maximum_lod = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) ?
369 cfg.minimum_lod + 1 :
370 FIXED_16(cso->max_lod, false);
371
372 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
373 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
374 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
375
376 cfg.compare_function = panfrost_sampler_compare_func(cso);
377 cfg.seamless_cube_map = cso->seamless_cube_map;
378
379 cfg.border_color_r = cso->border_color.f[0];
380 cfg.border_color_g = cso->border_color.f[1];
381 cfg.border_color_b = cso->border_color.f[2];
382 cfg.border_color_a = cso->border_color.f[3];
383 }
384 }
385
386 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
387 struct mali_bifrost_sampler_packed *hw)
388 {
389 pan_pack(hw, BIFROST_SAMPLER, cfg) {
390 cfg.magnify_linear = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR;
391 cfg.minify_linear = cso->min_img_filter == PIPE_TEX_FILTER_LINEAR;
392 cfg.mipmap_mode = pan_pipe_to_mipmode(cso->min_mip_filter);
393 cfg.normalized_coordinates = cso->normalized_coords;
394
395 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
396 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
397 cfg.maximum_lod = FIXED_16(cso->max_lod, false);
398
399 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
400 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
401 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
402
403 cfg.compare_function = panfrost_sampler_compare_func(cso);
404 cfg.seamless_cube_map = cso->seamless_cube_map;
405 }
406 }
407
408 static bool
409 panfrost_fs_required(
410 struct panfrost_shader_state *fs,
411 struct panfrost_blend_final *blend,
412 unsigned rt_count)
413 {
414 /* If we generally have side effects */
415 if (fs->fs_sidefx)
416 return true;
417
418 /* If colour is written we need to execute */
419 for (unsigned i = 0; i < rt_count; ++i) {
420 if (!blend[i].no_colour)
421 return true;
422 }
423
424 /* If depth is written and not implied we need to execute.
425 * TODO: Predicate on Z/S writes being enabled */
426 return (fs->writes_depth || fs->writes_stencil);
427 }
428
429 static void
430 panfrost_emit_blend(struct panfrost_batch *batch, void *rts,
431 struct panfrost_blend_final *blend)
432 {
433 const struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
434 struct panfrost_shader_state *fs = panfrost_get_shader_state(batch->ctx, PIPE_SHADER_FRAGMENT);
435 unsigned rt_count = batch->key.nr_cbufs;
436
437 struct bifrost_blend_rt *brts = rts;
438 struct midgard_blend_rt *mrts = rts;
439
440 /* Disable blending for depth-only on Bifrost */
441
442 if (rt_count == 0 && dev->quirks & IS_BIFROST)
443 brts[0].unk2 = 0x3;
444
445 for (unsigned i = 0; i < rt_count; ++i) {
446 unsigned flags = 0;
447
448 pan_pack(&flags, BLEND_FLAGS, cfg) {
449 if (blend[i].no_colour) {
450 cfg.enable = false;
451 break;
452 }
453
454 batch->draws |= (PIPE_CLEAR_COLOR0 << i);
455
456 cfg.srgb = util_format_is_srgb(batch->key.cbufs[i]->format);
457 cfg.load_destination = blend[i].load_dest;
458 cfg.dither_disable = !batch->ctx->blend->base.dither;
459
460 if (!(dev->quirks & IS_BIFROST))
461 cfg.midgard_blend_shader = blend[i].is_shader;
462 }
463
464 if (dev->quirks & IS_BIFROST) {
465 brts[i].flags = flags;
466
467 if (blend[i].is_shader) {
468 /* The blend shader's address needs to be at
469 * the same top 32 bit as the fragment shader.
470 * TODO: Ensure that's always the case.
471 */
472 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
473 (fs->bo->gpu & (0xffffffffull << 32)));
474 brts[i].shader = blend[i].shader.gpu;
475 brts[i].unk2 = 0x0;
476 } else {
477 enum pipe_format format = batch->key.cbufs[i]->format;
478 const struct util_format_description *format_desc;
479 format_desc = util_format_description(format);
480
481 brts[i].equation = blend[i].equation.equation;
482
483 /* TODO: this is a bit more complicated */
484 brts[i].constant = blend[i].equation.constant;
485
486 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
487
488 /* 0x19 disables blending and forces REPLACE
489 * mode (equivalent to rgb_mode = alpha_mode =
490 * x122, colour mask = 0xF). 0x1a allows
491 * blending. */
492 brts[i].unk2 = blend[i].opaque ? 0x19 : 0x1a;
493
494 brts[i].shader_type = fs->blend_types[i];
495 }
496 } else {
497 memcpy(&mrts[i].flags, &flags, sizeof(flags));
498
499 if (blend[i].is_shader) {
500 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
501 } else {
502 mrts[i].blend.equation = blend[i].equation.equation;
503 mrts[i].blend.constant = blend[i].equation.constant;
504 }
505 }
506 }
507 }
508
509 static void
510 panfrost_emit_frag_shader(struct panfrost_context *ctx,
511 struct mali_state_packed *fragmeta,
512 struct panfrost_blend_final *blend)
513 {
514 const struct panfrost_device *dev = pan_device(ctx->base.screen);
515 struct panfrost_shader_state *fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
516 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
517 const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
518 unsigned rt_count = ctx->pipe_framebuffer.nr_cbufs;
519 bool alpha_to_coverage = ctx->blend->base.alpha_to_coverage;
520
521 /* Built up here */
522 struct mali_shader_packed shader = fs->shader;
523 struct mali_preload_packed preload = fs->preload;
524 uint32_t properties;
525 struct mali_multisample_misc_packed multisample_misc;
526 struct mali_stencil_mask_misc_packed stencil_mask_misc;
527 union midgard_blend sfbd_blend = { 0 };
528
529 if (!panfrost_fs_required(fs, blend, rt_count)) {
530 if (dev->quirks & IS_BIFROST) {
531 pan_pack(&shader, SHADER, cfg) {}
532
533 pan_pack(&properties, BIFROST_PROPERTIES, cfg) {
534 cfg.unknown = 0x950020; /* XXX */
535 cfg.early_z_enable = true;
536 }
537
538 preload.opaque[0] = 0;
539 } else {
540 pan_pack(&shader, SHADER, cfg) {
541 cfg.shader = 0x1;
542 }
543
544 pan_pack(&properties, MIDGARD_PROPERTIES, cfg) {
545 cfg.work_register_count = 1;
546 cfg.depth_source = MALI_DEPTH_SOURCE_FIXED_FUNCTION;
547 cfg.early_z_enable = true;
548 }
549 }
550 } else if (dev->quirks & IS_BIFROST) {
551 bool no_blend = true;
552
553 for (unsigned i = 0; i < rt_count; ++i)
554 no_blend &= (!blend[i].load_dest | blend[i].no_colour);
555
556 pan_pack(&properties, BIFROST_PROPERTIES, cfg) {
557 cfg.early_z_enable = !fs->can_discard && !fs->writes_depth && no_blend;
558 }
559
560 /* Combine with prepacked properties */
561 properties |= fs->properties.opaque[0];
562 } else {
563 /* Reasons to disable early-Z from a shader perspective */
564 bool late_z = fs->can_discard || fs->writes_global ||
565 fs->writes_depth || fs->writes_stencil;
566
567 /* If either depth or stencil is enabled, discard matters */
568 bool zs_enabled =
569 (zsa->base.depth.enabled && zsa->base.depth.func != PIPE_FUNC_ALWAYS) ||
570 zsa->base.stencil[0].enabled;
571
572 bool has_blend_shader = false;
573
574 for (unsigned c = 0; c < rt_count; ++c)
575 has_blend_shader |= blend[c].is_shader;
576
577 pan_pack(&properties, MIDGARD_PROPERTIES, cfg) {
578 /* TODO: Reduce this limit? */
579 if (has_blend_shader)
580 cfg.work_register_count = MAX2(fs->work_reg_count, 8);
581 else
582 cfg.work_register_count = fs->work_reg_count;
583
584 cfg.early_z_enable = !(late_z || alpha_to_coverage);
585 cfg.reads_tilebuffer = fs->outputs_read || (!zs_enabled && fs->can_discard);
586 cfg.reads_depth_stencil = zs_enabled && fs->can_discard;
587 }
588
589 properties |= fs->properties.opaque[0];
590 }
591
592 pan_pack(&multisample_misc, MULTISAMPLE_MISC, cfg) {
593 bool msaa = rast->multisample;
594 cfg.multisample_enable = msaa;
595 cfg.sample_mask = (msaa ? ctx->sample_mask : ~0) & 0xFFFF;
596
597 /* EXT_shader_framebuffer_fetch requires per-sample */
598 bool per_sample = ctx->min_samples > 1 || fs->outputs_read;
599 cfg.evaluate_per_sample = msaa && per_sample;
600
601 if (dev->quirks & MIDGARD_SFBD) {
602 cfg.sfbd_load_destination = blend[0].load_dest;
603 cfg.sfbd_blend_shader = blend[0].is_shader;
604 }
605
606 cfg.depth_function = zsa->base.depth.enabled ?
607 panfrost_translate_compare_func(zsa->base.depth.func) :
608 MALI_FUNC_ALWAYS;
609
610 cfg.depth_write_mask = zsa->base.depth.writemask;
611 cfg.near_discard = rast->depth_clip_near;
612 cfg.far_discard = rast->depth_clip_far;
613 cfg.unknown_2 = true;
614 }
615
616 pan_pack(&stencil_mask_misc, STENCIL_MASK_MISC, cfg) {
617 cfg.stencil_mask_front = zsa->stencil_mask_front;
618 cfg.stencil_mask_back = zsa->stencil_mask_back;
619 cfg.stencil_enable = zsa->base.stencil[0].enabled;
620 cfg.alpha_to_coverage = alpha_to_coverage;
621
622 if (dev->quirks & MIDGARD_SFBD) {
623 cfg.sfbd_write_enable = !blend[0].no_colour;
624 cfg.sfbd_srgb = util_format_is_srgb(ctx->pipe_framebuffer.cbufs[0]->format);
625 cfg.sfbd_dither_disable = !ctx->blend->base.dither;
626 }
627
628 cfg.unknown_1 = 0x7;
629 cfg.depth_range_1 = cfg.depth_range_2 = rast->offset_tri;
630 cfg.single_sampled_lines = !rast->multisample;
631 }
632
633 if (dev->quirks & MIDGARD_SFBD) {
634 if (blend[0].is_shader) {
635 sfbd_blend.shader = blend[0].shader.gpu |
636 blend[0].shader.first_tag;
637 } else {
638 sfbd_blend.equation = blend[0].equation.equation;
639 sfbd_blend.constant = blend[0].equation.constant;
640 }
641 } else if (!(dev->quirks & IS_BIFROST)) {
642 /* Bug where MRT-capable hw apparently reads the last blend
643 * shader from here instead of the usual location? */
644
645 for (signed rt = ((signed) rt_count - 1); rt >= 0; --rt) {
646 if (!blend[rt].is_shader)
647 continue;
648
649 sfbd_blend.shader = blend[rt].shader.gpu |
650 blend[rt].shader.first_tag;
651 break;
652 }
653 }
654
655 pan_pack(fragmeta, STATE_OPAQUE, cfg) {
656 cfg.shader = fs->shader;
657 cfg.properties = properties;
658 cfg.depth_units = rast->offset_units * 2.0f;
659 cfg.depth_factor = rast->offset_scale;
660 cfg.multisample_misc = multisample_misc;
661 cfg.stencil_mask_misc = stencil_mask_misc;
662
663 cfg.stencil_front = zsa->stencil_front;
664 cfg.stencil_back = zsa->stencil_back;
665
666 /* Bottom bits for stencil ref, exactly one word */
667 bool back_enab = zsa->base.stencil[1].enabled;
668 cfg.stencil_front.opaque[0] |= ctx->stencil_ref.ref_value[0];
669 cfg.stencil_back.opaque[0] |= ctx->stencil_ref.ref_value[back_enab ? 1 : 0];
670
671 if (dev->quirks & IS_BIFROST)
672 cfg.preload = preload;
673 else
674 memcpy(&cfg.sfbd_blend, &sfbd_blend, sizeof(sfbd_blend));
675 }
676 }
677
678 mali_ptr
679 panfrost_emit_compute_shader_meta(struct panfrost_batch *batch, enum pipe_shader_type stage)
680 {
681 struct panfrost_shader_state *ss = panfrost_get_shader_state(batch->ctx, stage);
682
683 panfrost_batch_add_bo(batch, ss->bo,
684 PAN_BO_ACCESS_PRIVATE |
685 PAN_BO_ACCESS_READ |
686 PAN_BO_ACCESS_VERTEX_TILER);
687
688 panfrost_batch_add_bo(batch, pan_resource(ss->upload.rsrc)->bo,
689 PAN_BO_ACCESS_PRIVATE |
690 PAN_BO_ACCESS_READ |
691 PAN_BO_ACCESS_VERTEX_TILER);
692
693 return pan_resource(ss->upload.rsrc)->bo->gpu + ss->upload.offset;
694 }
695
696 mali_ptr
697 panfrost_emit_frag_shader_meta(struct panfrost_batch *batch)
698 {
699 struct panfrost_context *ctx = batch->ctx;
700 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
701
702 /* Add the shader BO to the batch. */
703 panfrost_batch_add_bo(batch, ss->bo,
704 PAN_BO_ACCESS_PRIVATE |
705 PAN_BO_ACCESS_READ |
706 PAN_BO_ACCESS_FRAGMENT);
707
708 struct panfrost_device *dev = pan_device(ctx->base.screen);
709 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
710 void *rts = NULL;
711 struct panfrost_transfer xfer;
712 unsigned rt_size;
713
714 if (dev->quirks & MIDGARD_SFBD)
715 rt_size = 0;
716 else if (dev->quirks & IS_BIFROST)
717 rt_size = sizeof(struct bifrost_blend_rt);
718 else
719 rt_size = sizeof(struct midgard_blend_rt);
720
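/* The descriptor is the packed renderer state followed immediately by one
 * blend record per render target, so allocate them contiguously and copy
 * the blend records in after the state is packed. */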
721 unsigned desc_size = MALI_STATE_LENGTH + rt_size * rt_count;
722
723 if (rt_size)
724 rts = rzalloc_size(ctx, rt_size * rt_count);
725
726 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
727
728 for (unsigned c = 0; c < ctx->pipe_framebuffer.nr_cbufs; ++c)
729 blend[c] = panfrost_get_blend_for_context(ctx, c);
730
731 if (!(dev->quirks & MIDGARD_SFBD))
732 panfrost_emit_blend(batch, rts, blend);
733 else
734 batch->draws |= PIPE_CLEAR_COLOR0;
735
736 xfer = panfrost_pool_alloc_aligned(&batch->pool, desc_size, MALI_STATE_LENGTH);
737
738 panfrost_emit_frag_shader(ctx, (struct mali_state_packed *) xfer.cpu, blend);
739
740 memcpy(xfer.cpu + MALI_STATE_LENGTH, rts, rt_size * rt_count);
741
742 if (rt_size)
743 ralloc_free(rts);
744
745 return xfer.gpu;
746 }
747
748 void
749 panfrost_emit_viewport(struct panfrost_batch *batch,
750 struct mali_vertex_tiler_postfix *tiler_postfix)
751 {
752 struct panfrost_context *ctx = batch->ctx;
753 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
754 const struct pipe_scissor_state *ss = &ctx->scissor;
755 const struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
756 const struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer;
757
758 /* Derive min/max from translate/scale. Note since |x| >= 0 by
759 * definition, we have that -|x| <= |x| hence translate - |scale| <=
760 * translate + |scale|, so the ordering is correct here. */
761 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
762 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
763 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
764 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
765 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
766 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
767
768 /* Scissor to the intersection of the viewport and the scissor rectangle, clamped
769 * to the framebuffer */
770
771 unsigned minx = MIN2(fb->width, vp_minx);
772 unsigned maxx = MIN2(fb->width, vp_maxx);
773 unsigned miny = MIN2(fb->height, vp_miny);
774 unsigned maxy = MIN2(fb->height, vp_maxy);
775
776 if (ss && rast->scissor) {
777 minx = MAX2(ss->minx, minx);
778 miny = MAX2(ss->miny, miny);
779 maxx = MIN2(ss->maxx, maxx);
780 maxy = MIN2(ss->maxy, maxy);
781 }
782
783 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool, MALI_VIEWPORT_LENGTH);
784
785 pan_pack(T.cpu, VIEWPORT, cfg) {
786 cfg.scissor_minimum_x = minx;
787 cfg.scissor_minimum_y = miny;
788 cfg.scissor_maximum_x = maxx - 1;
789 cfg.scissor_maximum_y = maxy - 1;
790
791 cfg.minimum_z = rast->depth_clip_near ? minz : -INFINITY;
792 cfg.maximum_z = rast->depth_clip_far ? maxz : INFINITY;
793 }
794
795 tiler_postfix->viewport = T.gpu;
796 panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy);
797 }
798
799 static mali_ptr
800 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
801 enum pipe_shader_type st,
802 struct panfrost_constant_buffer *buf,
803 unsigned index)
804 {
805 struct pipe_constant_buffer *cb = &buf->cb[index];
806 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
807
808 if (rsrc) {
809 panfrost_batch_add_bo(batch, rsrc->bo,
810 PAN_BO_ACCESS_SHARED |
811 PAN_BO_ACCESS_READ |
812 panfrost_bo_access_for_stage(st));
813
814 /* Alignment guaranteed by
815 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
816 return rsrc->bo->gpu + cb->buffer_offset;
817 } else if (cb->user_buffer) {
818 return panfrost_pool_upload_aligned(&batch->pool,
819 cb->user_buffer +
820 cb->buffer_offset,
821 cb->buffer_size, 16);
822 } else {
823 unreachable("No constant buffer");
824 }
825 }
826
827 struct sysval_uniform {
828 union {
829 float f[4];
830 int32_t i[4];
831 uint32_t u[4];
832 uint64_t du[2];
833 };
834 };
835
836 static void
837 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
838 struct sysval_uniform *uniform)
839 {
840 struct panfrost_context *ctx = batch->ctx;
841 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
842
843 uniform->f[0] = vp->scale[0];
844 uniform->f[1] = vp->scale[1];
845 uniform->f[2] = vp->scale[2];
846 }
847
848 static void
849 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
850 struct sysval_uniform *uniform)
851 {
852 struct panfrost_context *ctx = batch->ctx;
853 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
854
855 uniform->f[0] = vp->translate[0];
856 uniform->f[1] = vp->translate[1];
857 uniform->f[2] = vp->translate[2];
858 }
859
860 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
861 enum pipe_shader_type st,
862 unsigned int sysvalid,
863 struct sysval_uniform *uniform)
864 {
865 struct panfrost_context *ctx = batch->ctx;
866 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
867 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
868 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
869 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
870
871 assert(dim);
872 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
873
874 if (dim > 1)
875 uniform->i[1] = u_minify(tex->texture->height0,
876 tex->u.tex.first_level);
877
878 if (dim > 2)
879 uniform->i[2] = u_minify(tex->texture->depth0,
880 tex->u.tex.first_level);
881
882 if (is_array)
883 uniform->i[dim] = tex->texture->array_size;
884 }
885
886 static void
887 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
888 enum pipe_shader_type st,
889 unsigned ssbo_id,
890 struct sysval_uniform *uniform)
891 {
892 struct panfrost_context *ctx = batch->ctx;
893
894 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
895 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
896
897 /* Compute address */
898 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
899
900 panfrost_batch_add_bo(batch, bo,
901 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
902 panfrost_bo_access_for_stage(st));
903
904 /* Upload address and size as sysval */
905 uniform->du[0] = bo->gpu + sb.buffer_offset;
906 uniform->u[2] = sb.buffer_size;
907 }
908
909 static void
910 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
911 enum pipe_shader_type st,
912 unsigned samp_idx,
913 struct sysval_uniform *uniform)
914 {
915 struct panfrost_context *ctx = batch->ctx;
916 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
917
918 uniform->f[0] = sampl->min_lod;
919 uniform->f[1] = sampl->max_lod;
920 uniform->f[2] = sampl->lod_bias;
921
922 /* Even without any errata, Midgard represents "no mipmapping" as
923 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
924 * panfrost_create_sampler_state which also explains our choice of
925 * epsilon value (again to keep behaviour consistent) */
926
927 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
928 uniform->f[1] = uniform->f[0] + (1.0/256.0);
929 }
930
931 static void
932 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
933 struct sysval_uniform *uniform)
934 {
935 struct panfrost_context *ctx = batch->ctx;
936
937 uniform->u[0] = ctx->compute_grid->grid[0];
938 uniform->u[1] = ctx->compute_grid->grid[1];
939 uniform->u[2] = ctx->compute_grid->grid[2];
940 }
941
942 static void
943 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
944 struct panfrost_shader_state *ss,
945 enum pipe_shader_type st)
946 {
947 struct sysval_uniform *uniforms = (void *)buf;
948
949 for (unsigned i = 0; i < ss->sysval_count; ++i) {
950 int sysval = ss->sysval[i];
951
952 switch (PAN_SYSVAL_TYPE(sysval)) {
953 case PAN_SYSVAL_VIEWPORT_SCALE:
954 panfrost_upload_viewport_scale_sysval(batch,
955 &uniforms[i]);
956 break;
957 case PAN_SYSVAL_VIEWPORT_OFFSET:
958 panfrost_upload_viewport_offset_sysval(batch,
959 &uniforms[i]);
960 break;
961 case PAN_SYSVAL_TEXTURE_SIZE:
962 panfrost_upload_txs_sysval(batch, st,
963 PAN_SYSVAL_ID(sysval),
964 &uniforms[i]);
965 break;
966 case PAN_SYSVAL_SSBO:
967 panfrost_upload_ssbo_sysval(batch, st,
968 PAN_SYSVAL_ID(sysval),
969 &uniforms[i]);
970 break;
971 case PAN_SYSVAL_NUM_WORK_GROUPS:
972 panfrost_upload_num_work_groups_sysval(batch,
973 &uniforms[i]);
974 break;
975 case PAN_SYSVAL_SAMPLER:
976 panfrost_upload_sampler_sysval(batch, st,
977 PAN_SYSVAL_ID(sysval),
978 &uniforms[i]);
979 break;
980 default:
981 assert(0);
982 }
983 }
984 }
985
986 static const void *
987 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
988 unsigned index)
989 {
990 struct pipe_constant_buffer *cb = &buf->cb[index];
991 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
992
993 if (rsrc)
994 return rsrc->bo->cpu;
995 else if (cb->user_buffer)
996 return cb->user_buffer;
997 else
998 unreachable("No constant buffer");
999 }
1000
1001 void
1002 panfrost_emit_const_buf(struct panfrost_batch *batch,
1003 enum pipe_shader_type stage,
1004 struct mali_vertex_tiler_postfix *postfix)
1005 {
1006 struct panfrost_context *ctx = batch->ctx;
1007 struct panfrost_shader_variants *all = ctx->shader[stage];
1008
1009 if (!all)
1010 return;
1011
1012 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1013
1014 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1015
1016 /* Uniforms are implicitly UBO #0 */
1017 bool has_uniforms = buf->enabled_mask & (1 << 0);
1018
1019 /* Allocate room for the sysval and the uniforms */
1020 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1021 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1022 size_t size = sys_size + uniform_size;
1023 struct panfrost_transfer transfer =
1024 panfrost_pool_alloc_aligned(&batch->pool, size, 16);
1025
1026 /* Upload sysvals requested by the shader */
1027 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1028
1029 /* Upload uniforms */
1030 if (has_uniforms && uniform_size) {
1031 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1032 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1033 }
1034
1035 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1036 * uploaded, so it's always included. The count is the highest UBO
1037 * addressable -- gaps are included. */
1038
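/* e.g. enabled_mask = 0b1001 (UBOs 0 and 3 bound) gives 32 - clz(0b1001) =
 * 4, so four descriptors are emitted and the unbound slots 1 and 2 get
 * null entries in the loop below. */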
1039 unsigned ubo_count = 32 - __builtin_clz(buf->enabled_mask | 1);
1040
1041 size_t sz = MALI_UNIFORM_BUFFER_LENGTH * ubo_count;
1042 struct panfrost_transfer ubos =
1043 panfrost_pool_alloc_aligned(&batch->pool, sz,
1044 MALI_UNIFORM_BUFFER_LENGTH);
1045
1046 uint64_t *ubo_ptr = (uint64_t *) ubos.cpu;
1047
1048 /* Upload uniforms as a UBO */
1049
1050 if (size) {
1051 pan_pack(ubo_ptr, UNIFORM_BUFFER, cfg) {
1052 cfg.entries = DIV_ROUND_UP(size, 16);
1053 cfg.pointer = transfer.gpu;
1054 }
1055 } else {
1056 *ubo_ptr = 0;
1057 }
1058
1059 /* The rest are honest-to-goodness UBOs */
1060
1061 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1062 size_t usz = buf->cb[ubo].buffer_size;
1063 bool enabled = buf->enabled_mask & (1 << ubo);
1064 bool empty = usz == 0;
1065
1066 if (!enabled || empty) {
1067 ubo_ptr[ubo] = 0;
1068 continue;
1069 }
1070
1071 pan_pack(ubo_ptr + ubo, UNIFORM_BUFFER, cfg) {
1072 cfg.entries = DIV_ROUND_UP(usz, 16);
1073 cfg.pointer = panfrost_map_constant_buffer_gpu(batch,
1074 stage, buf, ubo);
1075 }
1076 }
1077
1078 postfix->uniforms = transfer.gpu;
1079 postfix->uniform_buffers = ubos.gpu;
1080
1081 buf->dirty_mask = 0;
1082 }
1083
1084 void
1085 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1086 const struct pipe_grid_info *info,
1087 struct midgard_payload_vertex_tiler *vtp)
1088 {
1089 struct panfrost_context *ctx = batch->ctx;
1090 struct panfrost_device *dev = pan_device(ctx->base.screen);
1091 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1092 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1093 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1094 128));
1095
1096 unsigned log2_instances =
1097 util_logbase2_ceil(info->grid[0]) +
1098 util_logbase2_ceil(info->grid[1]) +
1099 util_logbase2_ceil(info->grid[2]);
1100
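/* Rounding each grid dimension up to a power of two may over-allocate,
 * but it guarantees enough shared storage however the workgroups end up
 * distributed across cores. */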
1101 unsigned shared_size = single_size * (1 << log2_instances) * dev->core_count;
1102 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1103 shared_size,
1104 1);
1105
1106 struct mali_shared_memory shared = {
1107 .shared_memory = bo->gpu,
1108 .shared_workgroup_count = log2_instances,
1109 .shared_shift = util_logbase2(single_size) + 1
1110 };
1111
1112 vtp->postfix.shared_memory = panfrost_pool_upload_aligned(&batch->pool, &shared,
1113 sizeof(shared), 64);
1114 }
1115
1116 static mali_ptr
1117 panfrost_get_tex_desc(struct panfrost_batch *batch,
1118 enum pipe_shader_type st,
1119 struct panfrost_sampler_view *view)
1120 {
1121 if (!view)
1122 return (mali_ptr) 0;
1123
1124 struct pipe_sampler_view *pview = &view->base;
1125 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1126
1127 /* Add the BO to the job so it's retained until the job is done. */
1128
1129 panfrost_batch_add_bo(batch, rsrc->bo,
1130 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1131 panfrost_bo_access_for_stage(st));
1132
1133 panfrost_batch_add_bo(batch, view->bo,
1134 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1135 panfrost_bo_access_for_stage(st));
1136
1137 return view->bo->gpu;
1138 }
1139
1140 static void
1141 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
1142 struct pipe_context *pctx)
1143 {
1144 struct panfrost_resource *rsrc = pan_resource(view->base.texture);
1145 if (view->texture_bo != rsrc->bo->gpu ||
1146 view->modifier != rsrc->modifier) {
1147 panfrost_bo_unreference(view->bo);
1148 panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
1149 }
1150 }
1151
1152 void
1153 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1154 enum pipe_shader_type stage,
1155 struct mali_vertex_tiler_postfix *postfix)
1156 {
1157 struct panfrost_context *ctx = batch->ctx;
1158 struct panfrost_device *device = pan_device(ctx->base.screen);
1159
1160 if (!ctx->sampler_view_count[stage])
1161 return;
1162
1163 if (device->quirks & IS_BIFROST) {
1164 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1165 MALI_BIFROST_TEXTURE_LENGTH *
1166 ctx->sampler_view_count[stage],
1167 MALI_BIFROST_TEXTURE_LENGTH);
1168
1169 struct mali_bifrost_texture_packed *out =
1170 (struct mali_bifrost_texture_packed *) T.cpu;
1171
1172 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1173 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1174 struct pipe_sampler_view *pview = &view->base;
1175 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1176
1177 panfrost_update_sampler_view(view, &ctx->base);
1178 out[i] = view->bifrost_descriptor;
1179
1180 /* Add the BOs to the job so they are retained until the job is done. */
1181
1182 panfrost_batch_add_bo(batch, rsrc->bo,
1183 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1184 panfrost_bo_access_for_stage(stage));
1185
1186 panfrost_batch_add_bo(batch, view->bo,
1187 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1188 panfrost_bo_access_for_stage(stage));
1189 }
1190
1191 postfix->textures = T.gpu;
1192 } else {
1193 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1194
1195 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1196 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1197
1198 panfrost_update_sampler_view(view, &ctx->base);
1199
1200 trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1201 }
1202
1203 postfix->textures = panfrost_pool_upload_aligned(&batch->pool,
1204 trampolines,
1205 sizeof(uint64_t) *
1206 ctx->sampler_view_count[stage],
1207 sizeof(uint64_t));
1208 }
1209 }
1210
1211 void
1212 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1213 enum pipe_shader_type stage,
1214 struct mali_vertex_tiler_postfix *postfix)
1215 {
1216 struct panfrost_context *ctx = batch->ctx;
1217
1218 if (!ctx->sampler_count[stage])
1219 return;
1220
1221 size_t desc_size = MALI_BIFROST_SAMPLER_LENGTH;
1222 assert(MALI_BIFROST_SAMPLER_LENGTH == MALI_MIDGARD_SAMPLER_LENGTH);
1223
1224 size_t sz = desc_size * ctx->sampler_count[stage];
1225 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool, sz, desc_size);
1226 struct mali_midgard_sampler_packed *out = (struct mali_midgard_sampler_packed *) T.cpu;
1227
1228 for (unsigned i = 0; i < ctx->sampler_count[stage]; ++i)
1229 out[i] = ctx->samplers[stage][i]->hw;
1230
1231 postfix->sampler_descriptor = T.gpu;
1232 }
1233
1234 void
1235 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1236 struct mali_vertex_tiler_postfix *vertex_postfix)
1237 {
1238 struct panfrost_context *ctx = batch->ctx;
1239 struct panfrost_vertex_state *so = ctx->vertex;
1240 struct panfrost_shader_state *vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1241
1242 unsigned instance_shift = vertex_postfix->instance_shift;
1243 unsigned instance_odd = vertex_postfix->instance_odd;
1244
1245 /* Worst case: everything is NPOT, which is only possible if instancing
1246 * is enabled. Otherwise a single record is guaranteed */
1247 bool could_npot = instance_shift || instance_odd;
1248
1249 struct panfrost_transfer S = panfrost_pool_alloc_aligned(&batch->pool,
1250 MALI_ATTRIBUTE_BUFFER_LENGTH * vs->attribute_count *
1251 (could_npot ? 2 : 1),
1252 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
1253
1254 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1255 MALI_ATTRIBUTE_LENGTH * vs->attribute_count,
1256 MALI_ATTRIBUTE_LENGTH);
1257
1258 struct mali_attribute_buffer_packed *bufs =
1259 (struct mali_attribute_buffer_packed *) S.cpu;
1260
1261 struct mali_attribute_packed *out =
1262 (struct mali_attribute_packed *) T.cpu;
1263
1264 unsigned attrib_to_buffer[PIPE_MAX_ATTRIBS] = { 0 };
1265 unsigned k = 0;
1266
1267 for (unsigned i = 0; i < so->num_elements; ++i) {
1268 /* We map buffers 1:1 with the attributes, which
1269 * means duplicating some vertex buffers (who cares? aside from
1270 * maybe some caching implications but I somehow doubt that
1271 * matters) */
1272
1273 struct pipe_vertex_element *elem = &so->pipe[i];
1274 unsigned vbi = elem->vertex_buffer_index;
1275 attrib_to_buffer[i] = k;
1276
1277 if (!(ctx->vb_mask & (1 << vbi)))
1278 continue;
1279
1280 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1281 struct panfrost_resource *rsrc;
1282
1283 rsrc = pan_resource(buf->buffer.resource);
1284 if (!rsrc)
1285 continue;
1286
1287 /* Add a dependency of the batch on the vertex buffer */
1288 panfrost_batch_add_bo(batch, rsrc->bo,
1289 PAN_BO_ACCESS_SHARED |
1290 PAN_BO_ACCESS_READ |
1291 PAN_BO_ACCESS_VERTEX_TILER);
1292
1293 /* Mask off lower bits, see offset fixup below */
1294 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1295 mali_ptr addr = raw_addr & ~63;
1296
1297 /* Since we advanced the base pointer, we shrink the buffer
1298 * size, but add the offset we subtracted */
1299 unsigned size = rsrc->base.width0 + (raw_addr - addr)
1300 - buf->buffer_offset;
1301
1302 /* When there is a divisor, the hardware-level divisor is
1303 * the product of the instance divisor and the padded count */
1304 unsigned divisor = elem->instance_divisor;
1305 unsigned hw_divisor = ctx->padded_count * divisor;
1306 unsigned stride = buf->stride;
1307
1308 /* If there's a divisor(=1) but no instancing, we want every
1309 * attribute to be the same */
1310
1311 if (divisor && ctx->instance_count == 1)
1312 stride = 0;
1313
1314 if (!divisor || ctx->instance_count <= 1) {
1315 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1316 if (ctx->instance_count > 1)
1317 cfg.type = MALI_ATTRIBUTE_TYPE_1D_MODULUS;
1318
1319 cfg.pointer = addr;
1320 cfg.stride = stride;
1321 cfg.size = size;
1322 cfg.divisor_r = instance_shift;
1323 cfg.divisor_p = instance_odd;
1324 }
1325 } else if (util_is_power_of_two_or_zero(hw_divisor)) {
1326 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1327 cfg.type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
1328 cfg.pointer = addr;
1329 cfg.stride = stride;
1330 cfg.size = size;
1331 cfg.divisor_r = __builtin_ctz(hw_divisor);
1332 }
1333
1334 } else {
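/* An NPOT divisor cannot be a simple shift, so the division is done by
 * multiplying with a precomputed "magic" reciprocal; the multiplier goes
 * in a continuation record while the shift and extra flags stay in the
 * attribute buffer record itself. */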
1335 unsigned shift = 0, extra_flags = 0;
1336
1337 unsigned magic_divisor =
1338 panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags);
1339
1340 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1341 cfg.type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
1342 cfg.pointer = addr;
1343 cfg.stride = stride;
1344 cfg.size = size;
1345
1346 cfg.divisor_r = shift;
1347 cfg.divisor_e = extra_flags;
1348 }
1349
1350 pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) {
1351 cfg.divisor_numerator = magic_divisor;
1352 cfg.divisor = divisor;
1353 }
1354
1355 ++k;
1356 }
1357
1358 ++k;
1359 }
1360
1361 /* Add special gl_VertexID/gl_InstanceID buffers */
1362
1363 if (unlikely(vs->attribute_count >= PAN_VERTEX_ID)) {
1364 panfrost_vertex_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1365
1366 pan_pack(out + PAN_VERTEX_ID, ATTRIBUTE, cfg) {
1367 cfg.buffer_index = k++;
1368 cfg.format = so->formats[PAN_VERTEX_ID];
1369 }
1370
1371 panfrost_instance_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1372
1373 pan_pack(out + PAN_INSTANCE_ID, ATTRIBUTE, cfg) {
1374 cfg.buffer_index = k++;
1375 cfg.format = so->formats[PAN_INSTANCE_ID];
1376 }
1377 }
1378
1379 /* Attribute addresses require 64-byte alignment, so let:
1380 *
1381 * base' = base & ~63 = base - (base & 63)
1382 * offset' = offset + (base & 63)
1383 *
1384 * Since base' + offset' = base + offset, these are equivalent
1385 * addressing modes and now base is 64 aligned.
1386 */
1387
1388 unsigned start = vertex_postfix->offset_start;
1389
1390 for (unsigned i = 0; i < so->num_elements; ++i) {
1391 unsigned vbi = so->pipe[i].vertex_buffer_index;
1392 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1393
1394 /* Adjust by the masked off bits of the offset. Make sure we
1395 * read src_offset from the CPU-side element state (so->pipe)
1396 * rather than from the GPU-visible descriptor due to caching effects */
1397
1398 unsigned src_offset = so->pipe[i].src_offset;
1399
1400 /* BOs aligned to 4k so guaranteed aligned to 64 */
1401 src_offset += (buf->buffer_offset & 63);
1402
1403 /* Also, somewhat obscurely per-instance data needs to be
1404 * offset in response to a delayed start in an indexed draw */
1405
1406 if (so->pipe[i].instance_divisor && ctx->instance_count > 1 && start)
1407 src_offset -= buf->stride * start;
1408
1409 pan_pack(out + i, ATTRIBUTE, cfg) {
1410 cfg.buffer_index = attrib_to_buffer[i];
1411 cfg.format = so->formats[i];
1412 cfg.offset = src_offset;
1413 }
1414 }
1415
1416 vertex_postfix->attributes = S.gpu;
1417 vertex_postfix->attribute_meta = T.gpu;
1418 }
1419
1420 static mali_ptr
1421 panfrost_emit_varyings(struct panfrost_batch *batch,
1422 struct mali_attribute_buffer_packed *slot,
1423 unsigned stride, unsigned count)
1424 {
1425 unsigned size = stride * count;
1426 mali_ptr ptr = panfrost_pool_alloc_aligned(&batch->invisible_pool, size, 64).gpu;
1427
1428 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1429 cfg.stride = stride;
1430 cfg.size = size;
1431 cfg.pointer = ptr;
1432 }
1433
1434 return ptr;
1435 }
1436
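/* A streamout target's offset need not be 64-byte aligned, but the
 * attribute buffer pointer below is rounded down to 64 bytes, so this
 * returns the remainder to be re-applied as the varying record's offset. */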
1437 static unsigned
1438 panfrost_streamout_offset(unsigned stride, unsigned offset,
1439 struct pipe_stream_output_target *target)
1440 {
1441 return (target->buffer_offset + (offset * stride * 4)) & 63;
1442 }
1443
1444 static void
1445 panfrost_emit_streamout(struct panfrost_batch *batch,
1446 struct mali_attribute_buffer_packed *slot,
1447 unsigned stride_words, unsigned offset, unsigned count,
1448 struct pipe_stream_output_target *target)
1449 {
1450 unsigned stride = stride_words * 4;
1451 unsigned max_size = target->buffer_size;
1452 unsigned expected_size = stride * count;
1453
1454 /* Grab the BO and bind it to the batch */
1455 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1456
1457 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1458 * the perspective of the TILER and FRAGMENT.
1459 */
1460 panfrost_batch_add_bo(batch, bo,
1461 PAN_BO_ACCESS_SHARED |
1462 PAN_BO_ACCESS_RW |
1463 PAN_BO_ACCESS_VERTEX_TILER |
1464 PAN_BO_ACCESS_FRAGMENT);
1465
1466 /* We will have an offset applied to get alignment */
1467 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * stride);
1468
1469 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1470 cfg.pointer = (addr & ~63);
1471 cfg.stride = stride;
1472 cfg.size = MIN2(max_size, expected_size) + (addr & 63);
1473 }
1474 }
1475
1476 static bool
1477 has_point_coord(unsigned mask, gl_varying_slot loc)
1478 {
1479 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1480 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1481 else if (loc == VARYING_SLOT_PNTC)
1482 return (mask & (1 << 8));
1483 else
1484 return false;
1485 }
1486
1487 /* Helpers for manipulating stream out information so we can pack varyings
1488 * accordingly. Compute the src_offset for a given captured varying */
1489
1490 static struct pipe_stream_output *
1491 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1492 {
1493 for (unsigned i = 0; i < info->num_outputs; ++i) {
1494 if (info->output[i].register_index == loc)
1495 return &info->output[i];
1496 }
1497
1498 unreachable("Varying not captured");
1499 }
1500
1501 static unsigned
1502 pan_varying_size(enum mali_format fmt)
1503 {
1504 unsigned type = MALI_EXTRACT_TYPE(fmt);
1505 unsigned chan = MALI_EXTRACT_CHANNELS(fmt);
1506 unsigned bits = MALI_EXTRACT_BITS(fmt);
1507 unsigned bpc = 0;
1508
1509 if (bits == MALI_CHANNEL_FLOAT) {
1510 /* No doubles */
1511 bool fp16 = (type == MALI_FORMAT_SINT);
1512 assert(fp16 || (type == MALI_FORMAT_UNORM));
1513
1514 bpc = fp16 ? 2 : 4;
1515 } else {
1516 assert(type >= MALI_FORMAT_SNORM && type <= MALI_FORMAT_SINT);
1517
1518 /* See the enums */
1519 bits = 1 << bits;
1520 assert(bits >= 8);
1521 bpc = bits / 8;
1522 }
1523
1524 return bpc * chan;
1525 }
1526
1527 /* Indices for named (non-XFB) varyings that are present. These are packed
1528 * tightly so they correspond to a bitfield present (P) indexed by (1 <<
1529 * PAN_VARY_*). This has the nice property that you can lookup the buffer index
1530 * of a given special field given a shift S by:
1531 *
1532 * idx = popcount(P & ((1 << S) - 1))
1533 *
1534 * That is: count all of the varyings that come earlier; that count is the
1535 * new index. Likewise, the total number of special
1536 * buffers required is simply popcount(P)
1537 */
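/* For example, with P = 0b1011 (GENERAL, POSITION and PNTCOORD present),
 * the buffer index of PNTCOORD (S = 3) is popcount(0b1011 & 0b0111) = 2. */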
1538
1539 enum pan_special_varying {
1540 PAN_VARY_GENERAL = 0,
1541 PAN_VARY_POSITION = 1,
1542 PAN_VARY_PSIZ = 2,
1543 PAN_VARY_PNTCOORD = 3,
1544 PAN_VARY_FACE = 4,
1545 PAN_VARY_FRAGCOORD = 5,
1546
1547 /* Keep last */
1548 PAN_VARY_MAX,
1549 };
1550
1551 /* Given a varying, figure out which index it corresponds to */
1552
1553 static inline unsigned
1554 pan_varying_index(unsigned present, enum pan_special_varying v)
1555 {
1556 unsigned mask = (1 << v) - 1;
1557 return util_bitcount(present & mask);
1558 }
1559
1560 /* Get the base offset for XFB buffers, which by convention come after
1561 * everything else. Wrapper function for semantic reasons; by construction this
1562 * is just popcount. */
1563
1564 static inline unsigned
1565 pan_xfb_base(unsigned present)
1566 {
1567 return util_bitcount(present);
1568 }
1569
1570 /* Computes the present mask for varyings so we can start emitting varying records */
1571
1572 static inline unsigned
1573 pan_varying_present(
1574 struct panfrost_shader_state *vs,
1575 struct panfrost_shader_state *fs,
1576 unsigned quirks)
1577 {
1578 /* At the moment we always emit general and position buffers. Not
1579 * strictly necessary but usually harmless */
1580
1581 unsigned present = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION);
1582
1583 /* Enable special buffers by the shader info */
1584
1585 if (vs->writes_point_size)
1586 present |= (1 << PAN_VARY_PSIZ);
1587
1588 if (fs->reads_point_coord)
1589 present |= (1 << PAN_VARY_PNTCOORD);
1590
1591 if (fs->reads_face)
1592 present |= (1 << PAN_VARY_FACE);
1593
1594 if (fs->reads_frag_coord && !(quirks & IS_BIFROST))
1595 present |= (1 << PAN_VARY_FRAGCOORD);
1596
1597 /* Also, if we have a point sprite, we need a point coord buffer */
1598
1599 for (unsigned i = 0; i < fs->varying_count; i++) {
1600 gl_varying_slot loc = fs->varyings_loc[i];
1601
1602 if (has_point_coord(fs->point_sprite_mask, loc))
1603 present |= (1 << PAN_VARY_PNTCOORD);
1604 }
1605
1606 return present;
1607 }
1608
1609 /* Emitters for varying records */
1610
1611 static void
1612 pan_emit_vary(struct mali_attribute_packed *out,
1613 unsigned present, enum pan_special_varying buf,
1614 unsigned quirks, enum mali_format format,
1615 unsigned offset)
1616 {
1617 unsigned nr_channels = MALI_EXTRACT_CHANNELS(format);
1618 unsigned swizzle = quirks & HAS_SWIZZLES ?
1619 panfrost_get_default_swizzle(nr_channels) :
1620 panfrost_bifrost_swizzle(nr_channels);
1621
1622 pan_pack(out, ATTRIBUTE, cfg) {
1623 cfg.buffer_index = pan_varying_index(present, buf);
1624 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1625 cfg.format = (format << 12) | swizzle;
1626 cfg.offset = offset;
1627 }
1628 }
1629
1630 /* General varying that is unused */
1631
1632 static void
1633 pan_emit_vary_only(struct mali_attribute_packed *out,
1634 unsigned present, unsigned quirks)
1635 {
1636 pan_emit_vary(out, present, 0, quirks, MALI_VARYING_DISCARD, 0);
1637 }
1638
1639 /* Special records */
1640
1641 static const enum mali_format pan_varying_formats[PAN_VARY_MAX] = {
1642 [PAN_VARY_POSITION] = MALI_VARYING_POS,
1643 [PAN_VARY_PSIZ] = MALI_R16F,
1644 [PAN_VARY_PNTCOORD] = MALI_R16F,
1645 [PAN_VARY_FACE] = MALI_R32I,
1646 [PAN_VARY_FRAGCOORD] = MALI_RGBA32F
1647 };
1648
1649 static void
1650 pan_emit_vary_special(struct mali_attribute_packed *out,
1651 unsigned present, enum pan_special_varying buf,
1652 unsigned quirks)
1653 {
1654 assert(buf < PAN_VARY_MAX);
1655 pan_emit_vary(out, present, buf, quirks, pan_varying_formats[buf], 0);
1656 }
1657
1658 static enum mali_format
1659 pan_xfb_format(enum mali_format format, unsigned nr)
1660 {
1661 if (MALI_EXTRACT_BITS(format) == MALI_CHANNEL_FLOAT)
1662 return MALI_R32F | MALI_NR_CHANNELS(nr);
1663 else
1664 return MALI_EXTRACT_TYPE(format) | MALI_NR_CHANNELS(nr) | MALI_CHANNEL_32;
1665 }
1666
1667 /* Transform feedback records. Note struct pipe_stream_output is (if packed as
1668 * a bitfield) 32-bit, smaller than a 64-bit pointer, so may as well pass by
1669 * value. */
1670
1671 static void
1672 pan_emit_vary_xfb(struct mali_attribute_packed *out,
1673 unsigned present,
1674 unsigned max_xfb,
1675 unsigned *streamout_offsets,
1676 unsigned quirks,
1677 enum mali_format format,
1678 struct pipe_stream_output o)
1679 {
1680 unsigned swizzle = quirks & HAS_SWIZZLES ?
1681 panfrost_get_default_swizzle(o.num_components) :
1682 panfrost_bifrost_swizzle(o.num_components);
1683
1684 pan_pack(out, ATTRIBUTE, cfg) {
1685 /* XFB buffers come after everything else */
1686 cfg.buffer_index = pan_xfb_base(present) + o.output_buffer;
1687 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1688
1689 /* Override number of channels and precision to highp */
1690 cfg.format = (pan_xfb_format(format, o.num_components) << 12) | swizzle;
1691
1692 /* Apply given offsets together */
1693 cfg.offset = (o.dst_offset * 4) /* dwords */
1694 + streamout_offsets[o.output_buffer];
1695 }
1696 }
1697
1698 /* Determine if we should capture a varying for XFB. This requires actually
1699 * having a buffer for it. If we don't capture it, we'll fallback to a general
1700 * varying path (linked or unlinked, possibly discarding the write) */
1701
1702 static bool
1703 panfrost_xfb_captured(struct panfrost_shader_state *xfb,
1704 unsigned loc, unsigned max_xfb)
1705 {
1706 if (!(xfb->so_mask & (1ll << loc)))
1707 return false;
1708
1709 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1710 return o->output_buffer < max_xfb;
1711 }
1712
1713 static void
1714 pan_emit_general_varying(struct mali_attribute_packed *out,
1715 struct panfrost_shader_state *other,
1716 struct panfrost_shader_state *xfb,
1717 gl_varying_slot loc,
1718 enum mali_format format,
1719 unsigned present,
1720 unsigned quirks,
1721 unsigned *gen_offsets,
1722 enum mali_format *gen_formats,
1723 unsigned *gen_stride,
1724 unsigned idx,
1725 bool should_alloc)
1726 {
1727 /* Check if we're linked */
1728 signed other_idx = -1;
1729
1730 for (unsigned j = 0; j < other->varying_count; ++j) {
1731 if (other->varyings_loc[j] == loc) {
1732 other_idx = j;
1733 break;
1734 }
1735 }
1736
1737 if (other_idx < 0) {
1738 pan_emit_vary_only(out, present, quirks);
1739 return;
1740 }
1741
1742 unsigned offset = gen_offsets[other_idx];
1743
1744 if (should_alloc) {
1745 /* We're linked, so allocate space via a watermark allocation */
1746 enum mali_format alt = other->varyings[other_idx];
1747
1748 /* Do interpolation at minimum precision */
1749 unsigned size_main = pan_varying_size(format);
1750 unsigned size_alt = pan_varying_size(alt);
1751 unsigned size = MIN2(size_main, size_alt);
1752
1753 /* If a varying is marked for XFB but not actually captured, we
1754 * should match the format to the format that would otherwise
1755 * be used for XFB, since dEQP checks for invariance here. It's
1756 * unclear if this is required by the spec. */
1757
1758 if (xfb->so_mask & (1ull << loc)) {
1759 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1760 format = pan_xfb_format(format, o->num_components);
1761 size = pan_varying_size(format);
1762 } else if (size == size_alt) {
1763 format = alt;
1764 }
1765
1766 gen_offsets[idx] = *gen_stride;
1767 gen_formats[other_idx] = format;
1768 offset = *gen_stride;
1769 *gen_stride += size;
1770 }
1771
1772 pan_emit_vary(out, present, PAN_VARY_GENERAL, quirks, format, offset);
1773 }
1774
1775 /* Higher-level wrapper around all of the above, classifying a varying into one
1776 * of those types and emitting the corresponding record */
1777
1778 static void
1779 panfrost_emit_varying(
1780 struct mali_attribute_packed *out,
1781 struct panfrost_shader_state *stage,
1782 struct panfrost_shader_state *other,
1783 struct panfrost_shader_state *xfb,
1784 unsigned present,
1785 unsigned max_xfb,
1786 unsigned *streamout_offsets,
1787 unsigned quirks,
1788 unsigned *gen_offsets,
1789 enum mali_format *gen_formats,
1790 unsigned *gen_stride,
1791 unsigned idx,
1792 bool should_alloc,
1793 bool is_fragment)
1794 {
1795 gl_varying_slot loc = stage->varyings_loc[idx];
1796 enum mali_format format = stage->varyings[idx];
1797
1798 /* Override format to match linkage */
1799 if (!should_alloc && gen_formats[idx])
1800 format = gen_formats[idx];
1801
1802 if (has_point_coord(stage->point_sprite_mask, loc)) {
1803 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1804 } else if (panfrost_xfb_captured(xfb, loc, max_xfb)) {
1805 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1806 pan_emit_vary_xfb(out, present, max_xfb, streamout_offsets, quirks, format, *o);
1807 } else if (loc == VARYING_SLOT_POS) {
1808 if (is_fragment)
1809 pan_emit_vary_special(out, present, PAN_VARY_FRAGCOORD, quirks);
1810 else
1811 pan_emit_vary_special(out, present, PAN_VARY_POSITION, quirks);
1812 } else if (loc == VARYING_SLOT_PSIZ) {
1813 pan_emit_vary_special(out, present, PAN_VARY_PSIZ, quirks);
1814 } else if (loc == VARYING_SLOT_PNTC) {
1815 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1816 } else if (loc == VARYING_SLOT_FACE) {
1817 pan_emit_vary_special(out, present, PAN_VARY_FACE, quirks);
1818 } else {
1819 pan_emit_general_varying(out, other, xfb, loc, format, present,
1820 quirks, gen_offsets, gen_formats, gen_stride,
1821 idx, should_alloc);
1822 }
1823 }
1824
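/* If the given special varying is present, emit an attribute buffer record
 * tagged with the corresponding special input (point coord, front facing,
 * fragcoord) instead of a pointer to memory */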
1825 static void
1826 pan_emit_special_input(struct mali_attribute_buffer_packed *out,
1827 unsigned present,
1828 enum pan_special_varying v,
1829 unsigned special)
1830 {
1831 if (present & (1 << v)) {
1832 unsigned idx = pan_varying_index(present, v);
1833
1834 pan_pack(out + idx, ATTRIBUTE_BUFFER, cfg) {
1835 cfg.special = special;
1836 cfg.type = 0;
1837 }
1838 }
1839 }
1840
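/* Emits the complete varying state for a draw: attribute records for the
 * vertex and fragment shaders (linked against each other and against
 * transform feedback), the backing attribute buffers (general varyings,
 * gl_Position, gl_PointSize, streamout targets and special inputs), and the
 * GPU pointers wired into the vertex/tiler postfixes */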
1841 void
1842 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1843 unsigned vertex_count,
1844 struct mali_vertex_tiler_postfix *vertex_postfix,
1845 struct mali_vertex_tiler_postfix *tiler_postfix,
1846 union midgard_primitive_size *primitive_size)
1847 {
1848 /* Load the shaders */
1849 struct panfrost_context *ctx = batch->ctx;
1850 struct panfrost_device *dev = pan_device(ctx->base.screen);
1851 struct panfrost_shader_state *vs, *fs;
1852 size_t vs_size, fs_size;
1853
1854 /* Allocate the varying descriptor */
1855
1856 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1857 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1858 vs_size = MALI_ATTRIBUTE_LENGTH * vs->varying_count;
1859 fs_size = MALI_ATTRIBUTE_LENGTH * fs->varying_count;
1860
1861 struct panfrost_transfer trans = panfrost_pool_alloc_aligned(
1862 &batch->pool, vs_size + fs_size, MALI_ATTRIBUTE_LENGTH);
1863
1864 struct pipe_stream_output_info *so = &vs->stream_output;
1865 unsigned present = pan_varying_present(vs, fs, dev->quirks);
1866
1867 /* Check if this varying is linked by us. This is the case for
1868 * general-purpose, non-captured varyings. If it is, link it. If it's
1869 * not, use the provided stream out information to determine the
1870 * offset, since it was already linked for us. */
1871
1872 unsigned gen_offsets[32];
1873 enum mali_format gen_formats[32];
1874 memset(gen_offsets, 0, sizeof(gen_offsets));
1875 memset(gen_formats, 0, sizeof(gen_formats));
1876
1877 unsigned gen_stride = 0;
1878 assert(vs->varying_count < ARRAY_SIZE(gen_offsets));
1879 assert(fs->varying_count < ARRAY_SIZE(gen_offsets));
1880
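/* Precompute the current offset into each bound streamout target, added to
 * the per-output offset when emitting captured varyings */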
1881 unsigned streamout_offsets[32];
1882
1883 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1884 streamout_offsets[i] = panfrost_streamout_offset(
1885 so->stride[i],
1886 ctx->streamout.offsets[i],
1887 ctx->streamout.targets[i]);
1888 }
1889
1890 struct mali_attribute_packed *ovs = (struct mali_attribute_packed *)trans.cpu;
1891 struct mali_attribute_packed *ofs = ovs + vs->varying_count;
1892
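/* Emit the records: vertex shader varyings first, allocating space in the
 * general buffer, then fragment shader varyings, reusing the offsets and
 * formats recorded by the first pass */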
1893 for (unsigned i = 0; i < vs->varying_count; i++) {
1894 panfrost_emit_varying(ovs + i, vs, fs, vs, present,
1895 ctx->streamout.num_targets, streamout_offsets,
1896 dev->quirks,
1897 gen_offsets, gen_formats, &gen_stride, i, true, false);
1898 }
1899
1900 for (unsigned i = 0; i < fs->varying_count; i++) {
1901 panfrost_emit_varying(ofs + i, fs, vs, vs, present,
1902 ctx->streamout.num_targets, streamout_offsets,
1903 dev->quirks,
1904 gen_offsets, gen_formats, &gen_stride, i, false, true);
1905 }
1906
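/* Allocate the attribute buffer descriptors: one per present varying buffer
 * (xfb_base of them), followed by one per streamout target */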
1907 unsigned xfb_base = pan_xfb_base(present);
1908 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1909 MALI_ATTRIBUTE_BUFFER_LENGTH * (xfb_base + ctx->streamout.num_targets),
1910 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
1911 struct mali_attribute_buffer_packed *varyings =
1912 (struct mali_attribute_buffer_packed *) T.cpu;
1913
1914 /* Emit the stream out buffers */
1915
1916 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1917 ctx->vertex_count);
1918
1919 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1920 panfrost_emit_streamout(batch, &varyings[xfb_base + i],
1921 so->stride[i],
1922 ctx->streamout.offsets[i],
1923 out_count,
1924 ctx->streamout.targets[i]);
1925 }
1926
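/* Back the general varyings with a buffer sized by the stride accumulated
 * during linking */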
1927 panfrost_emit_varyings(batch,
1928 &varyings[pan_varying_index(present, PAN_VARY_GENERAL)],
1929 gen_stride, vertex_count);
1930
1931 /* fp32 vec4 gl_Position */
1932 tiler_postfix->position_varying = panfrost_emit_varyings(batch,
1933 &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
1934 sizeof(float) * 4, vertex_count);
1935
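/* gl_PointSize is emitted as fp16, hence the 2-byte stride */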
1936 if (present & (1 << PAN_VARY_PSIZ)) {
1937 primitive_size->pointer = panfrost_emit_varyings(batch,
1938 &varyings[pan_varying_index(present, PAN_VARY_PSIZ)],
1939 2, vertex_count);
1940 }
1941
1942 pan_emit_special_input(varyings, present, PAN_VARY_PNTCOORD, MALI_ATTRIBUTE_SPECIAL_POINT_COORD);
1943 pan_emit_special_input(varyings, present, PAN_VARY_FACE, MALI_ATTRIBUTE_SPECIAL_FRONT_FACING);
1944 pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD, MALI_ATTRIBUTE_SPECIAL_FRAG_COORD);
1945
1946 vertex_postfix->varyings = T.gpu;
1947 tiler_postfix->varyings = T.gpu;
1948
1949 vertex_postfix->varying_meta = trans.gpu;
1950 tiler_postfix->varying_meta = trans.gpu + vs_size;
1951 }
1952
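/* Wraps the shared prefix/postfix state in the Bifrost or Midgard payload
 * layout and submits the vertex job followed by the dependent tiler job.
 * Wallpaper draws instead inject the pair in reverse order, and rasterizer
 * discard skips the tiler job entirely. */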
1953 void
1954 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
1955 struct mali_vertex_tiler_prefix *vertex_prefix,
1956 struct mali_vertex_tiler_postfix *vertex_postfix,
1957 struct mali_vertex_tiler_prefix *tiler_prefix,
1958 struct mali_vertex_tiler_postfix *tiler_postfix,
1959 union midgard_primitive_size *primitive_size)
1960 {
1961 struct panfrost_context *ctx = batch->ctx;
1962 struct panfrost_device *device = pan_device(ctx->base.screen);
1963 bool wallpapering = ctx->wallpaper_batch && batch->scoreboard.tiler_dep;
1964 struct bifrost_payload_vertex bifrost_vertex = {0,};
1965 struct bifrost_payload_tiler bifrost_tiler = {0,};
1966 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
1967 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
1968 void *vp, *tp;
1969 size_t vp_size, tp_size;
1970
1971 if (device->quirks & IS_BIFROST) {
1972 bifrost_vertex.prefix = *vertex_prefix;
1973 bifrost_vertex.postfix = *vertex_postfix;
1974 vp = &bifrost_vertex;
1975 vp_size = sizeof(bifrost_vertex);
1976
1977 bifrost_tiler.prefix = *tiler_prefix;
1978 bifrost_tiler.tiler.primitive_size = *primitive_size;
1979 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
1980 bifrost_tiler.postfix = *tiler_postfix;
1981 tp = &bifrost_tiler;
1982 tp_size = sizeof(bifrost_tiler);
1983 } else {
1984 midgard_vertex.prefix = *vertex_prefix;
1985 midgard_vertex.postfix = *vertex_postfix;
1986 vp = &midgard_vertex;
1987 vp_size = sizeof(midgard_vertex);
1988
1989 midgard_tiler.prefix = *tiler_prefix;
1990 midgard_tiler.postfix = *tiler_postfix;
1991 midgard_tiler.primitive_size = *primitive_size;
1992 tp = &midgard_tiler;
1993 tp_size = sizeof(midgard_tiler);
1994 }
1995
1996 if (wallpapering) {
1997 /* Inject in reverse order, with "predicted" job indices.
1998 * THIS IS A HACK XXX */
1999 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false,
2000 batch->scoreboard.job_index + 2, tp, tp_size, true);
2001 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2002 vp, vp_size, true);
2003 return;
2004 }
2005
2006 /* If rasterizer discard is enabled, only submit the vertex job */
2007
2008 unsigned vertex = panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2009 vp, vp_size, false);
2010
2011 if (ctx->rasterizer->base.rasterizer_discard)
2012 return;
2013
2014 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false, vertex, tp, tp_size,
2015 false);
2016 }
2017
2018 /* TODO: stop hardcoding this */
2019 mali_ptr
2020 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2021 {
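/* Presumably (x, y) positions in fixed point with (128, 128) the pixel
 * center; uploaded verbatim until the TODO above is addressed */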
2022 uint16_t locations[] = {
2023 128, 128,
2024 0, 256,
2025 0, 256,
2026 0, 256,
2027 0, 256,
2028 0, 256,
2029 0, 256,
2030 0, 256,
2031 0, 256,
2032 0, 256,
2033 0, 256,
2034 0, 256,
2035 0, 256,
2036 0, 256,
2037 0, 256,
2038 0, 256,
2039 0, 256,
2040 0, 256,
2041 0, 256,
2042 0, 256,
2043 0, 256,
2044 0, 256,
2045 0, 256,
2046 0, 256,
2047 0, 256,
2048 0, 256,
2049 0, 256,
2050 0, 256,
2051 0, 256,
2052 0, 256,
2053 0, 256,
2054 0, 256,
2055 128, 128,
2056 0, 0,
2057 0, 0,
2058 0, 0,
2059 0, 0,
2060 0, 0,
2061 0, 0,
2062 0, 0,
2063 0, 0,
2064 0, 0,
2065 0, 0,
2066 0, 0,
2067 0, 0,
2068 0, 0,
2069 0, 0,
2070 0, 0,
2071 };
2072
2073 return panfrost_pool_upload_aligned(&batch->pool, locations, 96 * sizeof(uint16_t), 64);
2074 }