259478f0adab897c76e37ef9df23795746be6049
[mesa.git] / src / gallium / drivers / panfrost / pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_pool.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 struct mali_shared_memory shared = {
62 .shared_workgroup_count = ~0,
63 };
64
65 if (batch->stack_size) {
66 struct panfrost_bo *stack =
67 panfrost_batch_get_scratchpad(batch, batch->stack_size,
68 dev->thread_tls_alloc,
69 dev->core_count);
70
71 shared.stack_shift = panfrost_get_stack_shift(batch->stack_size);
72 shared.scratchpad = stack->gpu;
73 }
74
75 postfix->shared_memory = panfrost_pool_upload_aligned(&batch->pool, &shared, sizeof(shared), 64);
76 }
77
78 static void
79 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
80 struct mali_vertex_tiler_postfix *postfix)
81 {
82 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
83 postfix->shared_memory = panfrost_batch_reserve_framebuffer(batch);
84 }
85
86 static void
87 panfrost_vt_update_rasterizer(struct panfrost_rasterizer *rasterizer,
88 struct mali_vertex_tiler_prefix *prefix,
89 struct mali_vertex_tiler_postfix *postfix)
90 {
91 postfix->gl_enables |= 0x7;
92 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
93 rasterizer->base.front_ccw);
94 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
95 (rasterizer->base.cull_face & PIPE_FACE_FRONT));
96 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
97 (rasterizer->base.cull_face & PIPE_FACE_BACK));
98 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
99 rasterizer->base.flatshade_first);
100 }
101
102 void
103 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
104 struct mali_vertex_tiler_prefix *prefix,
105 union midgard_primitive_size *primitive_size)
106 {
107 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
108
109 if (!panfrost_writes_point_size(ctx)) {
110 float val = (prefix->draw_mode == MALI_DRAW_MODE_POINTS) ?
111 rasterizer->base.point_size :
112 rasterizer->base.line_width;
113
114 primitive_size->constant = val;
115 }
116 }
117
118 static void
119 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
120 struct mali_vertex_tiler_postfix *postfix)
121 {
122 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
123 if (ctx->occlusion_query) {
124 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
125 panfrost_batch_add_bo(ctx->batch, ctx->occlusion_query->bo,
126 PAN_BO_ACCESS_SHARED |
127 PAN_BO_ACCESS_RW |
128 PAN_BO_ACCESS_FRAGMENT);
129 } else {
130 postfix->occlusion_counter = 0;
131 }
132 }
133
134 void
135 panfrost_vt_init(struct panfrost_context *ctx,
136 enum pipe_shader_type stage,
137 struct mali_vertex_tiler_prefix *prefix,
138 struct mali_vertex_tiler_postfix *postfix)
139 {
140 struct panfrost_device *device = pan_device(ctx->base.screen);
141
142 if (!ctx->shader[stage])
143 return;
144
145 memset(prefix, 0, sizeof(*prefix));
146 memset(postfix, 0, sizeof(*postfix));
147
148 if (device->quirks & IS_BIFROST) {
149 postfix->gl_enables = 0x2;
150 panfrost_vt_emit_shared_memory(ctx, postfix);
151 } else {
152 postfix->gl_enables = 0x6;
153 panfrost_vt_attach_framebuffer(ctx, postfix);
154 }
155
156 if (stage == PIPE_SHADER_FRAGMENT) {
157 panfrost_vt_update_occlusion_query(ctx, postfix);
158 panfrost_vt_update_rasterizer(ctx->rasterizer, prefix, postfix);
159 }
160 }
161
162 static unsigned
163 panfrost_translate_index_size(unsigned size)
164 {
165 switch (size) {
166 case 1:
167 return MALI_DRAW_INDEXED_UINT8;
168
169 case 2:
170 return MALI_DRAW_INDEXED_UINT16;
171
172 case 4:
173 return MALI_DRAW_INDEXED_UINT32;
174
175 default:
176 unreachable("Invalid index size");
177 }
178 }
179
180 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
181 * good for the duration of the draw (transient), though it could last longer. Also get
182 * the bounds on the index buffer for the range accessed by the draw. We do
183 * these operations together because there are natural optimizations which
184 * require them to be together. */
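/* As an illustrative example (numbers chosen arbitrarily): an indexed draw
 * with start=0, count=3 and indices {5, 6, 9} would report min_index=5 and
 * max_index=9, so the caller only shades max - min + 1 = 5 vertices and
 * rebases index fetches with offset_bias_correction = -5 (see
 * panfrost_vt_set_draw_info below). */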
185
186 static mali_ptr
187 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
188 const struct pipe_draw_info *info,
189 unsigned *min_index, unsigned *max_index)
190 {
191 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
192 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
193 off_t offset = info->start * info->index_size;
194 bool needs_indices = true;
195 mali_ptr out = 0;
196
197 if (info->max_index != ~0u) {
198 *min_index = info->min_index;
199 *max_index = info->max_index;
200 needs_indices = false;
201 }
202
203 if (!info->has_user_indices) {
204 /* Only resources can be directly mapped */
205 panfrost_batch_add_bo(batch, rsrc->bo,
206 PAN_BO_ACCESS_SHARED |
207 PAN_BO_ACCESS_READ |
208 PAN_BO_ACCESS_VERTEX_TILER);
209 out = rsrc->bo->gpu + offset;
210
211 /* Check the cache */
212 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
213 info->start,
214 info->count,
215 min_index,
216 max_index);
217 } else {
218 /* Otherwise, we need to upload to transient memory */
219 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
220 struct panfrost_transfer T =
221 panfrost_pool_alloc_aligned(&batch->pool,
222 info->count * info->index_size,
223 info->index_size);
224
225 memcpy(T.cpu, ibuf8 + offset, info->count * info->index_size);
226 out = T.gpu;
227 }
228
229 if (needs_indices) {
230 /* Fallback */
231 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
232
233 if (!info->has_user_indices)
234 panfrost_minmax_cache_add(rsrc->index_cache,
235 info->start, info->count,
236 *min_index, *max_index);
237 }
238
239 return out;
240 }
241
242 void
243 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
244 const struct pipe_draw_info *info,
245 enum mali_draw_mode draw_mode,
246 struct mali_vertex_tiler_postfix *vertex_postfix,
247 struct mali_vertex_tiler_prefix *tiler_prefix,
248 struct mali_vertex_tiler_postfix *tiler_postfix,
249 unsigned *vertex_count,
250 unsigned *padded_count)
251 {
252 tiler_prefix->draw_mode = draw_mode;
253
254 unsigned draw_flags = 0;
255
256 if (panfrost_writes_point_size(ctx))
257 draw_flags |= MALI_DRAW_VARYING_SIZE;
258
259 if (info->primitive_restart)
260 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
261
262 /* These don't make much sense */
263
264 draw_flags |= 0x3000;
265
266 if (info->index_size) {
267 unsigned min_index = 0, max_index = 0;
268
269 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
270 info,
271 &min_index,
272 &max_index);
273
274 /* Use the corresponding values */
275 *vertex_count = max_index - min_index + 1;
276 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
277 tiler_prefix->offset_bias_correction = -min_index;
278 tiler_prefix->index_count = MALI_POSITIVE(info->count);
279 draw_flags |= panfrost_translate_index_size(info->index_size);
280 } else {
281 tiler_prefix->indices = 0;
282 *vertex_count = ctx->vertex_count;
283 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
284 tiler_prefix->offset_bias_correction = 0;
285 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
286 }
287
288 tiler_prefix->unknown_draw = draw_flags;
289
290 /* Encode the padded vertex count */
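/* A sketch of the encoding, assuming padded counts of the form
 * (2k + 1) << shift: e.g. a padded count of 12 = 3 << 2 gives
 * instance_shift = ctz(12) = 2 and instance_odd = 12 >> 3 = 1, from which
 * the hardware can reconstruct (2*1 + 1) << 2 = 12. */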
291
292 if (info->instance_count > 1) {
293 *padded_count = panfrost_padded_vertex_count(*vertex_count);
294
295 unsigned shift = __builtin_ctz(ctx->padded_count);
296 unsigned k = ctx->padded_count >> (shift + 1);
297
298 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
299 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
300 } else {
301 *padded_count = *vertex_count;
302
303 /* Reset instancing state */
304 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
305 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
306 }
307 }
308
309 static unsigned
310 translate_tex_wrap(enum pipe_tex_wrap w)
311 {
312 switch (w) {
313 case PIPE_TEX_WRAP_REPEAT: return MALI_WRAP_MODE_REPEAT;
314 case PIPE_TEX_WRAP_CLAMP: return MALI_WRAP_MODE_CLAMP;
315 case PIPE_TEX_WRAP_CLAMP_TO_EDGE: return MALI_WRAP_MODE_CLAMP_TO_EDGE;
316 case PIPE_TEX_WRAP_CLAMP_TO_BORDER: return MALI_WRAP_MODE_CLAMP_TO_BORDER;
317 case PIPE_TEX_WRAP_MIRROR_REPEAT: return MALI_WRAP_MODE_MIRRORED_REPEAT;
318 case PIPE_TEX_WRAP_MIRROR_CLAMP: return MALI_WRAP_MODE_MIRRORED_CLAMP;
319 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE;
320 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER;
321 default: unreachable("Invalid wrap");
322 }
323 }
324
325 /* The hardware compares in the wrong order, so we have to flip before
326 * encoding. Yes, really. */
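/* A hedged example of what "flip" means here: with this convention a
 * PIPE_FUNC_LESS comparison would presumably be emitted as MALI_FUNC_GREATER
 * (and LEQUAL as GEQUAL), while EQUAL/ALWAYS/NEVER are unaffected -- see
 * panfrost_flip_compare_func for the authoritative mapping. */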
327
328 static enum mali_func
329 panfrost_sampler_compare_func(const struct pipe_sampler_state *cso)
330 {
331 if (!cso->compare_mode)
332 return MALI_FUNC_NEVER;
333
334 enum mali_func f = panfrost_translate_compare_func(cso->compare_func);
335 return panfrost_flip_compare_func(f);
336 }
337
338 static enum mali_mipmap_mode
339 pan_pipe_to_mipmode(enum pipe_tex_mipfilter f)
340 {
341 switch (f) {
342 case PIPE_TEX_MIPFILTER_NEAREST: return MALI_MIPMAP_MODE_NEAREST;
343 case PIPE_TEX_MIPFILTER_LINEAR: return MALI_MIPMAP_MODE_TRILINEAR;
344 case PIPE_TEX_MIPFILTER_NONE: return MALI_MIPMAP_MODE_NONE;
345 default: unreachable("Invalid");
346 }
347 }
348
349 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
350 struct mali_midgard_sampler_packed *hw)
351 {
352 pan_pack(hw, MIDGARD_SAMPLER, cfg) {
353 cfg.magnify_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
354 cfg.minify_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
355 cfg.mipmap_mode = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR) ?
356 MALI_MIPMAP_MODE_TRILINEAR : MALI_MIPMAP_MODE_NEAREST;
357 cfg.normalized_coordinates = cso->normalized_coords;
358
359 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
360
361 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
362
363 /* If necessary, we disable mipmapping in the sampler descriptor by
364 * clamping the LOD as tightly as possible (from 0 to epsilon,
365 * essentially -- remember these are fixed point numbers, so
366 * epsilon=1/256) */
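/* Worked example (assuming the 8 fractional bits that the 1/256 epsilon
 * above implies for FIXED_16): with min_lod = 0 and MIPFILTER_NONE,
 * minimum_lod packs to 0 and maximum_lod to 1 in fixed-point units, i.e.
 * the LOD is clamped to [0, 1/256] and only the base level is sampled. */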
367
368 cfg.maximum_lod = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) ?
369 cfg.minimum_lod + 1 :
370 FIXED_16(cso->max_lod, false);
371
372 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
373 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
374 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
375
376 cfg.compare_function = panfrost_sampler_compare_func(cso);
377 cfg.seamless_cube_map = cso->seamless_cube_map;
378
379 cfg.border_color_r = cso->border_color.f[0];
380 cfg.border_color_g = cso->border_color.f[1];
381 cfg.border_color_b = cso->border_color.f[2];
382 cfg.border_color_a = cso->border_color.f[3];
383 }
384 }
385
386 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
387 struct mali_bifrost_sampler_packed *hw)
388 {
389 pan_pack(hw, BIFROST_SAMPLER, cfg) {
390 cfg.magnify_linear = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR;
391 cfg.minify_linear = cso->min_img_filter == PIPE_TEX_FILTER_LINEAR;
392 cfg.mipmap_mode = pan_pipe_to_mipmode(cso->min_mip_filter);
393 cfg.normalized_coordinates = cso->normalized_coords;
394
395 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
396 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
397 cfg.maximum_lod = FIXED_16(cso->max_lod, false);
398
399 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
400 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
401 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
402
403 cfg.compare_function = panfrost_sampler_compare_func(cso);
404 cfg.seamless_cube_map = cso->seamless_cube_map;
405 }
406 }
407
408 static bool
409 panfrost_fs_required(
410 struct panfrost_shader_state *fs,
411 struct panfrost_blend_final *blend,
412 unsigned rt_count)
413 {
414 /* If we generally have side effects */
415 if (fs->fs_sidefx)
416 return true;
417
418 /* If colour is written we need to execute */
419 for (unsigned i = 0; i < rt_count; ++i) {
420 if (!blend[i].no_colour)
421 return true;
422 }
423
424 /* If depth is written and not implied we need to execute.
425 * TODO: Predicate on Z/S writes being enabled */
426 return (fs->writes_depth || fs->writes_stencil);
427 }
428
429 static void
430 panfrost_emit_blend(struct panfrost_batch *batch, void *rts,
431 struct panfrost_blend_final *blend)
432 {
433 const struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
434 struct panfrost_shader_state *fs = panfrost_get_shader_state(batch->ctx, PIPE_SHADER_FRAGMENT);
435 unsigned rt_count = batch->key.nr_cbufs;
436
437 struct bifrost_blend_rt *brts = rts;
438
439 /* Disable blending for depth-only */
440
441 if (rt_count == 0) {
442 if (dev->quirks & IS_BIFROST) {
443 memset(brts, 0, sizeof(*brts));
444 brts[0].unk2 = 0x3;
445 } else {
446 pan_pack(rts, MIDGARD_BLEND_OPAQUE, cfg) {
447 cfg.equation = 0xf0122122; /* Replace */
448 }
449 }
450 }
451
452 for (unsigned i = 0; i < rt_count; ++i) {
453 struct mali_blend_flags_packed flags = {};
454
455 pan_pack(&flags, BLEND_FLAGS, cfg) {
456 if (blend[i].no_colour) {
457 cfg.enable = false;
458 break;
459 }
460
461 batch->draws |= (PIPE_CLEAR_COLOR0 << i);
462
463 cfg.srgb = util_format_is_srgb(batch->key.cbufs[i]->format);
464 cfg.load_destination = blend[i].load_dest;
465 cfg.dither_disable = !batch->ctx->blend->base.dither;
466
467 if (!(dev->quirks & IS_BIFROST))
468 cfg.midgard_blend_shader = blend[i].is_shader;
469 }
470
471 if (dev->quirks & IS_BIFROST) {
472 memset(brts + i, 0, sizeof(brts[i]));
473 brts[i].flags = flags.opaque[0];
474
475 if (blend[i].is_shader) {
476 /* The blend shader's address needs to have
477 * the same top 32 bits as the fragment shader's.
478 * TODO: Ensure that's always the case.
479 */
480 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
481 (fs->bo->gpu & (0xffffffffull << 32)));
482 brts[i].shader = blend[i].shader.gpu;
483 brts[i].unk2 = 0x0;
484 } else {
485 enum pipe_format format = batch->key.cbufs[i]->format;
486 const struct util_format_description *format_desc;
487 format_desc = util_format_description(format);
488
489 brts[i].equation = blend[i].equation.equation;
490
491 /* TODO: this is a bit more complicated */
492 brts[i].constant = blend[i].equation.constant;
493
494 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
495
496 /* 0x19 disables blending and forces REPLACE
497 * mode (equivalent to rgb_mode = alpha_mode =
498 * x122, colour mask = 0xF). 0x1a allows
499 * blending. */
500 brts[i].unk2 = blend[i].opaque ? 0x19 : 0x1a;
501
502 brts[i].shader_type = fs->blend_types[i];
503 }
504 } else {
505 pan_pack(rts, MIDGARD_BLEND_OPAQUE, cfg) {
506 cfg.flags = flags;
507
508 if (blend[i].is_shader) {
509 cfg.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
510 } else {
511 cfg.equation = blend[i].equation.equation.opaque[0];
512 cfg.constant = blend[i].equation.constant;
513 }
514 }
515
516 rts += MALI_MIDGARD_BLEND_LENGTH;
517 }
518 }
519 }
520
521 static void
522 panfrost_emit_frag_shader(struct panfrost_context *ctx,
523 struct mali_state_packed *fragmeta,
524 struct panfrost_blend_final *blend)
525 {
526 const struct panfrost_device *dev = pan_device(ctx->base.screen);
527 struct panfrost_shader_state *fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
528 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
529 const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
530 unsigned rt_count = ctx->pipe_framebuffer.nr_cbufs;
531 bool alpha_to_coverage = ctx->blend->base.alpha_to_coverage;
532
533 /* Built up here */
534 struct mali_shader_packed shader = fs->shader;
535 struct mali_preload_packed preload = fs->preload;
536 uint32_t properties;
537 struct mali_multisample_misc_packed multisample_misc;
538 struct mali_stencil_mask_misc_packed stencil_mask_misc;
539 union midgard_blend sfbd_blend = { 0 };
540
541 if (!panfrost_fs_required(fs, blend, rt_count)) {
542 if (dev->quirks & IS_BIFROST) {
543 pan_pack(&shader, SHADER, cfg) {}
544
545 pan_pack(&properties, BIFROST_PROPERTIES, cfg) {
546 cfg.unknown = 0x950020; /* XXX */
547 cfg.early_z_enable = true;
548 }
549
550 preload.opaque[0] = 0;
551 } else {
552 pan_pack(&shader, SHADER, cfg) {
553 cfg.shader = 0x1;
554 }
555
556 pan_pack(&properties, MIDGARD_PROPERTIES, cfg) {
557 cfg.work_register_count = 1;
558 cfg.depth_source = MALI_DEPTH_SOURCE_FIXED_FUNCTION;
559 cfg.early_z_enable = true;
560 }
561 }
562 } else if (dev->quirks & IS_BIFROST) {
563 bool no_blend = true;
564
565 for (unsigned i = 0; i < rt_count; ++i)
566 no_blend &= (!blend[i].load_dest | blend[i].no_colour);
567
568 pan_pack(&properties, BIFROST_PROPERTIES, cfg) {
569 cfg.early_z_enable = !fs->can_discard && !fs->writes_depth && no_blend;
570 }
571
572 /* Combine with prepacked properties */
573 properties |= fs->properties.opaque[0];
574 } else {
575 /* Reasons to disable early-Z from a shader perspective */
576 bool late_z = fs->can_discard || fs->writes_global ||
577 fs->writes_depth || fs->writes_stencil;
578
579 /* If either depth or stencil is enabled, discard matters */
580 bool zs_enabled =
581 (zsa->base.depth.enabled && zsa->base.depth.func != PIPE_FUNC_ALWAYS) ||
582 zsa->base.stencil[0].enabled;
583
584 bool has_blend_shader = false;
585
586 for (unsigned c = 0; c < rt_count; ++c)
587 has_blend_shader |= blend[c].is_shader;
588
589 pan_pack(&properties, MIDGARD_PROPERTIES, cfg) {
590 /* TODO: Reduce this limit? */
591 if (has_blend_shader)
592 cfg.work_register_count = MAX2(fs->work_reg_count, 8);
593 else
594 cfg.work_register_count = fs->work_reg_count;
595
596 cfg.early_z_enable = !(late_z || alpha_to_coverage);
597 cfg.reads_tilebuffer = fs->outputs_read || (!zs_enabled && fs->can_discard);
598 cfg.reads_depth_stencil = zs_enabled && fs->can_discard;
599 }
600
601 properties |= fs->properties.opaque[0];
602 }
603
604 pan_pack(&multisample_misc, MULTISAMPLE_MISC, cfg) {
605 bool msaa = rast->multisample;
606 cfg.multisample_enable = msaa;
607 cfg.sample_mask = (msaa ? ctx->sample_mask : ~0) & 0xFFFF;
608
609 /* EXT_shader_framebuffer_fetch requires per-sample */
610 bool per_sample = ctx->min_samples > 1 || fs->outputs_read;
611 cfg.evaluate_per_sample = msaa && per_sample;
612
613 if (dev->quirks & MIDGARD_SFBD) {
614 cfg.sfbd_load_destination = blend[0].load_dest;
615 cfg.sfbd_blend_shader = blend[0].is_shader;
616 }
617
618 cfg.depth_function = zsa->base.depth.enabled ?
619 panfrost_translate_compare_func(zsa->base.depth.func) :
620 MALI_FUNC_ALWAYS;
621
622 cfg.depth_write_mask = zsa->base.depth.writemask;
623 cfg.near_discard = rast->depth_clip_near;
624 cfg.far_discard = rast->depth_clip_far;
625 cfg.unknown_2 = true;
626 }
627
628 pan_pack(&stencil_mask_misc, STENCIL_MASK_MISC, cfg) {
629 cfg.stencil_mask_front = zsa->stencil_mask_front;
630 cfg.stencil_mask_back = zsa->stencil_mask_back;
631 cfg.stencil_enable = zsa->base.stencil[0].enabled;
632 cfg.alpha_to_coverage = alpha_to_coverage;
633
634 if (dev->quirks & MIDGARD_SFBD) {
635 cfg.sfbd_write_enable = !blend[0].no_colour;
636 cfg.sfbd_srgb = util_format_is_srgb(ctx->pipe_framebuffer.cbufs[0]->format);
637 cfg.sfbd_dither_disable = !ctx->blend->base.dither;
638 }
639
640 cfg.unknown_1 = 0x7;
641 cfg.depth_range_1 = cfg.depth_range_2 = rast->offset_tri;
642 cfg.single_sampled_lines = !rast->multisample;
643 }
644
645 if (dev->quirks & MIDGARD_SFBD) {
646 if (blend[0].is_shader) {
647 sfbd_blend.shader = blend[0].shader.gpu |
648 blend[0].shader.first_tag;
649 } else {
650 sfbd_blend.equation = blend[0].equation.equation;
651 sfbd_blend.constant = blend[0].equation.constant;
652 }
653 } else if (!(dev->quirks & IS_BIFROST)) {
654 /* Bug where MRT-capable hw apparently reads the last blend
655 * shader from here instead of the usual location? */
656
657 for (signed rt = ((signed) rt_count - 1); rt >= 0; --rt) {
658 if (!blend[rt].is_shader)
659 continue;
660
661 sfbd_blend.shader = blend[rt].shader.gpu |
662 blend[rt].shader.first_tag;
663 break;
664 }
665 }
666
667 pan_pack(fragmeta, STATE_OPAQUE, cfg) {
668 cfg.shader = fs->shader;
669 cfg.properties = properties;
670 cfg.depth_units = rast->offset_units * 2.0f;
671 cfg.depth_factor = rast->offset_scale;
672 cfg.multisample_misc = multisample_misc;
673 cfg.stencil_mask_misc = stencil_mask_misc;
674
675 cfg.stencil_front = zsa->stencil_front;
676 cfg.stencil_back = zsa->stencil_back;
677
678 /* Bottom bits for stencil ref, exactly one word */
679 bool back_enab = zsa->base.stencil[1].enabled;
680 cfg.stencil_front.opaque[0] |= ctx->stencil_ref.ref_value[0];
681 cfg.stencil_back.opaque[0] |= ctx->stencil_ref.ref_value[back_enab ? 1 : 0];
682
683 if (dev->quirks & IS_BIFROST)
684 cfg.preload = preload;
685 else
686 memcpy(&cfg.sfbd_blend, &sfbd_blend, sizeof(sfbd_blend));
687 }
688 }
689
690 mali_ptr
691 panfrost_emit_compute_shader_meta(struct panfrost_batch *batch, enum pipe_shader_type stage)
692 {
693 struct panfrost_shader_state *ss = panfrost_get_shader_state(batch->ctx, stage);
694
695 panfrost_batch_add_bo(batch, ss->bo,
696 PAN_BO_ACCESS_PRIVATE |
697 PAN_BO_ACCESS_READ |
698 PAN_BO_ACCESS_VERTEX_TILER);
699
700 panfrost_batch_add_bo(batch, pan_resource(ss->upload.rsrc)->bo,
701 PAN_BO_ACCESS_PRIVATE |
702 PAN_BO_ACCESS_READ |
703 PAN_BO_ACCESS_VERTEX_TILER);
704
705 return pan_resource(ss->upload.rsrc)->bo->gpu + ss->upload.offset;
706 }
707
708 mali_ptr
709 panfrost_emit_frag_shader_meta(struct panfrost_batch *batch)
710 {
711 struct panfrost_context *ctx = batch->ctx;
712 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
713
714 /* Add the shader BO to the batch. */
715 panfrost_batch_add_bo(batch, ss->bo,
716 PAN_BO_ACCESS_PRIVATE |
717 PAN_BO_ACCESS_READ |
718 PAN_BO_ACCESS_FRAGMENT);
719
720 struct panfrost_device *dev = pan_device(ctx->base.screen);
721 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
722 void *rts = NULL;
723 struct panfrost_transfer xfer;
724 unsigned rt_size;
725
726 if (dev->quirks & MIDGARD_SFBD)
727 rt_size = 0;
728 else if (dev->quirks & IS_BIFROST)
729 rt_size = sizeof(struct bifrost_blend_rt);
730 else
731 rt_size = sizeof(struct midgard_blend_rt);
732
733 unsigned desc_size = MALI_STATE_LENGTH + rt_size * rt_count;
734
735 if (rt_size)
736 rts = rzalloc_size(ctx, rt_size * rt_count);
737
738 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
739
740 for (unsigned c = 0; c < ctx->pipe_framebuffer.nr_cbufs; ++c)
741 blend[c] = panfrost_get_blend_for_context(ctx, c);
742
743 if (!(dev->quirks & MIDGARD_SFBD))
744 panfrost_emit_blend(batch, rts, blend);
745 else
746 batch->draws |= PIPE_CLEAR_COLOR0;
747
748 xfer = panfrost_pool_alloc_aligned(&batch->pool, desc_size, MALI_STATE_LENGTH);
749
750 panfrost_emit_frag_shader(ctx, (struct mali_state_packed *) xfer.cpu, blend);
751
752 memcpy(xfer.cpu + MALI_STATE_LENGTH, rts, rt_size * rt_count);
753
754 if (rt_size)
755 ralloc_free(rts);
756
757 return xfer.gpu;
758 }
759
760 void
761 panfrost_emit_viewport(struct panfrost_batch *batch,
762 struct mali_vertex_tiler_postfix *tiler_postfix)
763 {
764 struct panfrost_context *ctx = batch->ctx;
765 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
766 const struct pipe_scissor_state *ss = &ctx->scissor;
767 const struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
768 const struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer;
769
770 /* Derive min/max from translate/scale. Note since |x| >= 0 by
771 * definition, we have that -|x| <= |x| hence translate - |scale| <=
772 * translate + |scale|, so the ordering is correct here. */
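/* For example (arbitrary numbers): translate[0] = 256 and scale[0] = +/-256
 * give vp_minx = 256 - 256 = 0 and vp_maxx = 256 + 256 = 512, regardless of
 * the sign of the scale (i.e. regardless of a flipped viewport). */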
773 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
774 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
775 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
776 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
777 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
778 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
779
780 /* Scissor to the intersection of the viewport and the scissor, clamped
781 * to the framebuffer */
782
783 unsigned minx = MIN2(fb->width, vp_minx);
784 unsigned maxx = MIN2(fb->width, vp_maxx);
785 unsigned miny = MIN2(fb->height, vp_miny);
786 unsigned maxy = MIN2(fb->height, vp_maxy);
787
788 if (ss && rast->scissor) {
789 minx = MAX2(ss->minx, minx);
790 miny = MAX2(ss->miny, miny);
791 maxx = MIN2(ss->maxx, maxx);
792 maxy = MIN2(ss->maxy, maxy);
793 }
794
795 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool, MALI_VIEWPORT_LENGTH);
796
797 pan_pack(T.cpu, VIEWPORT, cfg) {
798 cfg.scissor_minimum_x = minx;
799 cfg.scissor_minimum_y = miny;
800 cfg.scissor_maximum_x = maxx - 1;
801 cfg.scissor_maximum_y = maxy - 1;
802
803 cfg.minimum_z = rast->depth_clip_near ? minz : -INFINITY;
804 cfg.maximum_z = rast->depth_clip_far ? maxz : INFINITY;
805 }
806
807 tiler_postfix->viewport = T.gpu;
808 panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy);
809 }
810
811 static mali_ptr
812 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
813 enum pipe_shader_type st,
814 struct panfrost_constant_buffer *buf,
815 unsigned index)
816 {
817 struct pipe_constant_buffer *cb = &buf->cb[index];
818 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
819
820 if (rsrc) {
821 panfrost_batch_add_bo(batch, rsrc->bo,
822 PAN_BO_ACCESS_SHARED |
823 PAN_BO_ACCESS_READ |
824 panfrost_bo_access_for_stage(st));
825
826 /* Alignment guaranteed by
827 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
828 return rsrc->bo->gpu + cb->buffer_offset;
829 } else if (cb->user_buffer) {
830 return panfrost_pool_upload_aligned(&batch->pool,
831 cb->user_buffer +
832 cb->buffer_offset,
833 cb->buffer_size, 16);
834 } else {
835 unreachable("No constant buffer");
836 }
837 }
838
839 struct sysval_uniform {
840 union {
841 float f[4];
842 int32_t i[4];
843 uint32_t u[4];
844 uint64_t du[2];
845 };
846 };
847
848 static void
849 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
850 struct sysval_uniform *uniform)
851 {
852 struct panfrost_context *ctx = batch->ctx;
853 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
854
855 uniform->f[0] = vp->scale[0];
856 uniform->f[1] = vp->scale[1];
857 uniform->f[2] = vp->scale[2];
858 }
859
860 static void
861 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
862 struct sysval_uniform *uniform)
863 {
864 struct panfrost_context *ctx = batch->ctx;
865 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
866
867 uniform->f[0] = vp->translate[0];
868 uniform->f[1] = vp->translate[1];
869 uniform->f[2] = vp->translate[2];
870 }
871
872 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
873 enum pipe_shader_type st,
874 unsigned int sysvalid,
875 struct sysval_uniform *uniform)
876 {
877 struct panfrost_context *ctx = batch->ctx;
878 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
879 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
880 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
881 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
882
883 assert(dim);
884 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
885
886 if (dim > 1)
887 uniform->i[1] = u_minify(tex->texture->height0,
888 tex->u.tex.first_level);
889
890 if (dim > 2)
891 uniform->i[2] = u_minify(tex->texture->depth0,
892 tex->u.tex.first_level);
893
894 if (is_array)
895 uniform->i[dim] = tex->texture->array_size;
896 }
897
898 static void
899 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
900 enum pipe_shader_type st,
901 unsigned ssbo_id,
902 struct sysval_uniform *uniform)
903 {
904 struct panfrost_context *ctx = batch->ctx;
905
906 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
907 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
908
909 /* Compute address */
910 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
911
912 panfrost_batch_add_bo(batch, bo,
913 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
914 panfrost_bo_access_for_stage(st));
915
916 /* Upload address and size as sysval */
917 uniform->du[0] = bo->gpu + sb.buffer_offset;
918 uniform->u[2] = sb.buffer_size;
919 }
920
921 static void
922 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
923 enum pipe_shader_type st,
924 unsigned samp_idx,
925 struct sysval_uniform *uniform)
926 {
927 struct panfrost_context *ctx = batch->ctx;
928 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
929
930 uniform->f[0] = sampl->min_lod;
931 uniform->f[1] = sampl->max_lod;
932 uniform->f[2] = sampl->lod_bias;
933
934 /* Even without any errata, Midgard represents "no mipmapping" as
935 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
936 * panfrost_create_sampler_state which also explains our choice of
937 * epsilon value (again to keep behaviour consistent) */
938
939 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
940 uniform->f[1] = uniform->f[0] + (1.0/256.0);
941 }
942
943 static void
944 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
945 struct sysval_uniform *uniform)
946 {
947 struct panfrost_context *ctx = batch->ctx;
948
949 uniform->u[0] = ctx->compute_grid->grid[0];
950 uniform->u[1] = ctx->compute_grid->grid[1];
951 uniform->u[2] = ctx->compute_grid->grid[2];
952 }
953
954 static void
955 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
956 struct panfrost_shader_state *ss,
957 enum pipe_shader_type st)
958 {
959 struct sysval_uniform *uniforms = (void *)buf;
960
961 for (unsigned i = 0; i < ss->sysval_count; ++i) {
962 int sysval = ss->sysval[i];
963
964 switch (PAN_SYSVAL_TYPE(sysval)) {
965 case PAN_SYSVAL_VIEWPORT_SCALE:
966 panfrost_upload_viewport_scale_sysval(batch,
967 &uniforms[i]);
968 break;
969 case PAN_SYSVAL_VIEWPORT_OFFSET:
970 panfrost_upload_viewport_offset_sysval(batch,
971 &uniforms[i]);
972 break;
973 case PAN_SYSVAL_TEXTURE_SIZE:
974 panfrost_upload_txs_sysval(batch, st,
975 PAN_SYSVAL_ID(sysval),
976 &uniforms[i]);
977 break;
978 case PAN_SYSVAL_SSBO:
979 panfrost_upload_ssbo_sysval(batch, st,
980 PAN_SYSVAL_ID(sysval),
981 &uniforms[i]);
982 break;
983 case PAN_SYSVAL_NUM_WORK_GROUPS:
984 panfrost_upload_num_work_groups_sysval(batch,
985 &uniforms[i]);
986 break;
987 case PAN_SYSVAL_SAMPLER:
988 panfrost_upload_sampler_sysval(batch, st,
989 PAN_SYSVAL_ID(sysval),
990 &uniforms[i]);
991 break;
992 default:
993 assert(0);
994 }
995 }
996 }
997
998 static const void *
999 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1000 unsigned index)
1001 {
1002 struct pipe_constant_buffer *cb = &buf->cb[index];
1003 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1004
1005 if (rsrc)
1006 return rsrc->bo->cpu;
1007 else if (cb->user_buffer)
1008 return cb->user_buffer;
1009 else
1010 unreachable("No constant buffer");
1011 }
1012
1013 void
1014 panfrost_emit_const_buf(struct panfrost_batch *batch,
1015 enum pipe_shader_type stage,
1016 struct mali_vertex_tiler_postfix *postfix)
1017 {
1018 struct panfrost_context *ctx = batch->ctx;
1019 struct panfrost_shader_variants *all = ctx->shader[stage];
1020
1021 if (!all)
1022 return;
1023
1024 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1025
1026 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1027
1028 /* Uniforms are implicitly UBO #0 */
1029 bool has_uniforms = buf->enabled_mask & (1 << 0);
1030
1031 /* Allocate room for the sysval and the uniforms */
1032 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1033 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1034 size_t size = sys_size + uniform_size;
1035 struct panfrost_transfer transfer =
1036 panfrost_pool_alloc_aligned(&batch->pool, size, 16);
1037
1038 /* Upload sysvals requested by the shader */
1039 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1040
1041 /* Upload uniforms */
1042 if (has_uniforms && uniform_size) {
1043 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1044 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1045 }
1046
1047 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1048 * uploaded, so it's always included. The count is the highest UBO
1049 * addressable -- gaps are included. */
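/* For example, enabled_mask = 0b1001 (UBO 0 and UBO 3 bound) gives
 * ubo_count = 32 - clz(0b1001) = 4, so four records are emitted and the
 * unbound UBOs 1 and 2 become null records in the loop below. */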
1050
1051 unsigned ubo_count = 32 - __builtin_clz(buf->enabled_mask | 1);
1052
1053 size_t sz = MALI_UNIFORM_BUFFER_LENGTH * ubo_count;
1054 struct panfrost_transfer ubos =
1055 panfrost_pool_alloc_aligned(&batch->pool, sz,
1056 MALI_UNIFORM_BUFFER_LENGTH);
1057
1058 uint64_t *ubo_ptr = (uint64_t *) ubos.cpu;
1059
1060 /* Upload uniforms as a UBO */
1061
1062 if (size) {
1063 pan_pack(ubo_ptr, UNIFORM_BUFFER, cfg) {
1064 cfg.entries = DIV_ROUND_UP(size, 16);
1065 cfg.pointer = transfer.gpu;
1066 }
1067 } else {
1068 *ubo_ptr = 0;
1069 }
1070
1071 /* The rest are honest-to-goodness UBOs */
1072
1073 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1074 size_t usz = buf->cb[ubo].buffer_size;
1075 bool enabled = buf->enabled_mask & (1 << ubo);
1076 bool empty = usz == 0;
1077
1078 if (!enabled || empty) {
1079 ubo_ptr[ubo] = 0;
1080 continue;
1081 }
1082
1083 pan_pack(ubo_ptr + ubo, UNIFORM_BUFFER, cfg) {
1084 cfg.entries = DIV_ROUND_UP(usz, 16);
1085 cfg.pointer = panfrost_map_constant_buffer_gpu(batch,
1086 stage, buf, ubo);
1087 }
1088 }
1089
1090 postfix->uniforms = transfer.gpu;
1091 postfix->uniform_buffers = ubos.gpu;
1092
1093 buf->dirty_mask = 0;
1094 }
1095
1096 void
1097 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1098 const struct pipe_grid_info *info,
1099 struct midgard_payload_vertex_tiler *vtp)
1100 {
1101 struct panfrost_context *ctx = batch->ctx;
1102 struct panfrost_device *dev = pan_device(ctx->base.screen);
1103 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1104 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1105 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1106 128));
1107
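/* Sizing sketch (a hedged reading of the arithmetic below): round each grid
 * dimension up to a power of two and size the pool as if that many
 * workgroups ran concurrently on every core, each taking single_size bytes;
 * this matches the shared_workgroup_count/shared_shift fields written into
 * the descriptor. */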
1108 unsigned log2_instances =
1109 util_logbase2_ceil(info->grid[0]) +
1110 util_logbase2_ceil(info->grid[1]) +
1111 util_logbase2_ceil(info->grid[2]);
1112
1113 unsigned shared_size = single_size * (1 << log2_instances) * dev->core_count;
1114 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1115 shared_size,
1116 1);
1117
1118 struct mali_shared_memory shared = {
1119 .shared_memory = bo->gpu,
1120 .shared_workgroup_count = log2_instances,
1121 .shared_shift = util_logbase2(single_size) + 1
1122 };
1123
1124 vtp->postfix.shared_memory = panfrost_pool_upload_aligned(&batch->pool, &shared,
1125 sizeof(shared), 64);
1126 }
1127
1128 static mali_ptr
1129 panfrost_get_tex_desc(struct panfrost_batch *batch,
1130 enum pipe_shader_type st,
1131 struct panfrost_sampler_view *view)
1132 {
1133 if (!view)
1134 return (mali_ptr) 0;
1135
1136 struct pipe_sampler_view *pview = &view->base;
1137 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1138
1139 /* Add the BO to the job so it's retained until the job is done. */
1140
1141 panfrost_batch_add_bo(batch, rsrc->bo,
1142 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1143 panfrost_bo_access_for_stage(st));
1144
1145 panfrost_batch_add_bo(batch, view->bo,
1146 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1147 panfrost_bo_access_for_stage(st));
1148
1149 return view->bo->gpu;
1150 }
1151
1152 static void
1153 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
1154 struct pipe_context *pctx)
1155 {
1156 struct panfrost_resource *rsrc = pan_resource(view->base.texture);
1157 if (view->texture_bo != rsrc->bo->gpu ||
1158 view->modifier != rsrc->modifier) {
1159 panfrost_bo_unreference(view->bo);
1160 panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
1161 }
1162 }
1163
1164 void
1165 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1166 enum pipe_shader_type stage,
1167 struct mali_vertex_tiler_postfix *postfix)
1168 {
1169 struct panfrost_context *ctx = batch->ctx;
1170 struct panfrost_device *device = pan_device(ctx->base.screen);
1171
1172 if (!ctx->sampler_view_count[stage])
1173 return;
1174
1175 if (device->quirks & IS_BIFROST) {
1176 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1177 MALI_BIFROST_TEXTURE_LENGTH *
1178 ctx->sampler_view_count[stage],
1179 MALI_BIFROST_TEXTURE_LENGTH);
1180
1181 struct mali_bifrost_texture_packed *out =
1182 (struct mali_bifrost_texture_packed *) T.cpu;
1183
1184 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1185 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1186 struct pipe_sampler_view *pview = &view->base;
1187 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1188
1189 panfrost_update_sampler_view(view, &ctx->base);
1190 out[i] = view->bifrost_descriptor;
1191
1192 /* Add the BOs to the job so they are retained until the job is done. */
1193
1194 panfrost_batch_add_bo(batch, rsrc->bo,
1195 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1196 panfrost_bo_access_for_stage(stage));
1197
1198 panfrost_batch_add_bo(batch, view->bo,
1199 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1200 panfrost_bo_access_for_stage(stage));
1201 }
1202
1203 postfix->textures = T.gpu;
1204 } else {
1205 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1206
1207 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1208 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1209
1210 panfrost_update_sampler_view(view, &ctx->base);
1211
1212 trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1213 }
1214
1215 postfix->textures = panfrost_pool_upload_aligned(&batch->pool,
1216 trampolines,
1217 sizeof(uint64_t) *
1218 ctx->sampler_view_count[stage],
1219 sizeof(uint64_t));
1220 }
1221 }
1222
1223 void
1224 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1225 enum pipe_shader_type stage,
1226 struct mali_vertex_tiler_postfix *postfix)
1227 {
1228 struct panfrost_context *ctx = batch->ctx;
1229
1230 if (!ctx->sampler_count[stage])
1231 return;
1232
1233 size_t desc_size = MALI_BIFROST_SAMPLER_LENGTH;
1234 assert(MALI_BIFROST_SAMPLER_LENGTH == MALI_MIDGARD_SAMPLER_LENGTH);
1235
1236 size_t sz = desc_size * ctx->sampler_count[stage];
1237 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool, sz, desc_size);
1238 struct mali_midgard_sampler_packed *out = (struct mali_midgard_sampler_packed *) T.cpu;
1239
1240 for (unsigned i = 0; i < ctx->sampler_count[stage]; ++i)
1241 out[i] = ctx->samplers[stage][i]->hw;
1242
1243 postfix->sampler_descriptor = T.gpu;
1244 }
1245
1246 void
1247 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1248 struct mali_vertex_tiler_postfix *vertex_postfix)
1249 {
1250 struct panfrost_context *ctx = batch->ctx;
1251 struct panfrost_vertex_state *so = ctx->vertex;
1252 struct panfrost_shader_state *vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1253
1254 unsigned instance_shift = vertex_postfix->instance_shift;
1255 unsigned instance_odd = vertex_postfix->instance_odd;
1256
1257 /* Worst case: everything is NPOT, which is only possible if instancing
1258 * is enabled. Otherwise a single record is guaranteed */
1259 bool could_npot = instance_shift || instance_odd;
1260
1261 struct panfrost_transfer S = panfrost_pool_alloc_aligned(&batch->pool,
1262 MALI_ATTRIBUTE_BUFFER_LENGTH * vs->attribute_count *
1263 (could_npot ? 2 : 1),
1264 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
1265
1266 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1267 MALI_ATTRIBUTE_LENGTH * vs->attribute_count,
1268 MALI_ATTRIBUTE_LENGTH);
1269
1270 struct mali_attribute_buffer_packed *bufs =
1271 (struct mali_attribute_buffer_packed *) S.cpu;
1272
1273 struct mali_attribute_packed *out =
1274 (struct mali_attribute_packed *) T.cpu;
1275
1276 unsigned attrib_to_buffer[PIPE_MAX_ATTRIBS] = { 0 };
1277 unsigned k = 0;
1278
1279 for (unsigned i = 0; i < so->num_elements; ++i) {
1280 /* We map buffers 1:1 with the attributes, which
1281 * means duplicating some vertex buffers (who cares? aside from
1282 * maybe some caching implications but I somehow doubt that
1283 * matters) */
1284
1285 struct pipe_vertex_element *elem = &so->pipe[i];
1286 unsigned vbi = elem->vertex_buffer_index;
1287 attrib_to_buffer[i] = k;
1288
1289 if (!(ctx->vb_mask & (1 << vbi)))
1290 continue;
1291
1292 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1293 struct panfrost_resource *rsrc;
1294
1295 rsrc = pan_resource(buf->buffer.resource);
1296 if (!rsrc)
1297 continue;
1298
1299 /* Add a dependency of the batch on the vertex buffer */
1300 panfrost_batch_add_bo(batch, rsrc->bo,
1301 PAN_BO_ACCESS_SHARED |
1302 PAN_BO_ACCESS_READ |
1303 PAN_BO_ACCESS_VERTEX_TILER);
1304
1305 /* Mask off lower bits, see offset fixup below */
1306 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1307 mali_ptr addr = raw_addr & ~63;
1308
1309 /* Since we advanced the base pointer, grow the size by the
1310 * bytes we masked off, and shrink it by the buffer offset we skipped */
1311 unsigned size = rsrc->base.width0 + (raw_addr - addr)
1312 - buf->buffer_offset;
1313
1314 /* When there is a divisor, the hardware-level divisor is
1315 * the product of the instance divisor and the padded count */
1316 unsigned divisor = elem->instance_divisor;
1317 unsigned hw_divisor = ctx->padded_count * divisor;
1318 unsigned stride = buf->stride;
1319
1320 /* If there's a divisor (even 1) but no instancing, every vertex
1321 * should read the same attribute value, so zero the stride */
1322
1323 if (divisor && ctx->instance_count == 1)
1324 stride = 0;
1325
1326 if (!divisor || ctx->instance_count <= 1) {
1327 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1328 if (ctx->instance_count > 1)
1329 cfg.type = MALI_ATTRIBUTE_TYPE_1D_MODULUS;
1330
1331 cfg.pointer = addr;
1332 cfg.stride = stride;
1333 cfg.size = size;
1334 cfg.divisor_r = instance_shift;
1335 cfg.divisor_p = instance_odd;
1336 }
1337 } else if (util_is_power_of_two_or_zero(hw_divisor)) {
1338 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1339 cfg.type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
1340 cfg.pointer = addr;
1341 cfg.stride = stride;
1342 cfg.size = size;
1343 cfg.divisor_r = __builtin_ctz(hw_divisor);
1344 }
1345
1346 } else {
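/* NPOT divisor path: the hardware appears to divide by multiplying with a
 * precomputed fixed-point reciprocal ("magic" numerator) and shifting --
 * the classic division-by-invariant-constant trick -- with the raw divisor
 * still supplied in the continuation record below. */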
1347 unsigned shift = 0, extra_flags = 0;
1348
1349 unsigned magic_divisor =
1350 panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags);
1351
1352 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1353 cfg.type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
1354 cfg.pointer = addr;
1355 cfg.stride = stride;
1356 cfg.size = size;
1357
1358 cfg.divisor_r = shift;
1359 cfg.divisor_e = extra_flags;
1360 }
1361
1362 pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) {
1363 cfg.divisor_numerator = magic_divisor;
1364 cfg.divisor = divisor;
1365 }
1366
1367 ++k;
1368 }
1369
1370 ++k;
1371 }
1372
1373 /* Add special gl_VertexID/gl_InstanceID buffers */
1374
1375 if (unlikely(vs->attribute_count >= PAN_VERTEX_ID)) {
1376 panfrost_vertex_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1377
1378 pan_pack(out + PAN_VERTEX_ID, ATTRIBUTE, cfg) {
1379 cfg.buffer_index = k++;
1380 cfg.format = so->formats[PAN_VERTEX_ID];
1381 }
1382
1383 panfrost_instance_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1384
1385 pan_pack(out + PAN_INSTANCE_ID, ATTRIBUTE, cfg) {
1386 cfg.buffer_index = k++;
1387 cfg.format = so->formats[PAN_INSTANCE_ID];
1388 }
1389 }
1390
1391 /* Attribute addresses require 64-byte alignment, so let:
1392 *
1393 * base' = base & ~63 = base - (base & 63)
1394 * offset' = offset + (base & 63)
1395 *
1396 * Since base' + offset' = base + offset, these are equivalent
1397 * addressing modes and now base is 64 aligned.
1398 */
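/* Concrete example: with a 64-aligned BO base and buffer_offset = 70, the
 * buffer pointer is rounded down by 70 & 63 = 6 bytes, and those 6 bytes
 * are added back to each attribute's src_offset below, so the final
 * addresses resolve identically. */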
1399
1400 unsigned start = vertex_postfix->offset_start;
1401
1402 for (unsigned i = 0; i < so->num_elements; ++i) {
1403 unsigned vbi = so->pipe[i].vertex_buffer_index;
1404 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1405
1406 /* Adjust by the masked off bits of the offset. Make sure we
1407 * read src_offset from so->hw (which is not GPU visible)
1408 * rather than target (which is) due to caching effects */
1409
1410 unsigned src_offset = so->pipe[i].src_offset;
1411
1412 /* BOs aligned to 4k so guaranteed aligned to 64 */
1413 src_offset += (buf->buffer_offset & 63);
1414
1415 /* Also, somewhat obscurely, per-instance data needs to be
1416 * offset in response to a delayed start in an indexed draw */
1417
1418 if (so->pipe[i].instance_divisor && ctx->instance_count > 1 && start)
1419 src_offset -= buf->stride * start;
1420
1421 pan_pack(out + i, ATTRIBUTE, cfg) {
1422 cfg.buffer_index = attrib_to_buffer[i];
1423 cfg.format = so->formats[i];
1424 cfg.offset = src_offset;
1425 }
1426 }
1427
1428 vertex_postfix->attributes = S.gpu;
1429 vertex_postfix->attribute_meta = T.gpu;
1430 }
1431
1432 static mali_ptr
1433 panfrost_emit_varyings(struct panfrost_batch *batch,
1434 struct mali_attribute_buffer_packed *slot,
1435 unsigned stride, unsigned count)
1436 {
1437 unsigned size = stride * count;
1438 mali_ptr ptr = panfrost_pool_alloc_aligned(&batch->invisible_pool, size, 64).gpu;
1439
1440 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1441 cfg.stride = stride;
1442 cfg.size = size;
1443 cfg.pointer = ptr;
1444 }
1445
1446 return ptr;
1447 }
1448
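/* The streamout buffer pointer is aligned down to 64 bytes when the
 * attribute buffer record is emitted (see panfrost_emit_streamout), so this
 * returns the sub-64-byte remainder, which is later folded into each
 * captured varying's record offset (see pan_emit_vary_xfb). */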
1449 static unsigned
1450 panfrost_streamout_offset(unsigned stride, unsigned offset,
1451 struct pipe_stream_output_target *target)
1452 {
1453 return (target->buffer_offset + (offset * stride * 4)) & 63;
1454 }
1455
1456 static void
1457 panfrost_emit_streamout(struct panfrost_batch *batch,
1458 struct mali_attribute_buffer_packed *slot,
1459 unsigned stride_words, unsigned offset, unsigned count,
1460 struct pipe_stream_output_target *target)
1461 {
1462 unsigned stride = stride_words * 4;
1463 unsigned max_size = target->buffer_size;
1464 unsigned expected_size = stride * count;
1465
1466 /* Grab the BO and bind it to the batch */
1467 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1468
1469 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1470 * the perspective of the TILER and FRAGMENT.
1471 */
1472 panfrost_batch_add_bo(batch, bo,
1473 PAN_BO_ACCESS_SHARED |
1474 PAN_BO_ACCESS_RW |
1475 PAN_BO_ACCESS_VERTEX_TILER |
1476 PAN_BO_ACCESS_FRAGMENT);
1477
1478 /* We will have an offset applied to get alignment */
1479 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * stride);
1480
1481 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1482 cfg.pointer = (addr & ~63);
1483 cfg.stride = stride;
1484 cfg.size = MIN2(max_size, expected_size) + (addr & 63);
1485 }
1486 }
1487
1488 static bool
1489 has_point_coord(unsigned mask, gl_varying_slot loc)
1490 {
1491 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1492 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1493 else if (loc == VARYING_SLOT_PNTC)
1494 return (mask & (1 << 8));
1495 else
1496 return false;
1497 }
1498
1499 /* Helpers for manipulating stream out information so we can pack varyings
1500 * accordingly. Compute the src_offset for a given captured varying */
1501
1502 static struct pipe_stream_output *
1503 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1504 {
1505 for (unsigned i = 0; i < info->num_outputs; ++i) {
1506 if (info->output[i].register_index == loc)
1507 return &info->output[i];
1508 }
1509
1510 unreachable("Varying not captured");
1511 }
1512
1513 static unsigned
1514 pan_varying_size(enum mali_format fmt)
1515 {
1516 unsigned type = MALI_EXTRACT_TYPE(fmt);
1517 unsigned chan = MALI_EXTRACT_CHANNELS(fmt);
1518 unsigned bits = MALI_EXTRACT_BITS(fmt);
1519 unsigned bpc = 0;
1520
1521 if (bits == MALI_CHANNEL_FLOAT) {
1522 /* No doubles */
1523 bool fp16 = (type == MALI_FORMAT_SINT);
1524 assert(fp16 || (type == MALI_FORMAT_UNORM));
1525
1526 bpc = fp16 ? 2 : 4;
1527 } else {
1528 assert(type >= MALI_FORMAT_SNORM && type <= MALI_FORMAT_SINT);
1529
1530 /* See the enums */
1531 bits = 1 << bits;
1532 assert(bits >= 8);
1533 bpc = bits / 8;
1534 }
1535
1536 return bpc * chan;
1537 }
1538
1539 /* Indices for named (non-XFB) varyings that are present. These are packed
1540 * tightly so they correspond to a bitfield present (P) indexed by (1 <<
1541 * PAN_VARY_*). This has the nice property that you can lookup the buffer index
1542 * of a given special field given a shift S by:
1543 *
1544 * idx = popcount(P & ((1 << S) - 1))
1545 *
1546 * That is, count all of the special varyings that come earlier in the present
1547 * mask; that count is the index of this one. Likewise, the total number of special
1548 * buffers required is simply popcount(P)
1549 */
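/* Worked example: present = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION)
 * | (1 << PAN_VARY_PSIZ) = 0b111. The buffer index of PSIZ is
 * popcount(0b111 & 0b011) = 2, and XFB buffers would start at
 * popcount(0b111) = 3 (see pan_varying_index / pan_xfb_base below). */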
1550
1551 enum pan_special_varying {
1552 PAN_VARY_GENERAL = 0,
1553 PAN_VARY_POSITION = 1,
1554 PAN_VARY_PSIZ = 2,
1555 PAN_VARY_PNTCOORD = 3,
1556 PAN_VARY_FACE = 4,
1557 PAN_VARY_FRAGCOORD = 5,
1558
1559 /* Keep last */
1560 PAN_VARY_MAX,
1561 };
1562
1563 /* Given a varying, figure out which index it corresponds to */
1564
1565 static inline unsigned
1566 pan_varying_index(unsigned present, enum pan_special_varying v)
1567 {
1568 unsigned mask = (1 << v) - 1;
1569 return util_bitcount(present & mask);
1570 }
1571
1572 /* Get the base offset for XFB buffers, which by convention come after
1573 * everything else. Wrapper function for semantic reasons; by construction this
1574 * is just popcount. */
1575
1576 static inline unsigned
1577 pan_xfb_base(unsigned present)
1578 {
1579 return util_bitcount(present);
1580 }
1581
1582 /* Computes the present mask for varyings so we can start emitting varying records */
1583
1584 static inline unsigned
1585 pan_varying_present(
1586 struct panfrost_shader_state *vs,
1587 struct panfrost_shader_state *fs,
1588 unsigned quirks)
1589 {
1590 /* At the moment we always emit general and position buffers. Not
1591 * strictly necessary but usually harmless */
1592
1593 unsigned present = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION);
1594
1595 /* Enable special buffers by the shader info */
1596
1597 if (vs->writes_point_size)
1598 present |= (1 << PAN_VARY_PSIZ);
1599
1600 if (fs->reads_point_coord)
1601 present |= (1 << PAN_VARY_PNTCOORD);
1602
1603 if (fs->reads_face)
1604 present |= (1 << PAN_VARY_FACE);
1605
1606 if (fs->reads_frag_coord && !(quirks & IS_BIFROST))
1607 present |= (1 << PAN_VARY_FRAGCOORD);
1608
1609 /* Also, if we have a point sprite, we need a point coord buffer */
1610
1611 for (unsigned i = 0; i < fs->varying_count; i++) {
1612 gl_varying_slot loc = fs->varyings_loc[i];
1613
1614 if (has_point_coord(fs->point_sprite_mask, loc))
1615 present |= (1 << PAN_VARY_PNTCOORD);
1616 }
1617
1618 return present;
1619 }
1620
1621 /* Emitters for varying records */
1622
1623 static void
1624 pan_emit_vary(struct mali_attribute_packed *out,
1625 unsigned present, enum pan_special_varying buf,
1626 unsigned quirks, enum mali_format format,
1627 unsigned offset)
1628 {
1629 unsigned nr_channels = MALI_EXTRACT_CHANNELS(format);
1630 unsigned swizzle = quirks & HAS_SWIZZLES ?
1631 panfrost_get_default_swizzle(nr_channels) :
1632 panfrost_bifrost_swizzle(nr_channels);
1633
1634 pan_pack(out, ATTRIBUTE, cfg) {
1635 cfg.buffer_index = pan_varying_index(present, buf);
1636 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1637 cfg.format = (format << 12) | swizzle;
1638 cfg.offset = offset;
1639 }
1640 }
1641
1642 /* General varying that is unused */
1643
1644 static void
1645 pan_emit_vary_only(struct mali_attribute_packed *out,
1646 unsigned present, unsigned quirks)
1647 {
1648 pan_emit_vary(out, present, 0, quirks, MALI_VARYING_DISCARD, 0);
1649 }
1650
1651 /* Special records */
1652
1653 static const enum mali_format pan_varying_formats[PAN_VARY_MAX] = {
1654 [PAN_VARY_POSITION] = MALI_VARYING_POS,
1655 [PAN_VARY_PSIZ] = MALI_R16F,
1656 [PAN_VARY_PNTCOORD] = MALI_R16F,
1657 [PAN_VARY_FACE] = MALI_R32I,
1658 [PAN_VARY_FRAGCOORD] = MALI_RGBA32F
1659 };
1660
1661 static void
1662 pan_emit_vary_special(struct mali_attribute_packed *out,
1663 unsigned present, enum pan_special_varying buf,
1664 unsigned quirks)
1665 {
1666 assert(buf < PAN_VARY_MAX);
1667 pan_emit_vary(out, present, buf, quirks, pan_varying_formats[buf], 0);
1668 }
1669
1670 static enum mali_format
1671 pan_xfb_format(enum mali_format format, unsigned nr)
1672 {
1673 if (MALI_EXTRACT_BITS(format) == MALI_CHANNEL_FLOAT)
1674 return MALI_R32F | MALI_NR_CHANNELS(nr);
1675 else
1676 return MALI_EXTRACT_TYPE(format) | MALI_NR_CHANNELS(nr) | MALI_CHANNEL_32;
1677 }
1678
1679 /* Transform feedback records. Note struct pipe_stream_output is (if packed as
1680 * a bitfield) 32-bit, smaller than a 64-bit pointer, so may as well pass by
1681 * value. */
1682
1683 static void
1684 pan_emit_vary_xfb(struct mali_attribute_packed *out,
1685 unsigned present,
1686 unsigned max_xfb,
1687 unsigned *streamout_offsets,
1688 unsigned quirks,
1689 enum mali_format format,
1690 struct pipe_stream_output o)
1691 {
1692 unsigned swizzle = quirks & HAS_SWIZZLES ?
1693 panfrost_get_default_swizzle(o.num_components) :
1694 panfrost_bifrost_swizzle(o.num_components);
1695
1696 pan_pack(out, ATTRIBUTE, cfg) {
1697 /* XFB buffers come after everything else */
1698 cfg.buffer_index = pan_xfb_base(present) + o.output_buffer;
1699 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1700
1701 /* Override number of channels and precision to highp */
1702 cfg.format = (pan_xfb_format(format, o.num_components) << 12) | swizzle;
1703
1704 /* Apply given offsets together */
1705 cfg.offset = (o.dst_offset * 4) /* dwords */
1706 + streamout_offsets[o.output_buffer];
1707 }
1708 }
1709
1710 /* Determine if we should capture a varying for XFB. This requires actually
1711 * having a buffer for it. If we don't capture it, we'll fall back to a general
1712 * varying path (linked or unlinked, possibly discarding the write) */
1713
1714 static bool
1715 panfrost_xfb_captured(struct panfrost_shader_state *xfb,
1716 unsigned loc, unsigned max_xfb)
1717 {
1718 if (!(xfb->so_mask & (1ll << loc)))
1719 return false;
1720
1721 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1722 return o->output_buffer < max_xfb;
1723 }
1724
1725 static void
1726 pan_emit_general_varying(struct mali_attribute_packed *out,
1727 struct panfrost_shader_state *other,
1728 struct panfrost_shader_state *xfb,
1729 gl_varying_slot loc,
1730 enum mali_format format,
1731 unsigned present,
1732 unsigned quirks,
1733 unsigned *gen_offsets,
1734 enum mali_format *gen_formats,
1735 unsigned *gen_stride,
1736 unsigned idx,
1737 bool should_alloc)
1738 {
1739 /* Check if we're linked */
1740 signed other_idx = -1;
1741
1742 for (unsigned j = 0; j < other->varying_count; ++j) {
1743 if (other->varyings_loc[j] == loc) {
1744 other_idx = j;
1745 break;
1746 }
1747 }
1748
1749 if (other_idx < 0) {
1750 pan_emit_vary_only(out, present, quirks);
1751 return;
1752 }
1753
1754 unsigned offset = gen_offsets[other_idx];
1755
1756 if (should_alloc) {
1757 /* We're linked, so allocate space via a watermark allocation */
1758 enum mali_format alt = other->varyings[other_idx];
1759
1760 /* Do interpolation at minimum precision */
1761 unsigned size_main = pan_varying_size(format);
1762 unsigned size_alt = pan_varying_size(alt);
1763 unsigned size = MIN2(size_main, size_alt);
1764
1765 /* If a varying is marked for XFB but not actually captured, we
1766 * should match the format to the format that would otherwise
1767 * be used for XFB, since dEQP checks for invariance here. It's
1768 * unclear if this is required by the spec. */
1769
1770 if (xfb->so_mask & (1ull << loc)) {
1771 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1772 format = pan_xfb_format(format, o->num_components);
1773 size = pan_varying_size(format);
1774 } else if (size == size_alt) {
1775 format = alt;
1776 }
1777
1778 gen_offsets[idx] = *gen_stride;
1779 gen_formats[other_idx] = format;
1780 offset = *gen_stride;
1781 *gen_stride += size;
1782 }
1783
1784 pan_emit_vary(out, present, PAN_VARY_GENERAL, quirks, format, offset);
1785 }
1786
1787 /* Higher-level wrapper around all of the above, classifying a varying into one
1788 * of the cases above and emitting the corresponding record */
1789
1790 static void
1791 panfrost_emit_varying(
1792 struct mali_attribute_packed *out,
1793 struct panfrost_shader_state *stage,
1794 struct panfrost_shader_state *other,
1795 struct panfrost_shader_state *xfb,
1796 unsigned present,
1797 unsigned max_xfb,
1798 unsigned *streamout_offsets,
1799 unsigned quirks,
1800 unsigned *gen_offsets,
1801 enum mali_format *gen_formats,
1802 unsigned *gen_stride,
1803 unsigned idx,
1804 bool should_alloc,
1805 bool is_fragment)
1806 {
1807 gl_varying_slot loc = stage->varyings_loc[idx];
1808 enum mali_format format = stage->varyings[idx];
1809
1810 /* Override format to match linkage */
1811 if (!should_alloc && gen_formats[idx])
1812 format = gen_formats[idx];
1813
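/* Point sprite replacement and XFB capture take priority; the remaining
 * cases map to fixed-function specials, and everything else goes down the
 * general (possibly linked) varying path. */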
1814 if (has_point_coord(stage->point_sprite_mask, loc)) {
1815 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1816 } else if (panfrost_xfb_captured(xfb, loc, max_xfb)) {
1817 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1818 pan_emit_vary_xfb(out, present, max_xfb, streamout_offsets, quirks, format, *o);
1819 } else if (loc == VARYING_SLOT_POS) {
1820 if (is_fragment)
1821 pan_emit_vary_special(out, present, PAN_VARY_FRAGCOORD, quirks);
1822 else
1823 pan_emit_vary_special(out, present, PAN_VARY_POSITION, quirks);
1824 } else if (loc == VARYING_SLOT_PSIZ) {
1825 pan_emit_vary_special(out, present, PAN_VARY_PSIZ, quirks);
1826 } else if (loc == VARYING_SLOT_PNTC) {
1827 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1828 } else if (loc == VARYING_SLOT_FACE) {
1829 pan_emit_vary_special(out, present, PAN_VARY_FACE, quirks);
1830 } else {
1831 pan_emit_general_varying(out, other, xfb, loc, format, present,
1832 quirks, gen_offsets, gen_formats, gen_stride,
1833 idx, should_alloc);
1834 }
1835 }
1836
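/* Emit an attribute buffer record for a special input (point coord, front
 * facing, fragment coord). No backing memory is allocated; the record is
 * just tagged so the hardware supplies the value itself. */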
1837 static void
1838 pan_emit_special_input(struct mali_attribute_buffer_packed *out,
1839 unsigned present,
1840 enum pan_special_varying v,
1841 unsigned special)
1842 {
1843 if (present & (1 << v)) {
1844 unsigned idx = pan_varying_index(present, v);
1845
1846 pan_pack(out + idx, ATTRIBUTE_BUFFER, cfg) {
1847 cfg.special = special;
1848 cfg.type = 0;
1849 }
1850 }
1851 }
1852
1853 void
1854 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1855 unsigned vertex_count,
1856 struct mali_vertex_tiler_postfix *vertex_postfix,
1857 struct mali_vertex_tiler_postfix *tiler_postfix,
1858 union midgard_primitive_size *primitive_size)
1859 {
1860 /* Load the shaders */
1861 struct panfrost_context *ctx = batch->ctx;
1862 struct panfrost_device *dev = pan_device(ctx->base.screen);
1863 struct panfrost_shader_state *vs, *fs;
1864 size_t vs_size, fs_size;
1865
1866 /* Allocate the varying descriptor */
1867
1868 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1869 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1870 vs_size = MALI_ATTRIBUTE_LENGTH * vs->varying_count;
1871 fs_size = MALI_ATTRIBUTE_LENGTH * fs->varying_count;
1872
1873 struct panfrost_transfer trans = panfrost_pool_alloc_aligned(
1874 &batch->pool, vs_size + fs_size, MALI_ATTRIBUTE_LENGTH);
1875
1876 struct pipe_stream_output_info *so = &vs->stream_output;
1877 unsigned present = pan_varying_present(vs, fs, dev->quirks);
1878
1879 /* Decide which varyings are linked by us. That is the case for
1880 * general-purpose, non-captured varyings, which we link and allocate
1881 * offsets for below. Captured (XFB) varyings were effectively laid out
1882 * already, so their offsets come from the stream output information. */
1883
1884 unsigned gen_offsets[32];
1885 enum mali_format gen_formats[32];
1886 memset(gen_offsets, 0, sizeof(gen_offsets));
1887 memset(gen_formats, 0, sizeof(gen_formats));
1888
1889 unsigned gen_stride = 0;
1890 assert(vs->varying_count < ARRAY_SIZE(gen_offsets));
1891 assert(fs->varying_count < ARRAY_SIZE(gen_offsets));
1892
1893 unsigned streamout_offsets[32];
1894
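/* Precompute the running offset into each bound streamout target so the
 * capture records and the buffer records below agree on where writes land. */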
1895 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1896 streamout_offsets[i] = panfrost_streamout_offset(
1897 so->stride[i],
1898 ctx->streamout.offsets[i],
1899 ctx->streamout.targets[i]);
1900 }
1901
1902 struct mali_attribute_packed *ovs = (struct mali_attribute_packed *)trans.cpu;
1903 struct mali_attribute_packed *ofs = ovs + vs->varying_count;
1904
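/* Emit the vertex stage records first with should_alloc set, so general
 * varyings get offsets allocated; the fragment pass below reuses those
 * offsets so both stages agree on the layout. */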
1905 for (unsigned i = 0; i < vs->varying_count; i++) {
1906 panfrost_emit_varying(ovs + i, vs, fs, vs, present,
1907 ctx->streamout.num_targets, streamout_offsets,
1908 dev->quirks,
1909 gen_offsets, gen_formats, &gen_stride, i, true, false);
1910 }
1911
1912 for (unsigned i = 0; i < fs->varying_count; i++) {
1913 panfrost_emit_varying(ofs + i, fs, vs, vs, present,
1914 ctx->streamout.num_targets, streamout_offsets,
1915 dev->quirks,
1916 gen_offsets, gen_formats, &gen_stride, i, false, true);
1917 }
1918
1919 unsigned xfb_base = pan_xfb_base(present);
1920 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1921 MALI_ATTRIBUTE_BUFFER_LENGTH * (xfb_base + ctx->streamout.num_targets),
1922 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
1923 struct mali_attribute_buffer_packed *varyings =
1924 (struct mali_attribute_buffer_packed *) T.cpu;
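/* The attribute buffer array holds the present special/general buffers
 * first (xfb_base of them), followed by one record per streamout target. */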
1925
1926 /* Emit the stream out buffers */
1927
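/* XFB writes whole decomposed primitives, so the captured vertex count can
 * differ from the draw's raw vertex count (trailing vertices that don't
 * form a complete primitive are trimmed). */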
1928 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1929 ctx->vertex_count);
1930
1931 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1932 panfrost_emit_streamout(batch, &varyings[xfb_base + i],
1933 so->stride[i],
1934 ctx->streamout.offsets[i],
1935 out_count,
1936 ctx->streamout.targets[i]);
1937 }
1938
1939 panfrost_emit_varyings(batch,
1940 &varyings[pan_varying_index(present, PAN_VARY_GENERAL)],
1941 gen_stride, vertex_count);
1942
1943 /* fp32 vec4 gl_Position */
1944 tiler_postfix->position_varying = panfrost_emit_varyings(batch,
1945 &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
1946 sizeof(float) * 4, vertex_count);
1947
1948 if (present & (1 << PAN_VARY_PSIZ)) {
1949 primitive_size->pointer = panfrost_emit_varyings(batch,
1950 &varyings[pan_varying_index(present, PAN_VARY_PSIZ)],
1951 2, vertex_count);
1952 }
1953
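/* Special inputs need no backing allocation; just mark their records if the
 * corresponding varying is present. */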
1954 pan_emit_special_input(varyings, present, PAN_VARY_PNTCOORD, MALI_ATTRIBUTE_SPECIAL_POINT_COORD);
1955 pan_emit_special_input(varyings, present, PAN_VARY_FACE, MALI_ATTRIBUTE_SPECIAL_FRONT_FACING);
1956 pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD, MALI_ATTRIBUTE_SPECIAL_FRAG_COORD);
1957
1958 vertex_postfix->varyings = T.gpu;
1959 tiler_postfix->varyings = T.gpu;
1960
1961 vertex_postfix->varying_meta = trans.gpu;
1962 tiler_postfix->varying_meta = trans.gpu + vs_size;
1963 }
1964
1965 void
1966 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
1967 struct mali_vertex_tiler_prefix *vertex_prefix,
1968 struct mali_vertex_tiler_postfix *vertex_postfix,
1969 struct mali_vertex_tiler_prefix *tiler_prefix,
1970 struct mali_vertex_tiler_postfix *tiler_postfix,
1971 union midgard_primitive_size *primitive_size)
1972 {
1973 struct panfrost_context *ctx = batch->ctx;
1974 struct panfrost_device *device = pan_device(ctx->base.screen);
1975 bool wallpapering = ctx->wallpaper_batch && batch->scoreboard.tiler_dep;
1976 struct bifrost_payload_vertex bifrost_vertex = {0,};
1977 struct bifrost_payload_tiler bifrost_tiler = {0,};
1978 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
1979 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
1980 void *vp, *tp;
1981 size_t vp_size, tp_size;
1982
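/* Bifrost and Midgard use different payload layouts; copy the shared
 * prefix/postfix into the matching structure and remember which pointer and
 * size to hand to the job scoreboard. */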
1983 if (device->quirks & IS_BIFROST) {
1984 bifrost_vertex.prefix = *vertex_prefix;
1985 bifrost_vertex.postfix = *vertex_postfix;
1986 vp = &bifrost_vertex;
1987 vp_size = sizeof(bifrost_vertex);
1988
1989 bifrost_tiler.prefix = *tiler_prefix;
1990 bifrost_tiler.tiler.primitive_size = *primitive_size;
1991 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
1992 bifrost_tiler.postfix = *tiler_postfix;
1993 tp = &bifrost_tiler;
1994 tp_size = sizeof(bifrost_tiler);
1995 } else {
1996 midgard_vertex.prefix = *vertex_prefix;
1997 midgard_vertex.postfix = *vertex_postfix;
1998 vp = &midgard_vertex;
1999 vp_size = sizeof(midgard_vertex);
2000
2001 midgard_tiler.prefix = *tiler_prefix;
2002 midgard_tiler.postfix = *tiler_postfix;
2003 midgard_tiler.primitive_size = *primitive_size;
2004 tp = &midgard_tiler;
2005 tp_size = sizeof(midgard_tiler);
2006 }
2007
2008 if (wallpapering) {
2009 /* Inject in reverse order, with "predicted" job indices.
2010 * THIS IS A HACK XXX */
2011 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false,
2012 batch->scoreboard.job_index + 2, tp, tp_size, true);
2013 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2014 vp, vp_size, true);
2015 return;
2016 }
2017
2018 /* If rasterizer discard is enabled, only submit the vertex job */
2019
2020 unsigned vertex = panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2021 vp, vp_size, false);
2022
2023 if (ctx->rasterizer->base.rasterizer_discard)
2024 return;
2025
2026 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false, vertex, tp, tp_size,
2027 false);
2028 }
2029
2030 /* TODO: stop hardcoding this */
2031 mali_ptr
2032 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2033 {
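/* 96 16-bit values, i.e. 48 (x, y) pairs, uploaded verbatim below. They
 * appear to be fixed-point subpixel sample positions with (128, 128) at the
 * pixel centre, though the exact encoding isn't documented here. */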
2034 uint16_t locations[] = {
2035 128, 128,
2036 0, 256,
2037 0, 256,
2038 0, 256,
2039 0, 256,
2040 0, 256,
2041 0, 256,
2042 0, 256,
2043 0, 256,
2044 0, 256,
2045 0, 256,
2046 0, 256,
2047 0, 256,
2048 0, 256,
2049 0, 256,
2050 0, 256,
2051 0, 256,
2052 0, 256,
2053 0, 256,
2054 0, 256,
2055 0, 256,
2056 0, 256,
2057 0, 256,
2058 0, 256,
2059 0, 256,
2060 0, 256,
2061 0, 256,
2062 0, 256,
2063 0, 256,
2064 0, 256,
2065 0, 256,
2066 0, 256,
2067 128, 128,
2068 0, 0,
2069 0, 0,
2070 0, 0,
2071 0, 0,
2072 0, 0,
2073 0, 0,
2074 0, 0,
2075 0, 0,
2076 0, 0,
2077 0, 0,
2078 0, 0,
2079 0, 0,
2080 0, 0,
2081 0, 0,
2082 0, 0,
2083 };
2084
2085 return panfrost_pool_upload_aligned(&batch->pool, locations, 96 * sizeof(uint16_t), 64);
2086 }