panfrost: Use preuploaded shader descriptors
[mesa.git] / src / gallium / drivers / panfrost / pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_pool.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 struct mali_shared_memory shared = {
62 .shared_workgroup_count = ~0,
63 };
64
65 if (batch->stack_size) {
66 struct panfrost_bo *stack =
67 panfrost_batch_get_scratchpad(batch, batch->stack_size,
68 dev->thread_tls_alloc,
69 dev->core_count);
70
71 shared.stack_shift = panfrost_get_stack_shift(batch->stack_size);
72 shared.scratchpad = stack->gpu;
73 }
74
75 postfix->shared_memory = panfrost_pool_upload_aligned(&batch->pool, &shared, sizeof(shared), 64);
76 }
77
78 static void
79 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
80 struct mali_vertex_tiler_postfix *postfix)
81 {
82 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
83 postfix->shared_memory = panfrost_batch_reserve_framebuffer(batch);
84 }
85
86 static void
87 panfrost_vt_update_rasterizer(struct panfrost_rasterizer *rasterizer,
88 struct mali_vertex_tiler_prefix *prefix,
89 struct mali_vertex_tiler_postfix *postfix)
90 {
91 postfix->gl_enables |= 0x7;
92 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
93 rasterizer->base.front_ccw);
94 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
95 (rasterizer->base.cull_face & PIPE_FACE_FRONT));
96 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
97 (rasterizer->base.cull_face & PIPE_FACE_BACK));
98 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
99 rasterizer->base.flatshade_first);
100 }
101
102 void
103 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
104 struct mali_vertex_tiler_prefix *prefix,
105 union midgard_primitive_size *primitive_size)
106 {
107 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
108
109 if (!panfrost_writes_point_size(ctx)) {
110 float val = (prefix->draw_mode == MALI_DRAW_MODE_POINTS) ?
111 rasterizer->base.point_size :
112 rasterizer->base.line_width;
113
114 primitive_size->constant = val;
115 }
116 }
117
118 static void
119 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
120 struct mali_vertex_tiler_postfix *postfix)
121 {
122 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
123 if (ctx->occlusion_query) {
124 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
125 panfrost_batch_add_bo(ctx->batch, ctx->occlusion_query->bo,
126 PAN_BO_ACCESS_SHARED |
127 PAN_BO_ACCESS_RW |
128 PAN_BO_ACCESS_FRAGMENT);
129 } else {
130 postfix->occlusion_counter = 0;
131 }
132 }
133
134 void
135 panfrost_vt_init(struct panfrost_context *ctx,
136 enum pipe_shader_type stage,
137 struct mali_vertex_tiler_prefix *prefix,
138 struct mali_vertex_tiler_postfix *postfix)
139 {
140 struct panfrost_device *device = pan_device(ctx->base.screen);
141
142 if (!ctx->shader[stage])
143 return;
144
145 memset(prefix, 0, sizeof(*prefix));
146 memset(postfix, 0, sizeof(*postfix));
147
148 if (device->quirks & IS_BIFROST) {
149 postfix->gl_enables = 0x2;
150 panfrost_vt_emit_shared_memory(ctx, postfix);
151 } else {
152 postfix->gl_enables = 0x6;
153 panfrost_vt_attach_framebuffer(ctx, postfix);
154 }
155
156 if (stage == PIPE_SHADER_FRAGMENT) {
157 panfrost_vt_update_occlusion_query(ctx, postfix);
158 panfrost_vt_update_rasterizer(ctx->rasterizer, prefix, postfix);
159 }
160 }
161
162 static unsigned
163 panfrost_translate_index_size(unsigned size)
164 {
165 switch (size) {
166 case 1:
167 return MALI_DRAW_INDEXED_UINT8;
168
169 case 2:
170 return MALI_DRAW_INDEXED_UINT16;
171
172 case 4:
173 return MALI_DRAW_INDEXED_UINT32;
174
175 default:
176 unreachable("Invalid index size");
177 }
178 }
179
180 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
181 * good for the duration of the draw (transient); it could last longer. Also get
182 * the bounds on the index buffer for the range accessed by the draw. We do
183 * these operations together because there are natural optimizations which
184 * require them to be together. */
185
186 static mali_ptr
187 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
188 const struct pipe_draw_info *info,
189 unsigned *min_index, unsigned *max_index)
190 {
191 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
192 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
193 off_t offset = info->start * info->index_size;
194 bool needs_indices = true;
195 mali_ptr out = 0;
196
197 if (info->max_index != ~0u) {
198 *min_index = info->min_index;
199 *max_index = info->max_index;
200 needs_indices = false;
201 }
202
203 if (!info->has_user_indices) {
204 /* Only resources can be directly mapped */
205 panfrost_batch_add_bo(batch, rsrc->bo,
206 PAN_BO_ACCESS_SHARED |
207 PAN_BO_ACCESS_READ |
208 PAN_BO_ACCESS_VERTEX_TILER);
209 out = rsrc->bo->gpu + offset;
210
211 /* Check the cache */
212 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
213 info->start,
214 info->count,
215 min_index,
216 max_index);
217 } else {
218 /* Otherwise, we need to upload to transient memory */
219 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
220 struct panfrost_transfer T =
221 panfrost_pool_alloc_aligned(&batch->pool,
222 info->count * info->index_size,
223 info->index_size);
224
225 memcpy(T.cpu, ibuf8 + offset, info->count * info->index_size);
226 out = T.gpu;
227 }
228
229 if (needs_indices) {
230 /* Fallback */
231 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
232
233 if (!info->has_user_indices)
234 panfrost_minmax_cache_add(rsrc->index_cache,
235 info->start, info->count,
236 *min_index, *max_index);
237 }
238
239 return out;
240 }
241
242 void
243 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
244 const struct pipe_draw_info *info,
245 enum mali_draw_mode draw_mode,
246 struct mali_vertex_tiler_postfix *vertex_postfix,
247 struct mali_vertex_tiler_prefix *tiler_prefix,
248 struct mali_vertex_tiler_postfix *tiler_postfix,
249 unsigned *vertex_count,
250 unsigned *padded_count)
251 {
252 tiler_prefix->draw_mode = draw_mode;
253
254 unsigned draw_flags = 0;
255
256 if (panfrost_writes_point_size(ctx))
257 draw_flags |= MALI_DRAW_VARYING_SIZE;
258
259 if (info->primitive_restart)
260 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
261
262 /* These don't make much sense */
263
264 draw_flags |= 0x3000;
265
266 if (info->index_size) {
267 unsigned min_index = 0, max_index = 0;
268
269 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
270 info,
271 &min_index,
272 &max_index);
273
274 /* Use the corresponding values */
275 *vertex_count = max_index - min_index + 1;
276 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
277 tiler_prefix->offset_bias_correction = -min_index;
278 tiler_prefix->index_count = MALI_POSITIVE(info->count);
279 draw_flags |= panfrost_translate_index_size(info->index_size);
280 } else {
281 tiler_prefix->indices = 0;
282 *vertex_count = ctx->vertex_count;
283 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
284 tiler_prefix->offset_bias_correction = 0;
285 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
286 }
287
288 tiler_prefix->unknown_draw = draw_flags;
289
290 /* Encode the padded vertex count */
291
292 if (info->instance_count > 1) {
293 *padded_count = panfrost_padded_vertex_count(*vertex_count);
294
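/* The padded count is of the form (2 * k + 1) << shift, and the hardware
 * wants it split back into the power-of-two part (instance_shift) and the
 * odd part (instance_odd = k), which the ctz and shift below recover. */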
295 unsigned shift = __builtin_ctz(ctx->padded_count);
296 unsigned k = ctx->padded_count >> (shift + 1);
297
298 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
299 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
300 } else {
301 *padded_count = *vertex_count;
302
303 /* Reset instancing state */
304 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
305 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
306 }
307 }
308
309 static unsigned
310 translate_tex_wrap(enum pipe_tex_wrap w)
311 {
312 switch (w) {
313 case PIPE_TEX_WRAP_REPEAT: return MALI_WRAP_MODE_REPEAT;
314 case PIPE_TEX_WRAP_CLAMP: return MALI_WRAP_MODE_CLAMP;
315 case PIPE_TEX_WRAP_CLAMP_TO_EDGE: return MALI_WRAP_MODE_CLAMP_TO_EDGE;
316 case PIPE_TEX_WRAP_CLAMP_TO_BORDER: return MALI_WRAP_MODE_CLAMP_TO_BORDER;
317 case PIPE_TEX_WRAP_MIRROR_REPEAT: return MALI_WRAP_MODE_MIRRORED_REPEAT;
318 case PIPE_TEX_WRAP_MIRROR_CLAMP: return MALI_WRAP_MODE_MIRRORED_CLAMP;
319 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE;
320 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER;
321 default: unreachable("Invalid wrap");
322 }
323 }
324
325 /* The hardware compares in the wrong order, so we have to flip before
326 * encoding. Yes, really. */
327
328 static enum mali_func
329 panfrost_sampler_compare_func(const struct pipe_sampler_state *cso)
330 {
331 if (!cso->compare_mode)
332 return MALI_FUNC_NEVER;
333
334 enum mali_func f = panfrost_translate_compare_func(cso->compare_func);
335 return panfrost_flip_compare_func(f);
336 }
337
338 static enum mali_mipmap_mode
339 pan_pipe_to_mipmode(enum pipe_tex_mipfilter f)
340 {
341 switch (f) {
342 case PIPE_TEX_MIPFILTER_NEAREST: return MALI_MIPMAP_MODE_NEAREST;
343 case PIPE_TEX_MIPFILTER_LINEAR: return MALI_MIPMAP_MODE_TRILINEAR;
344 case PIPE_TEX_MIPFILTER_NONE: return MALI_MIPMAP_MODE_NONE;
345 default: unreachable("Invalid");
346 }
347 }
348
349 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
350 struct mali_midgard_sampler_packed *hw)
351 {
352 pan_pack(hw, MIDGARD_SAMPLER, cfg) {
353 cfg.magnify_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
354 cfg.minify_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
355 cfg.mipmap_mode = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR) ?
356 MALI_MIPMAP_MODE_TRILINEAR : MALI_MIPMAP_MODE_NEAREST;
357 cfg.normalized_coordinates = cso->normalized_coords;
358
359 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
360
361 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
362
363 /* If necessary, we disable mipmapping in the sampler descriptor by
364 * clamping the LOD as tight as possible (from 0 to epsilon,
365 * essentially -- remember these are fixed point numbers, so
366 * epsilon=1/256) */
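/* e.g. with min_lod = 0 and mipmapping off, minimum_lod packs to 0 and
 * maximum_lod to 1, clamping the sampled LOD to the range [0, 1/256]. */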
367
368 cfg.maximum_lod = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) ?
369 cfg.minimum_lod + 1 :
370 FIXED_16(cso->max_lod, false);
371
372 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
373 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
374 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
375
376 cfg.compare_function = panfrost_sampler_compare_func(cso);
377 cfg.seamless_cube_map = cso->seamless_cube_map;
378
379 cfg.border_color_r = cso->border_color.f[0];
380 cfg.border_color_g = cso->border_color.f[1];
381 cfg.border_color_b = cso->border_color.f[2];
382 cfg.border_color_a = cso->border_color.f[3];
383 }
384 }
385
386 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
387 struct mali_bifrost_sampler_packed *hw)
388 {
389 pan_pack(hw, BIFROST_SAMPLER, cfg) {
390 cfg.magnify_linear = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR;
391 cfg.minify_linear = cso->min_img_filter == PIPE_TEX_FILTER_LINEAR;
392 cfg.mipmap_mode = pan_pipe_to_mipmode(cso->min_mip_filter);
393 cfg.normalized_coordinates = cso->normalized_coords;
394
395 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
396 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
397 cfg.maximum_lod = FIXED_16(cso->max_lod, false);
398
399 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
400 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
401 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
402
403 cfg.compare_function = panfrost_sampler_compare_func(cso);
404 cfg.seamless_cube_map = cso->seamless_cube_map;
405 }
406 }
407
408 static bool
409 panfrost_fs_required(
410 struct panfrost_shader_state *fs,
411 struct panfrost_blend_final *blend,
412 unsigned rt_count)
413 {
414 /* If we generally have side effects */
415 if (fs->fs_sidefx)
416 return true;
417
418 /* If colour is written we need to execute */
419 for (unsigned i = 0; i < rt_count; ++i) {
420 if (!blend[i].no_colour)
421 return true;
422 }
423
424 /* If depth is written and not implied we need to execute.
425 * TODO: Predicate on Z/S writes being enabled */
426 return (fs->writes_depth || fs->writes_stencil);
427 }
428
429 static void
430 panfrost_emit_blend(struct panfrost_batch *batch, void *rts,
431 struct panfrost_blend_final *blend)
432 {
433 const struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
434 struct panfrost_shader_state *fs = panfrost_get_shader_state(batch->ctx, PIPE_SHADER_FRAGMENT);
435 unsigned rt_count = batch->key.nr_cbufs;
436
437 struct bifrost_blend_rt *brts = rts;
438 struct midgard_blend_rt *mrts = rts;
439
440 /* Disable blending for depth-only on Bifrost */
441
442 if (rt_count == 0 && dev->quirks & IS_BIFROST)
443 brts[0].unk2 = 0x3;
444
445 for (unsigned i = 0; i < rt_count; ++i) {
446 unsigned flags = 0;
447
448 pan_pack(&flags, BLEND_FLAGS, cfg) {
449 if (blend[i].no_colour) {
450 cfg.enable = false;
451 break;
452 }
453
454 batch->draws |= (PIPE_CLEAR_COLOR0 << i);
455
456 cfg.srgb = util_format_is_srgb(batch->key.cbufs[i]->format);
457 cfg.load_destination = blend[i].load_dest;
458 cfg.dither_disable = !batch->ctx->blend->base.dither;
459
460 if (!(dev->quirks & IS_BIFROST))
461 cfg.midgard_blend_shader = blend[i].is_shader;
462 }
463
464 if (dev->quirks & IS_BIFROST) {
465 brts[i].flags = flags;
466
467 if (blend[i].is_shader) {
468 /* The blend shader's address needs to be at
469 * the same top 32 bit as the fragment shader.
470 * TODO: Ensure that's always the case.
471 */
472 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
473 (fs->bo->gpu & (0xffffffffull << 32)));
474 brts[i].shader = blend[i].shader.gpu;
475 brts[i].unk2 = 0x0;
476 } else {
477 enum pipe_format format = batch->key.cbufs[i]->format;
478 const struct util_format_description *format_desc;
479 format_desc = util_format_description(format);
480
481 brts[i].equation = blend[i].equation.equation;
482
483 /* TODO: this is a bit more complicated */
484 brts[i].constant = blend[i].equation.constant;
485
486 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
487
488 /* 0x19 disables blending and forces REPLACE
489 * mode (equivalent to rgb_mode = alpha_mode =
490 * x122, colour mask = 0xF). 0x1a allows
491 * blending. */
492 brts[i].unk2 = blend[i].opaque ? 0x19 : 0x1a;
493
494 brts[i].shader_type = fs->blend_types[i];
495 }
496 } else {
497 memcpy(&mrts[i].flags, &flags, sizeof(flags));
498
499 if (blend[i].is_shader) {
500 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
501 } else {
502 mrts[i].blend.equation = blend[i].equation.equation;
503 mrts[i].blend.constant = blend[i].equation.constant;
504 }
505 }
506 }
507 }
508
509 static void
510 panfrost_emit_frag_shader(struct panfrost_context *ctx,
511 struct mali_shader_meta *fragmeta,
512 struct panfrost_blend_final *blend)
513 {
514 const struct panfrost_device *dev = pan_device(ctx->base.screen);
515 struct panfrost_shader_state *fs;
516
517 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
518
519 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
520 const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
521 unsigned rt_count = ctx->pipe_framebuffer.nr_cbufs;
522
523 memset(fragmeta, 0, sizeof(*fragmeta));
524 memcpy(&fragmeta->shader, &fs->shader, sizeof(fs->shader));
525
526 if (!panfrost_fs_required(fs, blend, rt_count)) {
527 struct mali_shader_packed shader = { 0 };
528 struct mali_midgard_properties_packed prop;
529
530 if (dev->quirks & IS_BIFROST) {
531 struct mali_preload_packed preload = { 0 };
532 memcpy(&fragmeta->bifrost_preload, &preload, sizeof(preload));
533
534 pan_pack(&prop, BIFROST_PROPERTIES, cfg) {
535 cfg.unknown = 0x950020; /* XXX */
536 cfg.early_z_enable = true;
537 }
538 } else {
539 pan_pack(&shader, SHADER, cfg) {
540 cfg.shader = 0x1;
541 }
542
543 pan_pack(&prop, MIDGARD_PROPERTIES, cfg) {
544 cfg.work_register_count = 1;
545 cfg.depth_source = MALI_DEPTH_SOURCE_FIXED_FUNCTION;
546 cfg.early_z_enable = true;
547 }
548 }
549
550 memcpy(&fragmeta->shader, &shader, sizeof(shader));
551 memcpy(&fragmeta->midgard_props, &prop, sizeof(prop));
552 } else if (dev->quirks & IS_BIFROST) {
553 struct mali_bifrost_properties_packed prop;
554
555 bool no_blend = true;
556
557 for (unsigned i = 0; i < rt_count; ++i)
558 no_blend &= (!blend[i].load_dest | blend[i].no_colour);
559
560 pan_pack(&prop, BIFROST_PROPERTIES, cfg) {
561 cfg.early_z_enable = !fs->can_discard && !fs->writes_depth && no_blend;
562 }
563
564 /* Combine with prepacked properties */
565 prop.opaque[0] |= fs->properties.opaque[0];
566
567 memcpy(&fragmeta->bifrost_props, &prop, sizeof(prop));
568 memcpy(&fragmeta->bifrost_preload, &fs->preload, sizeof(fs->preload));
569 } else {
570 struct mali_midgard_properties_packed prop;
571
572 /* Reasons to disable early-Z from a shader perspective */
573 bool late_z = fs->can_discard || fs->writes_global ||
574 fs->writes_depth || fs->writes_stencil;
575
576 /* Reasons to disable early-Z from a CSO perspective */
577 bool alpha_to_coverage = ctx->blend->base.alpha_to_coverage;
578
579 /* If either depth or stencil is enabled, discard matters */
580 bool zs_enabled =
581 (zsa->base.depth.enabled && zsa->base.depth.func != PIPE_FUNC_ALWAYS) ||
582 zsa->base.stencil[0].enabled;
583
584 bool has_blend_shader = false;
585
586 for (unsigned c = 0; c < rt_count; ++c)
587 has_blend_shader |= blend[c].is_shader;
588
589 pan_pack(&prop, MIDGARD_PROPERTIES, cfg) {
590 /* TODO: Reduce this limit? */
591 if (has_blend_shader)
592 cfg.work_register_count = MAX2(fs->work_reg_count, 8);
593 else
594 cfg.work_register_count = fs->work_reg_count;
595
596 cfg.early_z_enable = !(late_z || alpha_to_coverage);
597 cfg.reads_tilebuffer = fs->outputs_read || (!zs_enabled && fs->can_discard);
598 cfg.reads_depth_stencil = zs_enabled && fs->can_discard;
599 }
600
601 /* Combine with prepacked properties */
602 prop.opaque[0] |= fs->properties.opaque[0];
603 memcpy(&fragmeta->midgard_props, &prop, sizeof(prop));
604 }
605
606 bool msaa = rast->multisample;
607 fragmeta->coverage_mask = msaa ? ctx->sample_mask : ~0;
608
609 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x10;
610 fragmeta->unknown2_4 = 0x4e0;
611
612 /* TODO: Sample size */
613 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
614 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
615
616 /* EXT_shader_framebuffer_fetch requires the shader to be run
617 * per-sample when outputs are read. */
618 bool per_sample = ctx->min_samples > 1 || fs->outputs_read;
619 SET_BIT(fragmeta->unknown2_3, MALI_PER_SAMPLE, msaa && per_sample);
620
621 fragmeta->depth_units = rast->offset_units * 2.0f;
622 fragmeta->depth_factor = rast->offset_scale;
623
624 /* XXX: Which bit is which? Does this maybe allow offsetting non-tris? */
625
626 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, rast->offset_tri);
627 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, rast->offset_tri);
628
629 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_NEAR, rast->depth_clip_near);
630 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_FAR, rast->depth_clip_far);
631
632 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
633 zsa->base.stencil[0].enabled);
634
635 fragmeta->stencil_mask_front = zsa->stencil_mask_front;
636 fragmeta->stencil_mask_back = zsa->stencil_mask_back;
637
638 /* Bottom bits for stencil ref, exactly one word */
639 fragmeta->stencil_front.opaque[0] = zsa->stencil_front.opaque[0] | ctx->stencil_ref.ref_value[0];
640
641 /* If back-stencil is not enabled, use the front values */
642
643 if (zsa->base.stencil[1].enabled)
644 fragmeta->stencil_back.opaque[0] = zsa->stencil_back.opaque[0] | ctx->stencil_ref.ref_value[1];
645 else
646 fragmeta->stencil_back = fragmeta->stencil_front;
647
648 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
649 zsa->base.depth.writemask);
650
651 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
652 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(
653 zsa->base.depth.enabled ? zsa->base.depth.func : PIPE_FUNC_ALWAYS));
654
655 SET_BIT(fragmeta->unknown2_4, MALI_ALPHA_TO_COVERAGE,
656 ctx->blend->base.alpha_to_coverage);
657
658 if (dev->quirks & MIDGARD_SFBD) {
659 /* On platforms with only a single render target, the blend
660 * information is inside the shader meta itself. We additionally
661 * need to signal CAN_DISCARD for nontrivial blend modes (so
662 * we're able to read back the destination buffer) */
663
664 if (blend[0].no_colour)
665 return;
666
667 fragmeta->unknown2_4 |= MALI_SFBD_ENABLE;
668
669 SET_BIT(fragmeta->unknown2_4, MALI_SFBD_SRGB,
670 util_format_is_srgb(ctx->pipe_framebuffer.cbufs[0]->format));
671
672 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
673 blend[0].is_shader);
674
675 if (blend[0].is_shader) {
676 fragmeta->blend.shader = blend[0].shader.gpu |
677 blend[0].shader.first_tag;
678 } else {
679 fragmeta->blend.equation = blend[0].equation.equation;
680 fragmeta->blend.constant = blend[0].equation.constant;
681 }
682
683 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
684 blend[0].load_dest);
685
686 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER, !ctx->blend->base.dither);
687 } else if (!(dev->quirks & IS_BIFROST)) {
688 /* Bug where MRT-capable hw apparently reads the last blend
689 * shader from here instead of the usual location? */
690
691 for (signed rt = ((signed) rt_count - 1); rt >= 0; --rt) {
692 if (!blend[rt].is_shader)
693 continue;
694
695 fragmeta->blend.shader = blend[rt].shader.gpu |
696 blend[rt].shader.first_tag;
697 break;
698 }
699 }
700 }
701
702 mali_ptr
703 panfrost_emit_compute_shader_meta(struct panfrost_batch *batch, enum pipe_shader_type stage)
704 {
705 struct panfrost_shader_state *ss = panfrost_get_shader_state(batch->ctx, stage);
706
707 panfrost_batch_add_bo(batch, ss->bo,
708 PAN_BO_ACCESS_PRIVATE |
709 PAN_BO_ACCESS_READ |
710 PAN_BO_ACCESS_VERTEX_TILER);
711
712 panfrost_batch_add_bo(batch, pan_resource(ss->upload.rsrc)->bo,
713 PAN_BO_ACCESS_PRIVATE |
714 PAN_BO_ACCESS_READ |
715 PAN_BO_ACCESS_VERTEX_TILER);
716
717 return pan_resource(ss->upload.rsrc)->bo->gpu + ss->upload.offset;
718 }
719
720 mali_ptr
721 panfrost_emit_frag_shader_meta(struct panfrost_batch *batch)
722 {
723 struct panfrost_context *ctx = batch->ctx;
724 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
725 struct mali_shader_meta meta;
726
727 /* Add the shader BO to the batch. */
728 panfrost_batch_add_bo(batch, ss->bo,
729 PAN_BO_ACCESS_PRIVATE |
730 PAN_BO_ACCESS_READ |
731 PAN_BO_ACCESS_FRAGMENT);
732
733 struct panfrost_device *dev = pan_device(ctx->base.screen);
734 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
735 size_t desc_size = sizeof(meta);
736 void *rts = NULL;
737 struct panfrost_transfer xfer;
738 unsigned rt_size;
739
740 if (dev->quirks & MIDGARD_SFBD)
741 rt_size = 0;
742 else if (dev->quirks & IS_BIFROST)
743 rt_size = sizeof(struct bifrost_blend_rt);
744 else
745 rt_size = sizeof(struct midgard_blend_rt);
746
747 desc_size += rt_size * rt_count;
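/* The per-render-target blend descriptors follow the shader descriptor
 * directly, so both are sized (and later uploaded) as one contiguous
 * allocation. */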
748
749 if (rt_size)
750 rts = rzalloc_size(ctx, rt_size * rt_count);
751
752 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
753
754 for (unsigned c = 0; c < ctx->pipe_framebuffer.nr_cbufs; ++c)
755 blend[c] = panfrost_get_blend_for_context(ctx, c);
756
757 panfrost_emit_frag_shader(ctx, &meta, blend);
758
759 if (!(dev->quirks & MIDGARD_SFBD))
760 panfrost_emit_blend(batch, rts, blend);
761 else
762 batch->draws |= PIPE_CLEAR_COLOR0;
763
764 xfer = panfrost_pool_alloc_aligned(&batch->pool, desc_size, sizeof(meta));
765
766 memcpy(xfer.cpu, &meta, sizeof(meta));
767 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
768
769 if (rt_size)
770 ralloc_free(rts);
771
772 return xfer.gpu;
773 }
774
775 void
776 panfrost_emit_viewport(struct panfrost_batch *batch,
777 struct mali_vertex_tiler_postfix *tiler_postfix)
778 {
779 struct panfrost_context *ctx = batch->ctx;
780 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
781 const struct pipe_scissor_state *ss = &ctx->scissor;
782 const struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
783 const struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer;
784
785 /* Derive min/max from translate/scale. Note since |x| >= 0 by
786 * definition, we have that -|x| <= |x| hence translate - |scale| <=
787 * translate + |scale|, so the ordering is correct here. */
788 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
789 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
790 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
791 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
792 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
793 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
794
795 /* Scissor to the intersection of viewport and to the scissor, clamped
796 * to the framebuffer */
797
798 unsigned minx = MIN2(fb->width, vp_minx);
799 unsigned maxx = MIN2(fb->width, vp_maxx);
800 unsigned miny = MIN2(fb->height, vp_miny);
801 unsigned maxy = MIN2(fb->height, vp_maxy);
802
803 if (ss && rast->scissor) {
804 minx = MAX2(ss->minx, minx);
805 miny = MAX2(ss->miny, miny);
806 maxx = MIN2(ss->maxx, maxx);
807 maxy = MIN2(ss->maxy, maxy);
808 }
809
810 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool, MALI_VIEWPORT_LENGTH);
811
812 pan_pack(T.cpu, VIEWPORT, cfg) {
813 cfg.scissor_minimum_x = minx;
814 cfg.scissor_minimum_y = miny;
815 cfg.scissor_maximum_x = maxx - 1;
816 cfg.scissor_maximum_y = maxy - 1;
817
818 cfg.minimum_z = rast->depth_clip_near ? minz : -INFINITY;
819 cfg.maximum_z = rast->depth_clip_far ? maxz : INFINITY;
820 }
821
822 tiler_postfix->viewport = T.gpu;
823 panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy);
824 }
825
826 static mali_ptr
827 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
828 enum pipe_shader_type st,
829 struct panfrost_constant_buffer *buf,
830 unsigned index)
831 {
832 struct pipe_constant_buffer *cb = &buf->cb[index];
833 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
834
835 if (rsrc) {
836 panfrost_batch_add_bo(batch, rsrc->bo,
837 PAN_BO_ACCESS_SHARED |
838 PAN_BO_ACCESS_READ |
839 panfrost_bo_access_for_stage(st));
840
841 /* Alignment guaranteed by
842 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
843 return rsrc->bo->gpu + cb->buffer_offset;
844 } else if (cb->user_buffer) {
845 return panfrost_pool_upload_aligned(&batch->pool,
846 cb->user_buffer +
847 cb->buffer_offset,
848 cb->buffer_size, 16);
849 } else {
850 unreachable("No constant buffer");
851 }
852 }
853
854 struct sysval_uniform {
855 union {
856 float f[4];
857 int32_t i[4];
858 uint32_t u[4];
859 uint64_t du[2];
860 };
861 };
862
863 static void
864 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
865 struct sysval_uniform *uniform)
866 {
867 struct panfrost_context *ctx = batch->ctx;
868 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
869
870 uniform->f[0] = vp->scale[0];
871 uniform->f[1] = vp->scale[1];
872 uniform->f[2] = vp->scale[2];
873 }
874
875 static void
876 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
877 struct sysval_uniform *uniform)
878 {
879 struct panfrost_context *ctx = batch->ctx;
880 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
881
882 uniform->f[0] = vp->translate[0];
883 uniform->f[1] = vp->translate[1];
884 uniform->f[2] = vp->translate[2];
885 }
886
887 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
888 enum pipe_shader_type st,
889 unsigned int sysvalid,
890 struct sysval_uniform *uniform)
891 {
892 struct panfrost_context *ctx = batch->ctx;
893 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
894 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
895 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
896 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
897
898 assert(dim);
899 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
900
901 if (dim > 1)
902 uniform->i[1] = u_minify(tex->texture->height0,
903 tex->u.tex.first_level);
904
905 if (dim > 2)
906 uniform->i[2] = u_minify(tex->texture->depth0,
907 tex->u.tex.first_level);
908
909 if (is_array)
910 uniform->i[dim] = tex->texture->array_size;
911 }
912
913 static void
914 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
915 enum pipe_shader_type st,
916 unsigned ssbo_id,
917 struct sysval_uniform *uniform)
918 {
919 struct panfrost_context *ctx = batch->ctx;
920
921 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
922 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
923
924 /* Compute address */
925 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
926
927 panfrost_batch_add_bo(batch, bo,
928 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
929 panfrost_bo_access_for_stage(st));
930
931 /* Upload address and size as sysval */
932 uniform->du[0] = bo->gpu + sb.buffer_offset;
933 uniform->u[2] = sb.buffer_size;
934 }
935
936 static void
937 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
938 enum pipe_shader_type st,
939 unsigned samp_idx,
940 struct sysval_uniform *uniform)
941 {
942 struct panfrost_context *ctx = batch->ctx;
943 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
944
945 uniform->f[0] = sampl->min_lod;
946 uniform->f[1] = sampl->max_lod;
947 uniform->f[2] = sampl->lod_bias;
948
949 /* Even without any errata, Midgard represents "no mipmapping" as
950 * fixing the LOD with the clamps; keep behaviour consistent. cf.
951 * panfrost_create_sampler_state which also explains our choice of
952 * epsilon value (again to keep behaviour consistent) */
953
954 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
955 uniform->f[1] = uniform->f[0] + (1.0/256.0);
956 }
957
958 static void
959 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
960 struct sysval_uniform *uniform)
961 {
962 struct panfrost_context *ctx = batch->ctx;
963
964 uniform->u[0] = ctx->compute_grid->grid[0];
965 uniform->u[1] = ctx->compute_grid->grid[1];
966 uniform->u[2] = ctx->compute_grid->grid[2];
967 }
968
969 static void
970 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
971 struct panfrost_shader_state *ss,
972 enum pipe_shader_type st)
973 {
974 struct sysval_uniform *uniforms = (void *)buf;
975
976 for (unsigned i = 0; i < ss->sysval_count; ++i) {
977 int sysval = ss->sysval[i];
978
979 switch (PAN_SYSVAL_TYPE(sysval)) {
980 case PAN_SYSVAL_VIEWPORT_SCALE:
981 panfrost_upload_viewport_scale_sysval(batch,
982 &uniforms[i]);
983 break;
984 case PAN_SYSVAL_VIEWPORT_OFFSET:
985 panfrost_upload_viewport_offset_sysval(batch,
986 &uniforms[i]);
987 break;
988 case PAN_SYSVAL_TEXTURE_SIZE:
989 panfrost_upload_txs_sysval(batch, st,
990 PAN_SYSVAL_ID(sysval),
991 &uniforms[i]);
992 break;
993 case PAN_SYSVAL_SSBO:
994 panfrost_upload_ssbo_sysval(batch, st,
995 PAN_SYSVAL_ID(sysval),
996 &uniforms[i]);
997 break;
998 case PAN_SYSVAL_NUM_WORK_GROUPS:
999 panfrost_upload_num_work_groups_sysval(batch,
1000 &uniforms[i]);
1001 break;
1002 case PAN_SYSVAL_SAMPLER:
1003 panfrost_upload_sampler_sysval(batch, st,
1004 PAN_SYSVAL_ID(sysval),
1005 &uniforms[i]);
1006 break;
1007 default:
1008 assert(0);
1009 }
1010 }
1011 }
1012
1013 static const void *
1014 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1015 unsigned index)
1016 {
1017 struct pipe_constant_buffer *cb = &buf->cb[index];
1018 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1019
1020 if (rsrc)
1021 return rsrc->bo->cpu;
1022 else if (cb->user_buffer)
1023 return cb->user_buffer;
1024 else
1025 unreachable("No constant buffer");
1026 }
1027
1028 void
1029 panfrost_emit_const_buf(struct panfrost_batch *batch,
1030 enum pipe_shader_type stage,
1031 struct mali_vertex_tiler_postfix *postfix)
1032 {
1033 struct panfrost_context *ctx = batch->ctx;
1034 struct panfrost_shader_variants *all = ctx->shader[stage];
1035
1036 if (!all)
1037 return;
1038
1039 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1040
1041 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1042
1043 /* Uniforms are implicitly UBO #0 */
1044 bool has_uniforms = buf->enabled_mask & (1 << 0);
1045
1046 /* Allocate room for the sysval and the uniforms */
1047 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1048 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1049 size_t size = sys_size + uniform_size;
1050 struct panfrost_transfer transfer =
1051 panfrost_pool_alloc_aligned(&batch->pool, size, 16);
1052
1053 /* Upload sysvals requested by the shader */
1054 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1055
1056 /* Upload uniforms */
1057 if (has_uniforms && uniform_size) {
1058 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1059 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1060 }
1061
1062 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1063 * uploaded, so it's always included. The count is the highest UBO
1064 * addressable -- gaps are included. */
1065
1066 unsigned ubo_count = 32 - __builtin_clz(buf->enabled_mask | 1);
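/* e.g. enabled_mask = 0b1001 gives ubo_count = 4; UBOs 1 and 2 are gaps but
 * still get zeroed entries in the table below. */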
1067
1068 size_t sz = MALI_UNIFORM_BUFFER_LENGTH * ubo_count;
1069 struct panfrost_transfer ubos =
1070 panfrost_pool_alloc_aligned(&batch->pool, sz,
1071 MALI_UNIFORM_BUFFER_LENGTH);
1072
1073 uint64_t *ubo_ptr = (uint64_t *) ubos.cpu;
1074
1075 /* Upload uniforms as a UBO */
1076
1077 if (size) {
1078 pan_pack(ubo_ptr, UNIFORM_BUFFER, cfg) {
1079 cfg.entries = DIV_ROUND_UP(size, 16);
1080 cfg.pointer = transfer.gpu;
1081 }
1082 } else {
1083 *ubo_ptr = 0;
1084 }
1085
1086 /* The rest are honest-to-goodness UBOs */
1087
1088 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1089 size_t usz = buf->cb[ubo].buffer_size;
1090 bool enabled = buf->enabled_mask & (1 << ubo);
1091 bool empty = usz == 0;
1092
1093 if (!enabled || empty) {
1094 ubo_ptr[ubo] = 0;
1095 continue;
1096 }
1097
1098 pan_pack(ubo_ptr + ubo, UNIFORM_BUFFER, cfg) {
1099 cfg.entries = DIV_ROUND_UP(usz, 16);
1100 cfg.pointer = panfrost_map_constant_buffer_gpu(batch,
1101 stage, buf, ubo);
1102 }
1103 }
1104
1105 postfix->uniforms = transfer.gpu;
1106 postfix->uniform_buffers = ubos.gpu;
1107
1108 buf->dirty_mask = 0;
1109 }
1110
1111 void
1112 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1113 const struct pipe_grid_info *info,
1114 struct midgard_payload_vertex_tiler *vtp)
1115 {
1116 struct panfrost_context *ctx = batch->ctx;
1117 struct panfrost_device *dev = pan_device(ctx->base.screen);
1118 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1119 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1120 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1121 128));
1122
1123 unsigned log2_instances =
1124 util_logbase2_ceil(info->grid[0]) +
1125 util_logbase2_ceil(info->grid[1]) +
1126 util_logbase2_ceil(info->grid[2]);
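/* Each grid dimension is rounded up to a power of two, so 1 << log2_instances
 * is at least grid[0] * grid[1] * grid[2]; the allocation below may be larger
 * than strictly necessary. */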
1127
1128 unsigned shared_size = single_size * (1 << log2_instances) * dev->core_count;
1129 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1130 shared_size,
1131 1);
1132
1133 struct mali_shared_memory shared = {
1134 .shared_memory = bo->gpu,
1135 .shared_workgroup_count = log2_instances,
1136 .shared_shift = util_logbase2(single_size) + 1
1137 };
1138
1139 vtp->postfix.shared_memory = panfrost_pool_upload_aligned(&batch->pool, &shared,
1140 sizeof(shared), 64);
1141 }
1142
1143 static mali_ptr
1144 panfrost_get_tex_desc(struct panfrost_batch *batch,
1145 enum pipe_shader_type st,
1146 struct panfrost_sampler_view *view)
1147 {
1148 if (!view)
1149 return (mali_ptr) 0;
1150
1151 struct pipe_sampler_view *pview = &view->base;
1152 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1153
1154 /* Add the BO to the job so it's retained until the job is done. */
1155
1156 panfrost_batch_add_bo(batch, rsrc->bo,
1157 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1158 panfrost_bo_access_for_stage(st));
1159
1160 panfrost_batch_add_bo(batch, view->bo,
1161 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1162 panfrost_bo_access_for_stage(st));
1163
1164 return view->bo->gpu;
1165 }
1166
1167 static void
1168 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
1169 struct pipe_context *pctx)
1170 {
1171 struct panfrost_resource *rsrc = pan_resource(view->base.texture);
1172 if (view->texture_bo != rsrc->bo->gpu ||
1173 view->modifier != rsrc->modifier) {
1174 panfrost_bo_unreference(view->bo);
1175 panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
1176 }
1177 }
1178
1179 void
1180 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1181 enum pipe_shader_type stage,
1182 struct mali_vertex_tiler_postfix *postfix)
1183 {
1184 struct panfrost_context *ctx = batch->ctx;
1185 struct panfrost_device *device = pan_device(ctx->base.screen);
1186
1187 if (!ctx->sampler_view_count[stage])
1188 return;
1189
1190 if (device->quirks & IS_BIFROST) {
1191 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1192 MALI_BIFROST_TEXTURE_LENGTH *
1193 ctx->sampler_view_count[stage],
1194 MALI_BIFROST_TEXTURE_LENGTH);
1195
1196 struct mali_bifrost_texture_packed *out =
1197 (struct mali_bifrost_texture_packed *) T.cpu;
1198
1199 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1200 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1201 struct pipe_sampler_view *pview = &view->base;
1202 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1203
1204 panfrost_update_sampler_view(view, &ctx->base);
1205 out[i] = view->bifrost_descriptor;
1206
1207 /* Add the BOs to the job so they are retained until the job is done. */
1208
1209 panfrost_batch_add_bo(batch, rsrc->bo,
1210 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1211 panfrost_bo_access_for_stage(stage));
1212
1213 panfrost_batch_add_bo(batch, view->bo,
1214 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1215 panfrost_bo_access_for_stage(stage));
1216 }
1217
1218 postfix->textures = T.gpu;
1219 } else {
1220 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1221
1222 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1223 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1224
1225 panfrost_update_sampler_view(view, &ctx->base);
1226
1227 trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1228 }
1229
1230 postfix->textures = panfrost_pool_upload_aligned(&batch->pool,
1231 trampolines,
1232 sizeof(uint64_t) *
1233 ctx->sampler_view_count[stage],
1234 sizeof(uint64_t));
1235 }
1236 }
1237
1238 void
1239 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1240 enum pipe_shader_type stage,
1241 struct mali_vertex_tiler_postfix *postfix)
1242 {
1243 struct panfrost_context *ctx = batch->ctx;
1244
1245 if (!ctx->sampler_count[stage])
1246 return;
1247
1248 size_t desc_size = MALI_BIFROST_SAMPLER_LENGTH;
1249 assert(MALI_BIFROST_SAMPLER_LENGTH == MALI_MIDGARD_SAMPLER_LENGTH);
1250
1251 size_t sz = desc_size * ctx->sampler_count[stage];
1252 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool, sz, desc_size);
1253 struct mali_midgard_sampler_packed *out = (struct mali_midgard_sampler_packed *) T.cpu;
1254
1255 for (unsigned i = 0; i < ctx->sampler_count[stage]; ++i)
1256 out[i] = ctx->samplers[stage][i]->hw;
1257
1258 postfix->sampler_descriptor = T.gpu;
1259 }
1260
1261 void
1262 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1263 struct mali_vertex_tiler_postfix *vertex_postfix)
1264 {
1265 struct panfrost_context *ctx = batch->ctx;
1266 struct panfrost_vertex_state *so = ctx->vertex;
1267 struct panfrost_shader_state *vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1268
1269 unsigned instance_shift = vertex_postfix->instance_shift;
1270 unsigned instance_odd = vertex_postfix->instance_odd;
1271
1272 /* Worst case: everything is NPOT, which is only possible if instancing
1273 * is enabled. Otherwise a single record is guaranteed */
1274 bool could_npot = instance_shift || instance_odd;
1275
1276 struct panfrost_transfer S = panfrost_pool_alloc_aligned(&batch->pool,
1277 MALI_ATTRIBUTE_BUFFER_LENGTH * vs->attribute_count *
1278 (could_npot ? 2 : 1),
1279 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
1280
1281 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1282 MALI_ATTRIBUTE_LENGTH * vs->attribute_count,
1283 MALI_ATTRIBUTE_LENGTH);
1284
1285 struct mali_attribute_buffer_packed *bufs =
1286 (struct mali_attribute_buffer_packed *) S.cpu;
1287
1288 struct mali_attribute_packed *out =
1289 (struct mali_attribute_packed *) T.cpu;
1290
1291 unsigned attrib_to_buffer[PIPE_MAX_ATTRIBS] = { 0 };
1292 unsigned k = 0;
1293
1294 for (unsigned i = 0; i < so->num_elements; ++i) {
1295 /* We map buffers 1:1 with the attributes, which
1296 * means duplicating some vertex buffers (who cares? aside from
1297 * maybe some caching implications but I somehow doubt that
1298 * matters) */
1299
1300 struct pipe_vertex_element *elem = &so->pipe[i];
1301 unsigned vbi = elem->vertex_buffer_index;
1302 attrib_to_buffer[i] = k;
1303
1304 if (!(ctx->vb_mask & (1 << vbi)))
1305 continue;
1306
1307 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1308 struct panfrost_resource *rsrc;
1309
1310 rsrc = pan_resource(buf->buffer.resource);
1311 if (!rsrc)
1312 continue;
1313
1314 /* Add a dependency of the batch on the vertex buffer */
1315 panfrost_batch_add_bo(batch, rsrc->bo,
1316 PAN_BO_ACCESS_SHARED |
1317 PAN_BO_ACCESS_READ |
1318 PAN_BO_ACCESS_VERTEX_TILER);
1319
1320 /* Mask off lower bits, see offset fixup below */
1321 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1322 mali_ptr addr = raw_addr & ~63;
1323
1324 /* Since we advanced the base pointer, we shrink the buffer
1325 * size, but add the offset we subtracted */
1326 unsigned size = rsrc->base.width0 + (raw_addr - addr)
1327 - buf->buffer_offset;
1328
1329 /* When there is a divisor, the hardware-level divisor is
1330 * the product of the instance divisor and the padded count */
1331 unsigned divisor = elem->instance_divisor;
1332 unsigned hw_divisor = ctx->padded_count * divisor;
1333 unsigned stride = buf->stride;
1334
1335 /* If there's a divisor (=1) but no instancing, we want every
1336 * attribute to be the same */
1337
1338 if (divisor && ctx->instance_count == 1)
1339 stride = 0;
1340
1341 if (!divisor || ctx->instance_count <= 1) {
1342 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1343 if (ctx->instance_count > 1)
1344 cfg.type = MALI_ATTRIBUTE_TYPE_1D_MODULUS;
1345
1346 cfg.pointer = addr;
1347 cfg.stride = stride;
1348 cfg.size = size;
1349 cfg.divisor_r = instance_shift;
1350 cfg.divisor_p = instance_odd;
1351 }
1352 } else if (util_is_power_of_two_or_zero(hw_divisor)) {
1353 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1354 cfg.type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
1355 cfg.pointer = addr;
1356 cfg.stride = stride;
1357 cfg.size = size;
1358 cfg.divisor_r = __builtin_ctz(hw_divisor);
1359 }
1360
1361 } else {
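/* Non-power-of-two divisors use a "magic" (multiply-by-reciprocal) divisor:
 * panfrost_compute_magic_divisor derives a fixed-point numerator plus a
 * shift and extra flags, and the continuation record below carries the
 * numerator together with the original API divisor. */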
1362 unsigned shift = 0, extra_flags = 0;
1363
1364 unsigned magic_divisor =
1365 panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags);
1366
1367 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1368 cfg.type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
1369 cfg.pointer = addr;
1370 cfg.stride = stride;
1371 cfg.size = size;
1372
1373 cfg.divisor_r = shift;
1374 cfg.divisor_e = extra_flags;
1375 }
1376
1377 pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) {
1378 cfg.divisor_numerator = magic_divisor;
1379 cfg.divisor = divisor;
1380 }
1381
1382 ++k;
1383 }
1384
1385 ++k;
1386 }
1387
1388 /* Add special gl_VertexID/gl_InstanceID buffers */
1389
1390 if (unlikely(vs->attribute_count >= PAN_VERTEX_ID)) {
1391 panfrost_vertex_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1392
1393 pan_pack(out + PAN_VERTEX_ID, ATTRIBUTE, cfg) {
1394 cfg.buffer_index = k++;
1395 cfg.format = so->formats[PAN_VERTEX_ID];
1396 }
1397
1398 panfrost_instance_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1399
1400 pan_pack(out + PAN_INSTANCE_ID, ATTRIBUTE, cfg) {
1401 cfg.buffer_index = k++;
1402 cfg.format = so->formats[PAN_INSTANCE_ID];
1403 }
1404 }
1405
1406 /* Attribute addresses require 64-byte alignment, so let:
1407 *
1408 * base' = base & ~63 = base - (base & 63)
1409 * offset' = offset + (base & 63)
1410 *
1411 * Since base' + offset' = base + offset, these are equivalent
1412 * addressing modes and now base is 64 aligned.
1413 */
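/* For example, base = 0x1003 with offset = 4 becomes base' = 0x1000 and
 * offset' = 7: the sum is unchanged and the pointer is now 64-byte aligned. */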
1414
1415 unsigned start = vertex_postfix->offset_start;
1416
1417 for (unsigned i = 0; i < so->num_elements; ++i) {
1418 unsigned vbi = so->pipe[i].vertex_buffer_index;
1419 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1420
1421 /* Adjust by the masked off bits of the offset. Make sure we
1422 * read src_offset from so->hw (which is not GPU visible)
1423 * rather than target (which is) due to caching effects */
1424
1425 unsigned src_offset = so->pipe[i].src_offset;
1426
1427 /* BOs aligned to 4k so guaranteed aligned to 64 */
1428 src_offset += (buf->buffer_offset & 63);
1429
1430 /* Also, somewhat obscurely, per-instance data needs to be
1431 * offset in response to a delayed start in an indexed draw */
1432
1433 if (so->pipe[i].instance_divisor && ctx->instance_count > 1 && start)
1434 src_offset -= buf->stride * start;
1435
1436 pan_pack(out + i, ATTRIBUTE, cfg) {
1437 cfg.buffer_index = attrib_to_buffer[i];
1438 cfg.format = so->formats[i];
1439 cfg.offset = src_offset;
1440 }
1441 }
1442
1443 vertex_postfix->attributes = S.gpu;
1444 vertex_postfix->attribute_meta = T.gpu;
1445 }
1446
1447 static mali_ptr
1448 panfrost_emit_varyings(struct panfrost_batch *batch,
1449 struct mali_attribute_buffer_packed *slot,
1450 unsigned stride, unsigned count)
1451 {
1452 unsigned size = stride * count;
1453 mali_ptr ptr = panfrost_pool_alloc_aligned(&batch->invisible_pool, size, 64).gpu;
1454
1455 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1456 cfg.stride = stride;
1457 cfg.size = size;
1458 cfg.pointer = ptr;
1459 }
1460
1461 return ptr;
1462 }
1463
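/* Returns the low six bits of a stream-out target's capture address.
 * panfrost_emit_streamout below aligns the buffer pointer down to 64 bytes,
 * and this residue is added back into each capturing record's offset. */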
1464 static unsigned
1465 panfrost_streamout_offset(unsigned stride, unsigned offset,
1466 struct pipe_stream_output_target *target)
1467 {
1468 return (target->buffer_offset + (offset * stride * 4)) & 63;
1469 }
1470
1471 static void
1472 panfrost_emit_streamout(struct panfrost_batch *batch,
1473 struct mali_attribute_buffer_packed *slot,
1474 unsigned stride_words, unsigned offset, unsigned count,
1475 struct pipe_stream_output_target *target)
1476 {
1477 unsigned stride = stride_words * 4;
1478 unsigned max_size = target->buffer_size;
1479 unsigned expected_size = stride * count;
1480
1481 /* Grab the BO and bind it to the batch */
1482 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1483
1484 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1485 * the perspective of the TILER and FRAGMENT.
1486 */
1487 panfrost_batch_add_bo(batch, bo,
1488 PAN_BO_ACCESS_SHARED |
1489 PAN_BO_ACCESS_RW |
1490 PAN_BO_ACCESS_VERTEX_TILER |
1491 PAN_BO_ACCESS_FRAGMENT);
1492
1493 /* We will have an offset applied to get alignment */
1494 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * stride);
1495
1496 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1497 cfg.pointer = (addr & ~63);
1498 cfg.stride = stride;
1499 cfg.size = MIN2(max_size, expected_size) + (addr & 63);
1500 }
1501 }
1502
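/* point_sprite_mask uses bits 0-7 for the generic TEX0-TEX7 coordinates and
 * bit 8 for gl_PointCoord (PNTC). */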
1503 static bool
1504 has_point_coord(unsigned mask, gl_varying_slot loc)
1505 {
1506 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1507 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1508 else if (loc == VARYING_SLOT_PNTC)
1509 return (mask & (1 << 8));
1510 else
1511 return false;
1512 }
1513
1514 /* Helpers for manipulating stream out information so we can pack varyings
1515 * accordingly. Compute the src_offset for a given captured varying */
1516
1517 static struct pipe_stream_output *
1518 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1519 {
1520 for (unsigned i = 0; i < info->num_outputs; ++i) {
1521 if (info->output[i].register_index == loc)
1522 return &info->output[i];
1523 }
1524
1525 unreachable("Varying not captured");
1526 }
1527
1528 static unsigned
1529 pan_varying_size(enum mali_format fmt)
1530 {
1531 unsigned type = MALI_EXTRACT_TYPE(fmt);
1532 unsigned chan = MALI_EXTRACT_CHANNELS(fmt);
1533 unsigned bits = MALI_EXTRACT_BITS(fmt);
1534 unsigned bpc = 0;
1535
1536 if (bits == MALI_CHANNEL_FLOAT) {
1537 /* No doubles */
1538 bool fp16 = (type == MALI_FORMAT_SINT);
1539 assert(fp16 || (type == MALI_FORMAT_UNORM));
1540
1541 bpc = fp16 ? 2 : 4;
1542 } else {
1543 assert(type >= MALI_FORMAT_SNORM && type <= MALI_FORMAT_SINT);
1544
1545 /* See the enums */
1546 bits = 1 << bits;
1547 assert(bits >= 8);
1548 bpc = bits / 8;
1549 }
1550
1551 return bpc * chan;
1552 }
1553
1554 /* Indices for named (non-XFB) varyings that are present. These are packed
1555 * tightly so they correspond to a bitfield present (P) indexed by (1 <<
1556 * PAN_VARY_*). This has the nice property that you can lookup the buffer index
1557 * of a given special field given a shift S by:
1558 *
1559 * idx = popcount(P & ((1 << S) - 1))
1560 *
1561 * That is, look at all of the varyings that come earlier and count them; that
1562 * count is the index of this one. Likewise, the total number of special
1563 * buffers required is simply popcount(P)
1564 */
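/* For example, with present = 0b01011, the buffer index for the special
 * varying at shift S = 3 is popcount(0b01011 & 0b111) = 2. */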
1565
1566 enum pan_special_varying {
1567 PAN_VARY_GENERAL = 0,
1568 PAN_VARY_POSITION = 1,
1569 PAN_VARY_PSIZ = 2,
1570 PAN_VARY_PNTCOORD = 3,
1571 PAN_VARY_FACE = 4,
1572 PAN_VARY_FRAGCOORD = 5,
1573
1574 /* Keep last */
1575 PAN_VARY_MAX,
1576 };
1577
1578 /* Given a varying, figure out which index it corresponds to */
1579
1580 static inline unsigned
1581 pan_varying_index(unsigned present, enum pan_special_varying v)
1582 {
1583 unsigned mask = (1 << v) - 1;
1584 return util_bitcount(present & mask);
1585 }
1586
1587 /* Get the base offset for XFB buffers, which by convention come after
1588 * everything else. Wrapper function for semantic reasons; by construction this
1589 * is just popcount. */
1590
1591 static inline unsigned
1592 pan_xfb_base(unsigned present)
1593 {
1594 return util_bitcount(present);
1595 }
1596
1597 /* Computes the present mask for varyings so we can start emitting varying records */
1598
1599 static inline unsigned
1600 pan_varying_present(
1601 struct panfrost_shader_state *vs,
1602 struct panfrost_shader_state *fs,
1603 unsigned quirks)
1604 {
1605 /* At the moment we always emit general and position buffers. Not
1606 * strictly necessary but usually harmless */
1607
1608 unsigned present = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION);
1609
1610 /* Enable special buffers by the shader info */
1611
1612 if (vs->writes_point_size)
1613 present |= (1 << PAN_VARY_PSIZ);
1614
1615 if (fs->reads_point_coord)
1616 present |= (1 << PAN_VARY_PNTCOORD);
1617
1618 if (fs->reads_face)
1619 present |= (1 << PAN_VARY_FACE);
1620
1621 if (fs->reads_frag_coord && !(quirks & IS_BIFROST))
1622 present |= (1 << PAN_VARY_FRAGCOORD);
1623
1624 /* Also, if we have a point sprite, we need a point coord buffer */
1625
1626 for (unsigned i = 0; i < fs->varying_count; i++) {
1627 gl_varying_slot loc = fs->varyings_loc[i];
1628
1629 if (has_point_coord(fs->point_sprite_mask, loc))
1630 present |= (1 << PAN_VARY_PNTCOORD);
1631 }
1632
1633 return present;
1634 }
1635
1636 /* Emitters for varying records */
1637
1638 static void
1639 pan_emit_vary(struct mali_attribute_packed *out,
1640 unsigned present, enum pan_special_varying buf,
1641 unsigned quirks, enum mali_format format,
1642 unsigned offset)
1643 {
1644 unsigned nr_channels = MALI_EXTRACT_CHANNELS(format);
1645 unsigned swizzle = quirks & HAS_SWIZZLES ?
1646 panfrost_get_default_swizzle(nr_channels) :
1647 panfrost_bifrost_swizzle(nr_channels);
1648
1649 pan_pack(out, ATTRIBUTE, cfg) {
1650 cfg.buffer_index = pan_varying_index(present, buf);
1651 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1652 cfg.format = (format << 12) | swizzle;
1653 cfg.offset = offset;
1654 }
1655 }
1656
1657 /* General varying that is unused */
1658
1659 static void
1660 pan_emit_vary_only(struct mali_attribute_packed *out,
1661 unsigned present, unsigned quirks)
1662 {
1663 pan_emit_vary(out, present, 0, quirks, MALI_VARYING_DISCARD, 0);
1664 }
1665
1666 /* Special records */
1667
1668 static const enum mali_format pan_varying_formats[PAN_VARY_MAX] = {
1669 [PAN_VARY_POSITION] = MALI_VARYING_POS,
1670 [PAN_VARY_PSIZ] = MALI_R16F,
1671 [PAN_VARY_PNTCOORD] = MALI_R16F,
1672 [PAN_VARY_FACE] = MALI_R32I,
1673 [PAN_VARY_FRAGCOORD] = MALI_RGBA32F
1674 };
1675
1676 static void
1677 pan_emit_vary_special(struct mali_attribute_packed *out,
1678 unsigned present, enum pan_special_varying buf,
1679 unsigned quirks)
1680 {
1681 assert(buf < PAN_VARY_MAX);
1682 pan_emit_vary(out, present, buf, quirks, pan_varying_formats[buf], 0);
1683 }
1684
1685 static enum mali_format
1686 pan_xfb_format(enum mali_format format, unsigned nr)
1687 {
1688 if (MALI_EXTRACT_BITS(format) == MALI_CHANNEL_FLOAT)
1689 return MALI_R32F | MALI_NR_CHANNELS(nr);
1690 else
1691 return MALI_EXTRACT_TYPE(format) | MALI_NR_CHANNELS(nr) | MALI_CHANNEL_32;
1692 }
1693
1694 /* Transform feedback records. Note struct pipe_stream_output is (if packed as
1695 * a bitfield) 32-bit, smaller than a 64-bit pointer, so may as well pass by
1696 * value. */
1697
1698 static void
1699 pan_emit_vary_xfb(struct mali_attribute_packed *out,
1700 unsigned present,
1701 unsigned max_xfb,
1702 unsigned *streamout_offsets,
1703 unsigned quirks,
1704 enum mali_format format,
1705 struct pipe_stream_output o)
1706 {
1707 unsigned swizzle = quirks & HAS_SWIZZLES ?
1708 panfrost_get_default_swizzle(o.num_components) :
1709 panfrost_bifrost_swizzle(o.num_components);
1710
1711 pan_pack(out, ATTRIBUTE, cfg) {
1712 /* XFB buffers come after everything else */
1713 cfg.buffer_index = pan_xfb_base(present) + o.output_buffer;
1714 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1715
1716 /* Override number of channels and precision to highp */
1717 cfg.format = (pan_xfb_format(format, o.num_components) << 12) | swizzle;
1718
1719 /* Apply given offsets together */
1720 cfg.offset = (o.dst_offset * 4) /* dwords */
1721 + streamout_offsets[o.output_buffer];
1722 }
1723 }
1724
1725 /* Determine if we should capture a varying for XFB. This requires actually
1726 * having a buffer for it. If we don't capture it, we fall back to a general
1727 * varying path (linked or unlinked, possibly discarding the write). */
1728
1729 static bool
1730 panfrost_xfb_captured(struct panfrost_shader_state *xfb,
1731 unsigned loc, unsigned max_xfb)
1732 {
1733 if (!(xfb->so_mask & (1ll << loc)))
1734 return false;
1735
1736 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1737 return o->output_buffer < max_xfb;
1738 }
1739
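/* Emit a record for a general varying by looking the location up in the other
 * shader stage. If the stages are not linked for this location, a discard
 * record is emitted instead; if should_alloc is set (the vertex pass), space
 * for the varying is also reserved in the general varying buffer */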
1740 static void
1741 pan_emit_general_varying(struct mali_attribute_packed *out,
1742 struct panfrost_shader_state *other,
1743 struct panfrost_shader_state *xfb,
1744 gl_varying_slot loc,
1745 enum mali_format format,
1746 unsigned present,
1747 unsigned quirks,
1748 unsigned *gen_offsets,
1749 enum mali_format *gen_formats,
1750 unsigned *gen_stride,
1751 unsigned idx,
1752 bool should_alloc)
1753 {
1754 /* Check if we're linked */
1755 signed other_idx = -1;
1756
1757 for (unsigned j = 0; j < other->varying_count; ++j) {
1758 if (other->varyings_loc[j] == loc) {
1759 other_idx = j;
1760 break;
1761 }
1762 }
1763
1764 if (other_idx < 0) {
1765 pan_emit_vary_only(out, present, quirks);
1766 return;
1767 }
1768
1769 unsigned offset = gen_offsets[other_idx];
1770
1771 if (should_alloc) {
1772 /* We're linked, so allocate space via a watermark allocation, bumping *gen_stride */
1773 enum mali_format alt = other->varyings[other_idx];
1774
1775 /* Do interpolation at minimum precision */
1776 unsigned size_main = pan_varying_size(format);
1777 unsigned size_alt = pan_varying_size(alt);
1778 unsigned size = MIN2(size_main, size_alt);
1779
1780 /* If a varying is marked for XFB but not actually captured, we
1781 * should match the format to the format that would otherwise
1782 * be used for XFB, since dEQP checks for invariance here. It's
1783 * unclear if this is required by the spec. */
1784
1785 if (xfb->so_mask & (1ull << loc)) {
1786 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1787 format = pan_xfb_format(format, o->num_components);
1788 size = pan_varying_size(format);
1789 } else if (size == size_alt) {
1790 format = alt;
1791 }
1792
1793 gen_offsets[idx] = *gen_stride;
1794 gen_formats[other_idx] = format;
1795 offset = *gen_stride;
1796 *gen_stride += size;
1797 }
1798
1799 pan_emit_vary(out, present, PAN_VARY_GENERAL, quirks, format, offset);
1800 }
1801
1802 /* Higher-level wrapper around the emitters above, classifying a varying as a
1803 * point coord, XFB capture, special, or general record and dispatching */
1804
1805 static void
1806 panfrost_emit_varying(
1807 struct mali_attribute_packed *out,
1808 struct panfrost_shader_state *stage,
1809 struct panfrost_shader_state *other,
1810 struct panfrost_shader_state *xfb,
1811 unsigned present,
1812 unsigned max_xfb,
1813 unsigned *streamout_offsets,
1814 unsigned quirks,
1815 unsigned *gen_offsets,
1816 enum mali_format *gen_formats,
1817 unsigned *gen_stride,
1818 unsigned idx,
1819 bool should_alloc,
1820 bool is_fragment)
1821 {
1822 gl_varying_slot loc = stage->varyings_loc[idx];
1823 enum mali_format format = stage->varyings[idx];
1824
1825 /* Override format to match linkage */
1826 if (!should_alloc && gen_formats[idx])
1827 format = gen_formats[idx];
1828
1829 if (has_point_coord(stage->point_sprite_mask, loc)) {
1830 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1831 } else if (panfrost_xfb_captured(xfb, loc, max_xfb)) {
1832 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1833 pan_emit_vary_xfb(out, present, max_xfb, streamout_offsets, quirks, format, *o);
1834 } else if (loc == VARYING_SLOT_POS) {
1835 if (is_fragment)
1836 pan_emit_vary_special(out, present, PAN_VARY_FRAGCOORD, quirks);
1837 else
1838 pan_emit_vary_special(out, present, PAN_VARY_POSITION, quirks);
1839 } else if (loc == VARYING_SLOT_PSIZ) {
1840 pan_emit_vary_special(out, present, PAN_VARY_PSIZ, quirks);
1841 } else if (loc == VARYING_SLOT_PNTC) {
1842 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1843 } else if (loc == VARYING_SLOT_FACE) {
1844 pan_emit_vary_special(out, present, PAN_VARY_FACE, quirks);
1845 } else {
1846 pan_emit_general_varying(out, other, xfb, loc, format, present,
1847 quirks, gen_offsets, gen_formats, gen_stride,
1848 idx, should_alloc);
1849 }
1850 }
1851
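/* Emit an attribute buffer record for a hardware-generated special input
 * (point coord, front facing, fragment coord), but only if the corresponding
 * bit is set in the present mask */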
1852 static void
1853 pan_emit_special_input(struct mali_attribute_buffer_packed *out,
1854 unsigned present,
1855 enum pan_special_varying v,
1856 unsigned special)
1857 {
1858 if (present & (1 << v)) {
1859 unsigned idx = pan_varying_index(present, v);
1860
1861 pan_pack(out + idx, ATTRIBUTE_BUFFER, cfg) {
1862 cfg.special = special;
1863 cfg.type = 0;
1864 }
1865 }
1866 }
1867
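/* Emit the varying descriptors for a draw: per-varying attribute records for
 * the vertex and fragment shaders (in a single allocation), followed by the
 * attribute buffers backing them -- general varyings, gl_Position, point
 * size, streamout targets and any special inputs that are present */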
1868 void
1869 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1870 unsigned vertex_count,
1871 struct mali_vertex_tiler_postfix *vertex_postfix,
1872 struct mali_vertex_tiler_postfix *tiler_postfix,
1873 union midgard_primitive_size *primitive_size)
1874 {
1875 /* Load the shaders */
1876 struct panfrost_context *ctx = batch->ctx;
1877 struct panfrost_device *dev = pan_device(ctx->base.screen);
1878 struct panfrost_shader_state *vs, *fs;
1879 size_t vs_size, fs_size;
1880
1881 /* Allocate the varying descriptor */
1882
1883 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1884 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1885 vs_size = MALI_ATTRIBUTE_LENGTH * vs->varying_count;
1886 fs_size = MALI_ATTRIBUTE_LENGTH * fs->varying_count;
1887
1888 struct panfrost_transfer trans = panfrost_pool_alloc_aligned(
1889 &batch->pool, vs_size + fs_size, MALI_ATTRIBUTE_LENGTH);
1890
1891 struct pipe_stream_output_info *so = &vs->stream_output;
1892 unsigned present = pan_varying_present(vs, fs, dev->quirks);
1893
1894 /* Check if this varying is linked by us. This is the case for
1895 * general-purpose, non-captured varyings. If it is, link it. If it's
1896 * not, use the provided stream out information to determine the
1897 * offset, since it was already linked for us. */
1898
1899 unsigned gen_offsets[32];
1900 enum mali_format gen_formats[32];
1901 memset(gen_offsets, 0, sizeof(gen_offsets));
1902 memset(gen_formats, 0, sizeof(gen_formats));
1903
1904 unsigned gen_stride = 0;
1905 assert(vs->varying_count < ARRAY_SIZE(gen_offsets));
1906 assert(fs->varying_count < ARRAY_SIZE(gen_offsets));
1907
1908 unsigned streamout_offsets[32];
1909
1910 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1911 streamout_offsets[i] = panfrost_streamout_offset(
1912 so->stride[i],
1913 ctx->streamout.offsets[i],
1914 ctx->streamout.targets[i]);
1915 }
1916
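/* Vertex shader records sit at the start of the allocation, fragment shader
 * records immediately after them */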
1917 struct mali_attribute_packed *ovs = (struct mali_attribute_packed *)trans.cpu;
1918 struct mali_attribute_packed *ofs = ovs + vs->varying_count;
1919
1920 for (unsigned i = 0; i < vs->varying_count; i++) {
1921 panfrost_emit_varying(ovs + i, vs, fs, vs, present,
1922 ctx->streamout.num_targets, streamout_offsets,
1923 dev->quirks,
1924 gen_offsets, gen_formats, &gen_stride, i, true, false);
1925 }
1926
1927 for (unsigned i = 0; i < fs->varying_count; i++) {
1928 panfrost_emit_varying(ofs + i, fs, vs, vs, present,
1929 ctx->streamout.num_targets, streamout_offsets,
1930 dev->quirks,
1931 gen_offsets, gen_formats, &gen_stride, i, false, true);
1932 }
1933
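/* With the records emitted, allocate the attribute buffers they index: the
 * varying buffers named in the present mask come first, followed by one
 * buffer per streamout target */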
1934 unsigned xfb_base = pan_xfb_base(present);
1935 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1936 MALI_ATTRIBUTE_BUFFER_LENGTH * (xfb_base + ctx->streamout.num_targets),
1937 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
1938 struct mali_attribute_buffer_packed *varyings =
1939 (struct mali_attribute_buffer_packed *) T.cpu;
1940
1941 /* Emit the stream out buffers */
1942
1943 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1944 ctx->vertex_count);
1945
1946 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1947 panfrost_emit_streamout(batch, &varyings[xfb_base + i],
1948 so->stride[i],
1949 ctx->streamout.offsets[i],
1950 out_count,
1951 ctx->streamout.targets[i]);
1952 }
1953
1954 panfrost_emit_varyings(batch,
1955 &varyings[pan_varying_index(present, PAN_VARY_GENERAL)],
1956 gen_stride, vertex_count);
1957
1958 /* fp32 vec4 gl_Position */
1959 tiler_postfix->position_varying = panfrost_emit_varyings(batch,
1960 &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
1961 sizeof(float) * 4, vertex_count);
1962
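/* fp16 gl_PointSize, only emitted when the draw needs it */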
1963 if (present & (1 << PAN_VARY_PSIZ)) {
1964 primitive_size->pointer = panfrost_emit_varyings(batch,
1965 &varyings[pan_varying_index(present, PAN_VARY_PSIZ)],
1966 2, vertex_count);
1967 }
1968
1969 pan_emit_special_input(varyings, present, PAN_VARY_PNTCOORD, MALI_ATTRIBUTE_SPECIAL_POINT_COORD);
1970 pan_emit_special_input(varyings, present, PAN_VARY_FACE, MALI_ATTRIBUTE_SPECIAL_FRONT_FACING);
1971 pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD, MALI_ATTRIBUTE_SPECIAL_FRAG_COORD);
1972
1973 vertex_postfix->varyings = T.gpu;
1974 tiler_postfix->varyings = T.gpu;
1975
1976 vertex_postfix->varying_meta = trans.gpu;
1977 tiler_postfix->varying_meta = trans.gpu + vs_size;
1978 }
1979
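/* Wrap the assembled prefix/postfix pairs into the Bifrost or Midgard payload
 * layout, then queue the vertex job followed by a tiler job depending on it.
 * With rasterizer discard only the vertex job is submitted; wallpaper blits
 * are instead injected in reverse order with predicted job indices (see the
 * hack below) */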
1980 void
1981 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
1982 struct mali_vertex_tiler_prefix *vertex_prefix,
1983 struct mali_vertex_tiler_postfix *vertex_postfix,
1984 struct mali_vertex_tiler_prefix *tiler_prefix,
1985 struct mali_vertex_tiler_postfix *tiler_postfix,
1986 union midgard_primitive_size *primitive_size)
1987 {
1988 struct panfrost_context *ctx = batch->ctx;
1989 struct panfrost_device *device = pan_device(ctx->base.screen);
1990 bool wallpapering = ctx->wallpaper_batch && batch->scoreboard.tiler_dep;
1991 struct bifrost_payload_vertex bifrost_vertex = {0,};
1992 struct bifrost_payload_tiler bifrost_tiler = {0,};
1993 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
1994 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
1995 void *vp, *tp;
1996 size_t vp_size, tp_size;
1997
1998 if (device->quirks & IS_BIFROST) {
1999 bifrost_vertex.prefix = *vertex_prefix;
2000 bifrost_vertex.postfix = *vertex_postfix;
2001 vp = &bifrost_vertex;
2002 vp_size = sizeof(bifrost_vertex);
2003
2004 bifrost_tiler.prefix = *tiler_prefix;
2005 bifrost_tiler.tiler.primitive_size = *primitive_size;
2006 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
2007 bifrost_tiler.postfix = *tiler_postfix;
2008 tp = &bifrost_tiler;
2009 tp_size = sizeof(bifrost_tiler);
2010 } else {
2011 midgard_vertex.prefix = *vertex_prefix;
2012 midgard_vertex.postfix = *vertex_postfix;
2013 vp = &midgard_vertex;
2014 vp_size = sizeof(midgard_vertex);
2015
2016 midgard_tiler.prefix = *tiler_prefix;
2017 midgard_tiler.postfix = *tiler_postfix;
2018 midgard_tiler.primitive_size = *primitive_size;
2019 tp = &midgard_tiler;
2020 tp_size = sizeof(midgard_tiler);
2021 }
2022
2023 if (wallpapering) {
2024 /* Inject in reverse order, with "predicted" job indices.
2025 * THIS IS A HACK XXX */
2026 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false,
2027 batch->scoreboard.job_index + 2, tp, tp_size, true);
2028 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2029 vp, vp_size, true);
2030 return;
2031 }
2032
2033 /* If rasterizer discard is enabled, only submit the vertex job */
2034
2035 unsigned vertex = panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2036 vp, vp_size, false);
2037
2038 if (ctx->rasterizer->base.rasterizer_discard)
2039 return;
2040
2041 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false, vertex, tp, tp_size,
2042 false);
2043 }
2044
2045 /* TODO: stop hardcoding this */
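/* The table below holds 96 uint16_t values, i.e. 48 (x, y) pairs; (128, 128)
 * appears to correspond to the pixel centre in this encoding */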
2046 mali_ptr
2047 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2048 {
2049 uint16_t locations[] = {
2050 128, 128,
2051 0, 256,
2052 0, 256,
2053 0, 256,
2054 0, 256,
2055 0, 256,
2056 0, 256,
2057 0, 256,
2058 0, 256,
2059 0, 256,
2060 0, 256,
2061 0, 256,
2062 0, 256,
2063 0, 256,
2064 0, 256,
2065 0, 256,
2066 0, 256,
2067 0, 256,
2068 0, 256,
2069 0, 256,
2070 0, 256,
2071 0, 256,
2072 0, 256,
2073 0, 256,
2074 0, 256,
2075 0, 256,
2076 0, 256,
2077 0, 256,
2078 0, 256,
2079 0, 256,
2080 0, 256,
2081 0, 256,
2082 128, 128,
2083 0, 0,
2084 0, 0,
2085 0, 0,
2086 0, 0,
2087 0, 0,
2088 0, 0,
2089 0, 0,
2090 0, 0,
2091 0, 0,
2092 0, 0,
2093 0, 0,
2094 0, 0,
2095 0, 0,
2096 0, 0,
2097 0, 0,
2098 };
2099
2100 return panfrost_pool_upload_aligned(&batch->pool, locations, 96 * sizeof(uint16_t), 64);
2101 }