c214dd6ba726a3458de27d809826b91fff4e861c
[mesa.git] / src / gallium / drivers / panfrost / pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_pool.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 struct mali_shared_memory shared = {
62 .shared_workgroup_count = ~0,
63 };
64
65 if (batch->stack_size) {
66 struct panfrost_bo *stack =
67 panfrost_batch_get_scratchpad(batch, batch->stack_size,
68 dev->thread_tls_alloc,
69 dev->core_count);
70
71 shared.stack_shift = panfrost_get_stack_shift(batch->stack_size);
72 shared.scratchpad = stack->gpu;
73 }
74
75 postfix->shared_memory = panfrost_pool_upload_aligned(&batch->pool, &shared, sizeof(shared), 64);
76 }
77
78 static void
79 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
80 struct mali_vertex_tiler_postfix *postfix)
81 {
82 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
83 postfix->shared_memory = panfrost_batch_reserve_framebuffer(batch);
84 }
85
86 static void
87 panfrost_vt_update_rasterizer(struct panfrost_rasterizer *rasterizer,
88 struct mali_vertex_tiler_prefix *prefix,
89 struct mali_vertex_tiler_postfix *postfix)
90 {
91 postfix->gl_enables |= 0x7;
92 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
93 rasterizer->base.front_ccw);
94 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
95 (rasterizer->base.cull_face & PIPE_FACE_FRONT));
96 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
97 (rasterizer->base.cull_face & PIPE_FACE_BACK));
98 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
99 rasterizer->base.flatshade_first);
100 }
101
102 void
103 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
104 struct mali_vertex_tiler_prefix *prefix,
105 union midgard_primitive_size *primitive_size)
106 {
107 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
108
109 if (!panfrost_writes_point_size(ctx)) {
110 float val = (prefix->draw_mode == MALI_DRAW_MODE_POINTS) ?
111 rasterizer->base.point_size :
112 rasterizer->base.line_width;
113
114 primitive_size->constant = val;
115 }
116 }
117
118 static void
119 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
120 struct mali_vertex_tiler_postfix *postfix)
121 {
122 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
123 if (ctx->occlusion_query) {
124 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
125 panfrost_batch_add_bo(ctx->batch, ctx->occlusion_query->bo,
126 PAN_BO_ACCESS_SHARED |
127 PAN_BO_ACCESS_RW |
128 PAN_BO_ACCESS_FRAGMENT);
129 } else {
130 postfix->occlusion_counter = 0;
131 }
132 }
133
134 void
135 panfrost_vt_init(struct panfrost_context *ctx,
136 enum pipe_shader_type stage,
137 struct mali_vertex_tiler_prefix *prefix,
138 struct mali_vertex_tiler_postfix *postfix)
139 {
140 struct panfrost_device *device = pan_device(ctx->base.screen);
141
142 if (!ctx->shader[stage])
143 return;
144
145 memset(prefix, 0, sizeof(*prefix));
146 memset(postfix, 0, sizeof(*postfix));
147
148 if (device->quirks & IS_BIFROST) {
149 postfix->gl_enables = 0x2;
150 panfrost_vt_emit_shared_memory(ctx, postfix);
151 } else {
152 postfix->gl_enables = 0x6;
153 panfrost_vt_attach_framebuffer(ctx, postfix);
154 }
155
156 if (stage == PIPE_SHADER_FRAGMENT) {
157 panfrost_vt_update_occlusion_query(ctx, postfix);
158 panfrost_vt_update_rasterizer(ctx->rasterizer, prefix, postfix);
159 }
160 }
161
162 static unsigned
163 panfrost_translate_index_size(unsigned size)
164 {
165 switch (size) {
166 case 1:
167 return MALI_DRAW_INDEXED_UINT8;
168
169 case 2:
170 return MALI_DRAW_INDEXED_UINT16;
171
172 case 4:
173 return MALI_DRAW_INDEXED_UINT32;
174
175 default:
176 unreachable("Invalid index size");
177 }
178 }
179
180 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
181 * good for the duration of the draw (transient), though it could last longer. Also
182 * gets the bounds on the index buffer for the range accessed by the draw. We do
183 * these operations together because there are natural optimizations which
184 * require them to be together. */
185
186 static mali_ptr
187 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
188 const struct pipe_draw_info *info,
189 unsigned *min_index, unsigned *max_index)
190 {
191 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
192 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
193 off_t offset = info->start * info->index_size;
194 bool needs_indices = true;
195 mali_ptr out = 0;
196
197 if (info->max_index != ~0u) {
198 *min_index = info->min_index;
199 *max_index = info->max_index;
200 needs_indices = false;
201 }
202
203 if (!info->has_user_indices) {
204 /* Only resources can be directly mapped */
205 panfrost_batch_add_bo(batch, rsrc->bo,
206 PAN_BO_ACCESS_SHARED |
207 PAN_BO_ACCESS_READ |
208 PAN_BO_ACCESS_VERTEX_TILER);
209 out = rsrc->bo->gpu + offset;
210
211 /* Check the cache */
212 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
213 info->start,
214 info->count,
215 min_index,
216 max_index);
217 } else {
218 /* Otherwise, we need to upload to transient memory */
219 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
220 struct panfrost_transfer T =
221 panfrost_pool_alloc_aligned(&batch->pool,
222 info->count * info->index_size,
223 info->index_size);
224
225 memcpy(T.cpu, ibuf8 + offset, info->count * info->index_size);
226 out = T.gpu;
227 }
228
229 if (needs_indices) {
230 /* Fallback */
231 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
232
233 if (!info->has_user_indices)
234 panfrost_minmax_cache_add(rsrc->index_cache,
235 info->start, info->count,
236 *min_index, *max_index);
237 }
238
239 return out;
240 }
241
242 void
243 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
244 const struct pipe_draw_info *info,
245 enum mali_draw_mode draw_mode,
246 struct mali_vertex_tiler_postfix *vertex_postfix,
247 struct mali_vertex_tiler_prefix *tiler_prefix,
248 struct mali_vertex_tiler_postfix *tiler_postfix,
249 unsigned *vertex_count,
250 unsigned *padded_count)
251 {
252 tiler_prefix->draw_mode = draw_mode;
253
254 unsigned draw_flags = 0;
255
256 if (panfrost_writes_point_size(ctx))
257 draw_flags |= MALI_DRAW_VARYING_SIZE;
258
259 if (info->primitive_restart)
260 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
261
262 /* These don't make much sense */
263
264 draw_flags |= 0x3000;
265
266 if (info->index_size) {
267 unsigned min_index = 0, max_index = 0;
268
269 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
270 info,
271 &min_index,
272 &max_index);
273
274 /* Use the corresponding values */
275 *vertex_count = max_index - min_index + 1;
276 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
277 tiler_prefix->offset_bias_correction = -min_index;
278 tiler_prefix->index_count = MALI_POSITIVE(info->count);
279 draw_flags |= panfrost_translate_index_size(info->index_size);
280 } else {
281 tiler_prefix->indices = 0;
282 *vertex_count = ctx->vertex_count;
283 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
284 tiler_prefix->offset_bias_correction = 0;
285 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
286 }
287
288 tiler_prefix->unknown_draw = draw_flags;
289
290 /* Encode the padded vertex count */
291
292 if (info->instance_count > 1) {
293 *padded_count = panfrost_padded_vertex_count(*vertex_count);
294
295 unsigned shift = __builtin_ctz(ctx->padded_count);
296 unsigned k = ctx->padded_count >> (shift + 1);
297
298 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
299 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
300 } else {
301 *padded_count = *vertex_count;
302
303 /* Reset instancing state */
304 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
305 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
306 }
307 }
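/*
 * Worked example of the encoding above: instance_shift/instance_odd split the
 * padded count into a power-of-two part and an odd part, which the hardware
 * presumably reconstructs as (2 * instance_odd + 1) << instance_shift. With a
 * padded count of 12: shift = ctz(12) = 2, odd = 12 >> 3 = 1, and indeed
 * (2 * 1 + 1) << 2 = 12.
 */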
308
309 static void
310 panfrost_emit_compute_shader(struct panfrost_context *ctx,
311 enum pipe_shader_type st,
312 struct mali_shader_meta *meta)
313 {
314 const struct panfrost_device *dev = pan_device(ctx->base.screen);
315 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
316
317 memset(meta, 0, sizeof(*meta));
318 meta->shader = ss->shader;
319 meta->attribute_count = ss->attribute_count;
320 meta->varying_count = ss->varying_count;
321 meta->texture_count = ctx->sampler_view_count[st];
322 meta->sampler_count = ctx->sampler_count[st];
323
324 if (dev->quirks & IS_BIFROST) {
325 struct mali_bifrost_properties_packed prop;
326
327 pan_pack(&prop, BIFROST_PROPERTIES, cfg) {
328 cfg.unknown = 0x800000; /* XXX */
329 cfg.uniform_buffer_count = panfrost_ubo_count(ctx, st);
330 }
331
332 memcpy(&meta->bifrost_props, &prop, sizeof(prop));
333
334 meta->bifrost2.preload_regs = 0xC0;
335 meta->bifrost2.uniform_count = ss->uniform_count;
336 } else {
337 struct mali_midgard_properties_packed prop;
338
339 pan_pack(&prop, MIDGARD_PROPERTIES, cfg) {
340 cfg.uniform_buffer_count = panfrost_ubo_count(ctx, st);
341 cfg.uniform_count = ss->uniform_count;
342 cfg.work_register_count = ss->work_reg_count;
343 cfg.writes_globals = ss->writes_global;
344 cfg.suppress_inf_nan = true; /* XXX */
345 }
346
347 memcpy(&meta->midgard_props, &prop, sizeof(prop));
348 }
349 }
350
351 static unsigned
352 translate_tex_wrap(enum pipe_tex_wrap w)
353 {
354 switch (w) {
355 case PIPE_TEX_WRAP_REPEAT: return MALI_WRAP_MODE_REPEAT;
356 case PIPE_TEX_WRAP_CLAMP: return MALI_WRAP_MODE_CLAMP;
357 case PIPE_TEX_WRAP_CLAMP_TO_EDGE: return MALI_WRAP_MODE_CLAMP_TO_EDGE;
358 case PIPE_TEX_WRAP_CLAMP_TO_BORDER: return MALI_WRAP_MODE_CLAMP_TO_BORDER;
359 case PIPE_TEX_WRAP_MIRROR_REPEAT: return MALI_WRAP_MODE_MIRRORED_REPEAT;
360 case PIPE_TEX_WRAP_MIRROR_CLAMP: return MALI_WRAP_MODE_MIRRORED_CLAMP;
361 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE;
362 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER;
363 default: unreachable("Invalid wrap");
364 }
365 }
366
367 /* The hardware compares in the wrong order, so we have to flip before
368 * encoding. Yes, really. */
369
370 static enum mali_func
371 panfrost_sampler_compare_func(const struct pipe_sampler_state *cso)
372 {
373 if (!cso->compare_mode)
374 return MALI_FUNC_NEVER;
375
376 enum mali_func f = panfrost_translate_compare_func(cso->compare_func);
377 return panfrost_flip_compare_func(f);
378 }
379
380 static enum mali_mipmap_mode
381 pan_pipe_to_mipmode(enum pipe_tex_mipfilter f)
382 {
383 switch (f) {
384 case PIPE_TEX_MIPFILTER_NEAREST: return MALI_MIPMAP_MODE_NEAREST;
385 case PIPE_TEX_MIPFILTER_LINEAR: return MALI_MIPMAP_MODE_TRILINEAR;
386 case PIPE_TEX_MIPFILTER_NONE: return MALI_MIPMAP_MODE_NONE;
387 default: unreachable("Invalid");
388 }
389 }
390
391 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
392 struct mali_midgard_sampler_packed *hw)
393 {
394 pan_pack(hw, MIDGARD_SAMPLER, cfg) {
395 cfg.magnify_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
396 cfg.minify_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
397 cfg.mipmap_mode = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR) ?
398 MALI_MIPMAP_MODE_TRILINEAR : MALI_MIPMAP_MODE_NEAREST;
399 cfg.normalized_coordinates = cso->normalized_coords;
400
401 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
402
403 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
404
405 /* If necessary, we disable mipmapping in the sampler descriptor by
406 * clamping the LOD as tight as possible (from 0 to epsilon,
407 * essentially -- remember these are fixed point numbers, so
408 * epsilon=1/256) */
409
410 cfg.maximum_lod = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) ?
411 cfg.minimum_lod + 1 :
412 FIXED_16(cso->max_lod, false);
413
414 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
415 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
416 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
417
418 cfg.compare_function = panfrost_sampler_compare_func(cso);
419 cfg.seamless_cube_map = cso->seamless_cube_map;
420
421 cfg.border_color_r = cso->border_color.f[0];
422 cfg.border_color_g = cso->border_color.f[1];
423 cfg.border_color_b = cso->border_color.f[2];
424 cfg.border_color_a = cso->border_color.f[3];
425 }
426 }
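/*
 * For instance, with min_lod = 0 and PIPE_TEX_MIPFILTER_NONE the pack above
 * yields maximum_lod = minimum_lod + 1 in fixed point, i.e. an effective LOD
 * range of [0, 1/256], which pins sampling to the base level as described.
 */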
427
428 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
429 struct mali_bifrost_sampler_packed *hw)
430 {
431 pan_pack(hw, BIFROST_SAMPLER, cfg) {
432 cfg.magnify_linear = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR;
433 cfg.minify_linear = cso->min_img_filter == PIPE_TEX_FILTER_LINEAR;
434 cfg.mipmap_mode = pan_pipe_to_mipmode(cso->min_mip_filter);
435 cfg.normalized_coordinates = cso->normalized_coords;
436
437 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
438 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
439 cfg.maximum_lod = FIXED_16(cso->max_lod, false);
440
441 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
442 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
443 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
444
445 cfg.compare_function = panfrost_sampler_compare_func(cso);
446 cfg.seamless_cube_map = cso->seamless_cube_map;
447 }
448 }
449
450 static bool
451 panfrost_fs_required(
452 struct panfrost_shader_state *fs,
453 struct panfrost_blend_final *blend,
454 unsigned rt_count)
455 {
456 /* If we generally have side effects */
457 if (fs->fs_sidefx)
458 return true;
459
460 /* If colour is written we need to execute */
461 for (unsigned i = 0; i < rt_count; ++i) {
462 if (!blend[i].no_colour)
463 return true;
464 }
465
466 /* If depth is written and not implied we need to execute.
467 * TODO: Predicate on Z/S writes being enabled */
468 return (fs->writes_depth || fs->writes_stencil);
469 }
470
471 static void
472 panfrost_emit_blend(struct panfrost_batch *batch, void *rts,
473 struct panfrost_blend_final *blend)
474 {
475 const struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
476 struct panfrost_shader_state *fs = panfrost_get_shader_state(batch->ctx, PIPE_SHADER_FRAGMENT);
477 unsigned rt_count = batch->key.nr_cbufs;
478
479 struct bifrost_blend_rt *brts = rts;
480 struct midgard_blend_rt *mrts = rts;
481
482 /* Disable blending for depth-only on Bifrost */
483
484 if (rt_count == 0 && dev->quirks & IS_BIFROST)
485 brts[0].unk2 = 0x3;
486
487 for (unsigned i = 0; i < rt_count; ++i) {
488 unsigned flags = 0;
489
490 pan_pack(&flags, BLEND_FLAGS, cfg) {
491 if (blend[i].no_colour) {
492 cfg.enable = false;
493 break;
494 }
495
496 batch->draws |= (PIPE_CLEAR_COLOR0 << i);
497
498 cfg.srgb = util_format_is_srgb(batch->key.cbufs[i]->format);
499 cfg.load_destination = blend[i].load_dest;
500 cfg.dither_disable = !batch->ctx->blend->base.dither;
501
502 if (!(dev->quirks & IS_BIFROST))
503 cfg.midgard_blend_shader = blend[i].is_shader;
504 }
505
506 if (dev->quirks & IS_BIFROST) {
507 brts[i].flags = flags;
508
509 if (blend[i].is_shader) {
510 /* The blend shader's address needs to be at
511 * the same top 32 bits as the fragment shader.
512 * TODO: Ensure that's always the case.
513 */
514 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
515 (fs->bo->gpu & (0xffffffffull << 32)));
516 brts[i].shader = blend[i].shader.gpu;
517 brts[i].unk2 = 0x0;
518 } else {
519 enum pipe_format format = batch->key.cbufs[i]->format;
520 const struct util_format_description *format_desc;
521 format_desc = util_format_description(format);
522
523 brts[i].equation = blend[i].equation.equation;
524
525 /* TODO: this is a bit more complicated */
526 brts[i].constant = blend[i].equation.constant;
527
528 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
529
530 /* 0x19 disables blending and forces REPLACE
531 * mode (equivalent to rgb_mode = alpha_mode =
532 * 0x122, colour mask = 0xF). 0x1a allows
533 * blending. */
534 brts[i].unk2 = blend[i].opaque ? 0x19 : 0x1a;
535
536 brts[i].shader_type = fs->blend_types[i];
537 }
538 } else {
539 memcpy(&mrts[i].flags, &flags, sizeof(flags));
540
541 if (blend[i].is_shader) {
542 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
543 } else {
544 mrts[i].blend.equation = blend[i].equation.equation;
545 mrts[i].blend.constant = blend[i].equation.constant;
546 }
547 }
548 }
549 }
550
551 static void
552 panfrost_emit_frag_shader(struct panfrost_context *ctx,
553 struct mali_shader_meta *fragmeta,
554 struct panfrost_blend_final *blend)
555 {
556 const struct panfrost_device *dev = pan_device(ctx->base.screen);
557 struct panfrost_shader_state *fs;
558
559 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
560
561 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
562 const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
563 unsigned rt_count = ctx->pipe_framebuffer.nr_cbufs;
564
565 memset(fragmeta, 0, sizeof(*fragmeta));
566
567 fragmeta->shader = fs->shader;
568 fragmeta->attribute_count = fs->attribute_count;
569 fragmeta->varying_count = fs->varying_count;
570 fragmeta->texture_count = ctx->sampler_view_count[PIPE_SHADER_FRAGMENT];
571 fragmeta->sampler_count = ctx->sampler_count[PIPE_SHADER_FRAGMENT];
572
573 if (dev->quirks & IS_BIFROST) {
574 struct mali_bifrost_properties_packed prop;
575
576 bool no_blend = true;
577
578 for (unsigned i = 0; i < rt_count; ++i)
579 no_blend &= (!blend[i].load_dest | blend[i].no_colour);
580
581 pan_pack(&prop, BIFROST_PROPERTIES, cfg) {
582 cfg.unknown = 0x950020; /* XXX */
583 cfg.uniform_buffer_count = panfrost_ubo_count(ctx, PIPE_SHADER_FRAGMENT);
584 cfg.early_z_enable = !fs->can_discard && !fs->writes_depth && no_blend;
585 }
586
587 memcpy(&fragmeta->bifrost_props, &prop, sizeof(prop));
588
589 fragmeta->bifrost2.preload_regs = 0x1;
590 SET_BIT(fragmeta->bifrost2.preload_regs, 0x10, fs->reads_frag_coord);
591
592 fragmeta->bifrost2.uniform_count = fs->uniform_count;
593 } else {
594 struct mali_midgard_properties_packed prop;
595
596 /* Reasons to disable early-Z from a shader perspective */
597 bool late_z = fs->can_discard || fs->writes_global ||
598 fs->writes_depth || fs->writes_stencil;
599
600 /* Reasons to disable early-Z from a CSO perspective */
601 bool alpha_to_coverage = ctx->blend->base.alpha_to_coverage;
602
603 /* If either depth or stencil is enabled, discard matters */
604 bool zs_enabled =
605 (zsa->base.depth.enabled && zsa->base.depth.func != PIPE_FUNC_ALWAYS) ||
606 zsa->base.stencil[0].enabled;
607
608 bool has_blend_shader = false;
609
610 for (unsigned c = 0; c < rt_count; ++c)
611 has_blend_shader |= blend[c].is_shader;
612
613 pan_pack(&prop, MIDGARD_PROPERTIES, cfg) {
614 cfg.uniform_buffer_count = panfrost_ubo_count(ctx, PIPE_SHADER_FRAGMENT);
615 cfg.uniform_count = fs->uniform_count;
616 cfg.work_register_count = fs->work_reg_count;
617 cfg.writes_globals = fs->writes_global;
618 cfg.suppress_inf_nan = true; /* XXX */
619
620 /* TODO: Reduce this limit? */
621 if (has_blend_shader)
622 cfg.work_register_count = MAX2(cfg.work_register_count, 8);
623
624 cfg.stencil_from_shader = fs->writes_stencil;
625 cfg.helper_invocation_enable = fs->helper_invocations;
626 cfg.depth_source = fs->writes_depth ?
627 MALI_DEPTH_SOURCE_SHADER :
628 MALI_DEPTH_SOURCE_FIXED_FUNCTION;
629
630 /* Depend on other state */
631 cfg.early_z_enable = !(late_z || alpha_to_coverage);
632 cfg.reads_tilebuffer = fs->outputs_read || (!zs_enabled && fs->can_discard);
633 cfg.reads_depth_stencil = zs_enabled && fs->can_discard;
634 }
635
636 memcpy(&fragmeta->midgard_props, &prop, sizeof(prop));
637 }
638
639 bool msaa = rast->multisample;
640 fragmeta->coverage_mask = msaa ? ctx->sample_mask : ~0;
641
642 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x10;
643 fragmeta->unknown2_4 = 0x4e0;
644
645 /* TODO: Sample size */
646 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
647 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
648
649 /* EXT_shader_framebuffer_fetch requires the shader to be run
650 * per-sample when outputs are read. */
651 bool per_sample = ctx->min_samples > 1 || fs->outputs_read;
652 SET_BIT(fragmeta->unknown2_3, MALI_PER_SAMPLE, msaa && per_sample);
653
654 fragmeta->depth_units = rast->offset_units * 2.0f;
655 fragmeta->depth_factor = rast->offset_scale;
656
657 /* XXX: Which bit is which? Does this maybe allow offsetting not-tri? */
658
659 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, rast->offset_tri);
660 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, rast->offset_tri);
661
662 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_NEAR, rast->depth_clip_near);
663 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_FAR, rast->depth_clip_far);
664
665 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
666 zsa->base.stencil[0].enabled);
667
668 fragmeta->stencil_mask_front = zsa->stencil_mask_front;
669 fragmeta->stencil_mask_back = zsa->stencil_mask_back;
670
671 /* Bottom bits for stencil ref, exactly one word */
672 fragmeta->stencil_front.opaque[0] = zsa->stencil_front.opaque[0] | ctx->stencil_ref.ref_value[0];
673
674 /* If back-stencil is not enabled, use the front values */
675
676 if (zsa->base.stencil[1].enabled)
677 fragmeta->stencil_back.opaque[0] = zsa->stencil_back.opaque[0] | ctx->stencil_ref.ref_value[1];
678 else
679 fragmeta->stencil_back = fragmeta->stencil_front;
680
681 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
682 zsa->base.depth.writemask);
683
684 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
685 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(
686 zsa->base.depth.enabled ? zsa->base.depth.func : PIPE_FUNC_ALWAYS));
687
688 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
689 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
690 !ctx->blend->base.dither);
691
692 SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
693
694 SET_BIT(fragmeta->unknown2_4, MALI_ALPHA_TO_COVERAGE,
695 ctx->blend->base.alpha_to_coverage);
696
697 /* Disable shader execution if we can */
698 if (dev->quirks & MIDGARD_SHADERLESS
699 && !panfrost_fs_required(fs, blend, rt_count)) {
700 fragmeta->shader = 0x1;
701 fragmeta->attribute_count = 0;
702 fragmeta->varying_count = 0;
703 fragmeta->texture_count = 0;
704 fragmeta->sampler_count = 0;
705
706 /* This feature is not known to work on Bifrost */
707 struct mali_midgard_properties_packed prop;
708
709 pan_pack(&prop, MIDGARD_PROPERTIES, cfg) {
710 cfg.work_register_count = 1;
711 cfg.depth_source = MALI_DEPTH_SOURCE_FIXED_FUNCTION;
712 cfg.early_z_enable = true;
713 }
714
715 memcpy(&fragmeta->midgard_props, &prop, sizeof(prop));
716 }
717
718 if (dev->quirks & MIDGARD_SFBD) {
719 /* On platforms with only a single render target (SFBD), the blend
720 * information is inside the shader meta itself. We additionally
721 * need to signal CAN_DISCARD for nontrivial blend modes (so
722 * we're able to read back the destination buffer) */
723
724 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
725 blend[0].is_shader);
726
727 if (blend[0].is_shader) {
728 fragmeta->blend.shader = blend[0].shader.gpu |
729 blend[0].shader.first_tag;
730 } else {
731 fragmeta->blend.equation = blend[0].equation.equation;
732 fragmeta->blend.constant = blend[0].equation.constant;
733 }
734
735 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
736 blend[0].load_dest);
737 } else if (!(dev->quirks & IS_BIFROST)) {
738 /* Bug where MRT-capable hw apparently reads the last blend
739 * shader from here instead of the usual location? */
740
741 for (signed rt = ((signed) rt_count - 1); rt >= 0; --rt) {
742 if (!blend[rt].is_shader)
743 continue;
744
745 fragmeta->blend.shader = blend[rt].shader.gpu |
746 blend[rt].shader.first_tag;
747 break;
748 }
749 }
750 }
751
752 void
753 panfrost_emit_shader_meta(struct panfrost_batch *batch,
754 enum pipe_shader_type st,
755 struct mali_vertex_tiler_postfix *postfix)
756 {
757 struct panfrost_context *ctx = batch->ctx;
758 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
759
760 if (!ss) {
761 postfix->shader = 0;
762 return;
763 }
764
765 struct mali_shader_meta meta;
766
767 /* Add the shader BO to the batch. */
768 panfrost_batch_add_bo(batch, ss->bo,
769 PAN_BO_ACCESS_PRIVATE |
770 PAN_BO_ACCESS_READ |
771 panfrost_bo_access_for_stage(st));
772
773 mali_ptr shader_ptr;
774
775 if (st == PIPE_SHADER_FRAGMENT) {
776 struct panfrost_device *dev = pan_device(ctx->base.screen);
777 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
778 size_t desc_size = sizeof(meta);
779 void *rts = NULL;
780 struct panfrost_transfer xfer;
781 unsigned rt_size;
782
783 if (dev->quirks & MIDGARD_SFBD)
784 rt_size = 0;
785 else if (dev->quirks & IS_BIFROST)
786 rt_size = sizeof(struct bifrost_blend_rt);
787 else
788 rt_size = sizeof(struct midgard_blend_rt);
789
790 desc_size += rt_size * rt_count;
791
792 if (rt_size)
793 rts = rzalloc_size(ctx, rt_size * rt_count);
794
795 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
796
797 for (unsigned c = 0; c < ctx->pipe_framebuffer.nr_cbufs; ++c)
798 blend[c] = panfrost_get_blend_for_context(ctx, c);
799
800 panfrost_emit_frag_shader(ctx, &meta, blend);
801
802 if (!(dev->quirks & MIDGARD_SFBD))
803 panfrost_emit_blend(batch, rts, blend);
804 else
805 batch->draws |= PIPE_CLEAR_COLOR0;
806
807 xfer = panfrost_pool_alloc_aligned(&batch->pool, desc_size, sizeof(meta));
808
809 memcpy(xfer.cpu, &meta, sizeof(meta));
810 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
811
812 if (rt_size)
813 ralloc_free(rts);
814
815 shader_ptr = xfer.gpu;
816 } else {
817 panfrost_emit_compute_shader(ctx, st, &meta);
818
819 shader_ptr = panfrost_pool_upload(&batch->pool, &meta,
820 sizeof(meta));
821 }
822
823 postfix->shader = shader_ptr;
824 }
825
826 void
827 panfrost_emit_viewport(struct panfrost_batch *batch,
828 struct mali_vertex_tiler_postfix *tiler_postfix)
829 {
830 struct panfrost_context *ctx = batch->ctx;
831 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
832 const struct pipe_scissor_state *ss = &ctx->scissor;
833 const struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
834 const struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer;
835
836 /* Derive min/max from translate/scale. Note since |x| >= 0 by
837 * definition, we have that -|x| <= |x| hence translate - |scale| <=
838 * translate + |scale|, so the ordering is correct here. */
839 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
840 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
841 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
842 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
843 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
844 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
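/*
 * For example, an 800x600 viewport mapped with the usual GL convention has
 * translate[0] = 400 and scale[0] = 400, giving vp_minx = 0 and vp_maxx = 800;
 * a flipped Y axis only negates scale[1], which is why fabsf() is used above.
 */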
845
846 /* Scissor to the intersection of the viewport and the scissor, clamped
847 * to the framebuffer */
848
849 unsigned minx = MIN2(fb->width, vp_minx);
850 unsigned maxx = MIN2(fb->width, vp_maxx);
851 unsigned miny = MIN2(fb->height, vp_miny);
852 unsigned maxy = MIN2(fb->height, vp_maxy);
853
854 if (ss && rast->scissor) {
855 minx = MAX2(ss->minx, minx);
856 miny = MAX2(ss->miny, miny);
857 maxx = MIN2(ss->maxx, maxx);
858 maxy = MIN2(ss->maxy, maxy);
859 }
860
861 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool, MALI_VIEWPORT_LENGTH);
862
863 pan_pack(T.cpu, VIEWPORT, cfg) {
864 cfg.scissor_minimum_x = minx;
865 cfg.scissor_minimum_y = miny;
866 cfg.scissor_maximum_x = maxx - 1;
867 cfg.scissor_maximum_y = maxy - 1;
868
869 cfg.minimum_z = rast->depth_clip_near ? minz : -INFINITY;
870 cfg.maximum_z = rast->depth_clip_far ? maxz : INFINITY;
871 }
872
873 tiler_postfix->viewport = T.gpu;
874 panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy);
875 }
876
877 static mali_ptr
878 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
879 enum pipe_shader_type st,
880 struct panfrost_constant_buffer *buf,
881 unsigned index)
882 {
883 struct pipe_constant_buffer *cb = &buf->cb[index];
884 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
885
886 if (rsrc) {
887 panfrost_batch_add_bo(batch, rsrc->bo,
888 PAN_BO_ACCESS_SHARED |
889 PAN_BO_ACCESS_READ |
890 panfrost_bo_access_for_stage(st));
891
892 /* Alignment guaranteed by
893 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
894 return rsrc->bo->gpu + cb->buffer_offset;
895 } else if (cb->user_buffer) {
896 return panfrost_pool_upload_aligned(&batch->pool,
897 cb->user_buffer +
898 cb->buffer_offset,
899 cb->buffer_size, 16);
900 } else {
901 unreachable("No constant buffer");
902 }
903 }
904
905 struct sysval_uniform {
906 union {
907 float f[4];
908 int32_t i[4];
909 uint32_t u[4];
910 uint64_t du[2];
911 };
912 };
913
914 static void
915 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
916 struct sysval_uniform *uniform)
917 {
918 struct panfrost_context *ctx = batch->ctx;
919 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
920
921 uniform->f[0] = vp->scale[0];
922 uniform->f[1] = vp->scale[1];
923 uniform->f[2] = vp->scale[2];
924 }
925
926 static void
927 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
928 struct sysval_uniform *uniform)
929 {
930 struct panfrost_context *ctx = batch->ctx;
931 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
932
933 uniform->f[0] = vp->translate[0];
934 uniform->f[1] = vp->translate[1];
935 uniform->f[2] = vp->translate[2];
936 }
937
938 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
939 enum pipe_shader_type st,
940 unsigned int sysvalid,
941 struct sysval_uniform *uniform)
942 {
943 struct panfrost_context *ctx = batch->ctx;
944 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
945 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
946 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
947 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
948
949 assert(dim);
950 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
951
952 if (dim > 1)
953 uniform->i[1] = u_minify(tex->texture->height0,
954 tex->u.tex.first_level);
955
956 if (dim > 2)
957 uniform->i[2] = u_minify(tex->texture->depth0,
958 tex->u.tex.first_level);
959
960 if (is_array)
961 uniform->i[dim] = tex->texture->array_size;
962 }
963
964 static void
965 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
966 enum pipe_shader_type st,
967 unsigned ssbo_id,
968 struct sysval_uniform *uniform)
969 {
970 struct panfrost_context *ctx = batch->ctx;
971
972 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
973 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
974
975 /* Compute address */
976 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
977
978 panfrost_batch_add_bo(batch, bo,
979 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
980 panfrost_bo_access_for_stage(st));
981
982 /* Upload address and size as sysval */
983 uniform->du[0] = bo->gpu + sb.buffer_offset;
984 uniform->u[2] = sb.buffer_size;
985 }
986
987 static void
988 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
989 enum pipe_shader_type st,
990 unsigned samp_idx,
991 struct sysval_uniform *uniform)
992 {
993 struct panfrost_context *ctx = batch->ctx;
994 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
995
996 uniform->f[0] = sampl->min_lod;
997 uniform->f[1] = sampl->max_lod;
998 uniform->f[2] = sampl->lod_bias;
999
1000 /* Even without any errata, Midgard represents "no mipmapping" as
1001 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1002 * panfrost_create_sampler_state which also explains our choice of
1003 * epsilon value (again to keep behaviour consistent) */
1004
1005 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1006 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1007 }
1008
1009 static void
1010 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1011 struct sysval_uniform *uniform)
1012 {
1013 struct panfrost_context *ctx = batch->ctx;
1014
1015 uniform->u[0] = ctx->compute_grid->grid[0];
1016 uniform->u[1] = ctx->compute_grid->grid[1];
1017 uniform->u[2] = ctx->compute_grid->grid[2];
1018 }
1019
1020 static void
1021 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1022 struct panfrost_shader_state *ss,
1023 enum pipe_shader_type st)
1024 {
1025 struct sysval_uniform *uniforms = (void *)buf;
1026
1027 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1028 int sysval = ss->sysval[i];
1029
1030 switch (PAN_SYSVAL_TYPE(sysval)) {
1031 case PAN_SYSVAL_VIEWPORT_SCALE:
1032 panfrost_upload_viewport_scale_sysval(batch,
1033 &uniforms[i]);
1034 break;
1035 case PAN_SYSVAL_VIEWPORT_OFFSET:
1036 panfrost_upload_viewport_offset_sysval(batch,
1037 &uniforms[i]);
1038 break;
1039 case PAN_SYSVAL_TEXTURE_SIZE:
1040 panfrost_upload_txs_sysval(batch, st,
1041 PAN_SYSVAL_ID(sysval),
1042 &uniforms[i]);
1043 break;
1044 case PAN_SYSVAL_SSBO:
1045 panfrost_upload_ssbo_sysval(batch, st,
1046 PAN_SYSVAL_ID(sysval),
1047 &uniforms[i]);
1048 break;
1049 case PAN_SYSVAL_NUM_WORK_GROUPS:
1050 panfrost_upload_num_work_groups_sysval(batch,
1051 &uniforms[i]);
1052 break;
1053 case PAN_SYSVAL_SAMPLER:
1054 panfrost_upload_sampler_sysval(batch, st,
1055 PAN_SYSVAL_ID(sysval),
1056 &uniforms[i]);
1057 break;
1058 default:
1059 assert(0);
1060 }
1061 }
1062 }
1063
1064 static const void *
1065 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1066 unsigned index)
1067 {
1068 struct pipe_constant_buffer *cb = &buf->cb[index];
1069 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1070
1071 if (rsrc)
1072 return rsrc->bo->cpu;
1073 else if (cb->user_buffer)
1074 return cb->user_buffer;
1075 else
1076 unreachable("No constant buffer");
1077 }
1078
1079 void
1080 panfrost_emit_const_buf(struct panfrost_batch *batch,
1081 enum pipe_shader_type stage,
1082 struct mali_vertex_tiler_postfix *postfix)
1083 {
1084 struct panfrost_context *ctx = batch->ctx;
1085 struct panfrost_shader_variants *all = ctx->shader[stage];
1086
1087 if (!all)
1088 return;
1089
1090 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1091
1092 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1093
1094 /* Uniforms are implicitly UBO #0 */
1095 bool has_uniforms = buf->enabled_mask & (1 << 0);
1096
1097 /* Allocate room for the sysval and the uniforms */
1098 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1099 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1100 size_t size = sys_size + uniform_size;
1101 struct panfrost_transfer transfer =
1102 panfrost_pool_alloc_aligned(&batch->pool, size, 16);
1103
1104 /* Upload sysvals requested by the shader */
1105 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1106
1107 /* Upload uniforms */
1108 if (has_uniforms && uniform_size) {
1109 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1110 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1111 }
1112
1113 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1114 * uploaded */
1115
1116 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1117 assert(ubo_count >= 1);
1118
1119 size_t sz = MALI_UNIFORM_BUFFER_LENGTH * ubo_count;
1120 struct panfrost_transfer ubos =
1121 panfrost_pool_alloc_aligned(&batch->pool, sz,
1122 MALI_UNIFORM_BUFFER_LENGTH);
1123
1124 uint64_t *ubo_ptr = (uint64_t *) ubos.cpu;
1125
1126 /* Upload uniforms as a UBO */
1127
1128 if (size) {
1129 pan_pack(ubo_ptr, UNIFORM_BUFFER, cfg) {
1130 cfg.entries = DIV_ROUND_UP(size, 16);
1131 cfg.pointer = transfer.gpu;
1132 }
1133 } else {
1134 *ubo_ptr = 0;
1135 }
1136
1137 /* The rest are honest-to-goodness UBOs */
1138
1139 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1140 size_t usz = buf->cb[ubo].buffer_size;
1141 bool enabled = buf->enabled_mask & (1 << ubo);
1142 bool empty = usz == 0;
1143
1144 if (!enabled || empty) {
1145 ubo_ptr[ubo] = 0;
1146 continue;
1147 }
1148
1149 pan_pack(ubo_ptr + ubo, UNIFORM_BUFFER, cfg) {
1150 cfg.entries = DIV_ROUND_UP(usz, 16);
1151 cfg.pointer = panfrost_map_constant_buffer_gpu(batch,
1152 stage, buf, ubo);
1153 }
1154 }
1155
1156 postfix->uniforms = transfer.gpu;
1157 postfix->uniform_buffers = ubos.gpu;
1158
1159 buf->dirty_mask = 0;
1160 }
1161
1162 void
1163 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1164 const struct pipe_grid_info *info,
1165 struct midgard_payload_vertex_tiler *vtp)
1166 {
1167 struct panfrost_context *ctx = batch->ctx;
1168 struct panfrost_device *dev = pan_device(ctx->base.screen);
1169 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1170 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1171 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1172 128));
1173
1174 unsigned log2_instances =
1175 util_logbase2_ceil(info->grid[0]) +
1176 util_logbase2_ceil(info->grid[1]) +
1177 util_logbase2_ceil(info->grid[2]);
1178
1179 unsigned shared_size = single_size * (1 << log2_instances) * dev->core_count;
1180 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1181 shared_size,
1182 1);
1183
1184 struct mali_shared_memory shared = {
1185 .shared_memory = bo->gpu,
1186 .shared_workgroup_count = log2_instances,
1187 .shared_shift = util_logbase2(single_size) + 1
1188 };
1189
1190 vtp->postfix.shared_memory = panfrost_pool_upload_aligned(&batch->pool, &shared,
1191 sizeof(shared), 64);
1192 }
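/*
 * Rough sizing example for the above: a grid of (3, 4, 1) workgroups gives
 * log2_instances = 2 + 2 + 0 = 4; with ss->shared_size = 512 that means
 * single_size = 512, shared_size = 512 * 16 * core_count bytes, and
 * shared_shift = log2(512) + 1 = 10.
 */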
1193
1194 static mali_ptr
1195 panfrost_get_tex_desc(struct panfrost_batch *batch,
1196 enum pipe_shader_type st,
1197 struct panfrost_sampler_view *view)
1198 {
1199 if (!view)
1200 return (mali_ptr) 0;
1201
1202 struct pipe_sampler_view *pview = &view->base;
1203 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1204
1205 /* Add the BO to the job so it's retained until the job is done. */
1206
1207 panfrost_batch_add_bo(batch, rsrc->bo,
1208 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1209 panfrost_bo_access_for_stage(st));
1210
1211 panfrost_batch_add_bo(batch, view->bo,
1212 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1213 panfrost_bo_access_for_stage(st));
1214
1215 return view->bo->gpu;
1216 }
1217
1218 static void
1219 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
1220 struct pipe_context *pctx)
1221 {
1222 struct panfrost_resource *rsrc = pan_resource(view->base.texture);
1223 if (view->texture_bo != rsrc->bo->gpu ||
1224 view->modifier != rsrc->modifier) {
1225 panfrost_bo_unreference(view->bo);
1226 panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
1227 }
1228 }
1229
1230 void
1231 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1232 enum pipe_shader_type stage,
1233 struct mali_vertex_tiler_postfix *postfix)
1234 {
1235 struct panfrost_context *ctx = batch->ctx;
1236 struct panfrost_device *device = pan_device(ctx->base.screen);
1237
1238 if (!ctx->sampler_view_count[stage])
1239 return;
1240
1241 if (device->quirks & IS_BIFROST) {
1242 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1243 MALI_BIFROST_TEXTURE_LENGTH *
1244 ctx->sampler_view_count[stage],
1245 MALI_BIFROST_TEXTURE_LENGTH);
1246
1247 struct mali_bifrost_texture_packed *out =
1248 (struct mali_bifrost_texture_packed *) T.cpu;
1249
1250 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1251 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1252 struct pipe_sampler_view *pview = &view->base;
1253 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1254
1255 panfrost_update_sampler_view(view, &ctx->base);
1256 out[i] = view->bifrost_descriptor;
1257
1258 /* Add the BOs to the job so they are retained until the job is done. */
1259
1260 panfrost_batch_add_bo(batch, rsrc->bo,
1261 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1262 panfrost_bo_access_for_stage(stage));
1263
1264 panfrost_batch_add_bo(batch, view->bo,
1265 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1266 panfrost_bo_access_for_stage(stage));
1267 }
1268
1269 postfix->textures = T.gpu;
1270 } else {
1271 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1272
1273 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1274 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1275
1276 panfrost_update_sampler_view(view, &ctx->base);
1277
1278 trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1279 }
1280
1281 postfix->textures = panfrost_pool_upload_aligned(&batch->pool,
1282 trampolines,
1283 sizeof(uint64_t) *
1284 ctx->sampler_view_count[stage],
1285 sizeof(uint64_t));
1286 }
1287 }
1288
1289 void
1290 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1291 enum pipe_shader_type stage,
1292 struct mali_vertex_tiler_postfix *postfix)
1293 {
1294 struct panfrost_context *ctx = batch->ctx;
1295
1296 if (!ctx->sampler_count[stage])
1297 return;
1298
1299 size_t desc_size = MALI_BIFROST_SAMPLER_LENGTH;
1300 assert(MALI_BIFROST_SAMPLER_LENGTH == MALI_MIDGARD_SAMPLER_LENGTH);
1301
1302 size_t sz = desc_size * ctx->sampler_count[stage];
1303 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool, sz, desc_size);
1304 struct mali_midgard_sampler_packed *out = (struct mali_midgard_sampler_packed *) T.cpu;
1305
1306 for (unsigned i = 0; i < ctx->sampler_count[stage]; ++i)
1307 out[i] = ctx->samplers[stage][i]->hw;
1308
1309 postfix->sampler_descriptor = T.gpu;
1310 }
1311
1312 void
1313 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1314 struct mali_vertex_tiler_postfix *vertex_postfix)
1315 {
1316 struct panfrost_context *ctx = batch->ctx;
1317 struct panfrost_vertex_state *so = ctx->vertex;
1318 struct panfrost_shader_state *vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1319
1320 unsigned instance_shift = vertex_postfix->instance_shift;
1321 unsigned instance_odd = vertex_postfix->instance_odd;
1322
1323 /* Worst case: everything is NPOT, which is only possible if instancing
1324 * is enabled. Otherwise a single record is guaranteed */
1325 bool could_npot = instance_shift || instance_odd;
1326
1327 struct panfrost_transfer S = panfrost_pool_alloc_aligned(&batch->pool,
1328 MALI_ATTRIBUTE_BUFFER_LENGTH * vs->attribute_count *
1329 (could_npot ? 2 : 1),
1330 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
1331
1332 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1333 MALI_ATTRIBUTE_LENGTH * vs->attribute_count,
1334 MALI_ATTRIBUTE_LENGTH);
1335
1336 struct mali_attribute_buffer_packed *bufs =
1337 (struct mali_attribute_buffer_packed *) S.cpu;
1338
1339 struct mali_attribute_packed *out =
1340 (struct mali_attribute_packed *) T.cpu;
1341
1342 unsigned attrib_to_buffer[PIPE_MAX_ATTRIBS] = { 0 };
1343 unsigned k = 0;
1344
1345 for (unsigned i = 0; i < so->num_elements; ++i) {
1346 /* We map buffers 1:1 with the attributes, which
1347 * means duplicating some vertex buffers (who cares? aside from
1348 * maybe some caching implications but I somehow doubt that
1349 * matters) */
1350
1351 struct pipe_vertex_element *elem = &so->pipe[i];
1352 unsigned vbi = elem->vertex_buffer_index;
1353 attrib_to_buffer[i] = k;
1354
1355 if (!(ctx->vb_mask & (1 << vbi)))
1356 continue;
1357
1358 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1359 struct panfrost_resource *rsrc;
1360
1361 rsrc = pan_resource(buf->buffer.resource);
1362 if (!rsrc)
1363 continue;
1364
1365 /* Add a dependency of the batch on the vertex buffer */
1366 panfrost_batch_add_bo(batch, rsrc->bo,
1367 PAN_BO_ACCESS_SHARED |
1368 PAN_BO_ACCESS_READ |
1369 PAN_BO_ACCESS_VERTEX_TILER);
1370
1371 /* Mask off lower bits, see offset fixup below */
1372 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1373 mali_ptr addr = raw_addr & ~63;
1374
1375 /* Since we advanced the base pointer, we shrink the buffer
1376 * size, but add the offset we subtracted */
1377 unsigned size = rsrc->base.width0 + (raw_addr - addr)
1378 - buf->buffer_offset;
1379
1380 /* When there is a divisor, the hardware-level divisor is
1381 * the product of the instance divisor and the padded count */
1382 unsigned divisor = elem->instance_divisor;
1383 unsigned hw_divisor = ctx->padded_count * divisor;
1384 unsigned stride = buf->stride;
1385
1386 /* If there's a divisor (=1) but no instancing, we want every
1387 * attribute to be the same */
1388
1389 if (divisor && ctx->instance_count == 1)
1390 stride = 0;
1391
1392 if (!divisor || ctx->instance_count <= 1) {
1393 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1394 if (ctx->instance_count > 1)
1395 cfg.type = MALI_ATTRIBUTE_TYPE_1D_MODULUS;
1396
1397 cfg.pointer = addr;
1398 cfg.stride = stride;
1399 cfg.size = size;
1400 cfg.divisor_r = instance_shift;
1401 cfg.divisor_p = instance_odd;
1402 }
1403 } else if (util_is_power_of_two_or_zero(hw_divisor)) {
1404 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1405 cfg.type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
1406 cfg.pointer = addr;
1407 cfg.stride = stride;
1408 cfg.size = size;
1409 cfg.divisor_r = __builtin_ctz(hw_divisor);
1410 }
1411
1412 } else {
1413 unsigned shift = 0, extra_flags = 0;
1414
1415 unsigned magic_divisor =
1416 panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags);
1417
1418 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1419 cfg.type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
1420 cfg.pointer = addr;
1421 cfg.stride = stride;
1422 cfg.size = size;
1423
1424 cfg.divisor_r = shift;
1425 cfg.divisor_e = extra_flags;
1426 }
1427
1428 pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) {
1429 cfg.divisor_numerator = magic_divisor;
1430 cfg.divisor = divisor;
1431 }
1432
1433 ++k;
1434 }
1435
1436 ++k;
1437 }
1438
1439 /* Add special gl_VertexID/gl_InstanceID buffers */
1440
1441 if (unlikely(vs->attribute_count >= PAN_VERTEX_ID)) {
1442 panfrost_vertex_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1443
1444 pan_pack(out + PAN_VERTEX_ID, ATTRIBUTE, cfg) {
1445 cfg.buffer_index = k++;
1446 cfg.format = so->formats[PAN_VERTEX_ID];
1447 }
1448
1449 panfrost_instance_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1450
1451 pan_pack(out + PAN_INSTANCE_ID, ATTRIBUTE, cfg) {
1452 cfg.buffer_index = k++;
1453 cfg.format = so->formats[PAN_INSTANCE_ID];
1454 }
1455 }
1456
1457 /* Attribute addresses require 64-byte alignment, so let:
1458 *
1459 * base' = base & ~63 = base - (base & 63)
1460 * offset' = offset + (base & 63)
1461 *
1462 * Since base' + offset' = base + offset, these are equivalent
1463 * addressing modes and now base is 64 aligned.
1464 */
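/*
 * For example (assuming a 4k-aligned BO), base = bo->gpu + buffer_offset =
 * 0x10064 gives base & 63 = 0x24, so base' = 0x10040 and the 0x24 is folded
 * into the attribute record's src_offset in the loop below; base' + offset'
 * is unchanged.
 */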
1465
1466 unsigned start = vertex_postfix->offset_start;
1467
1468 for (unsigned i = 0; i < so->num_elements; ++i) {
1469 unsigned vbi = so->pipe[i].vertex_buffer_index;
1470 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1471
1472 /* Adjust by the masked off bits of the offset. Make sure we
1473 * read src_offset from so->hw (which is not GPU visible)
1474 * rather than target (which is) due to caching effects */
1475
1476 unsigned src_offset = so->pipe[i].src_offset;
1477
1478 /* BOs aligned to 4k so guaranteed aligned to 64 */
1479 src_offset += (buf->buffer_offset & 63);
1480
1481 /* Also, somewhat obscurely, per-instance data needs to be
1482 * offset in response to a delayed start in an indexed draw */
1483
1484 if (so->pipe[i].instance_divisor && ctx->instance_count > 1 && start)
1485 src_offset -= buf->stride * start;
1486
1487 pan_pack(out + i, ATTRIBUTE, cfg) {
1488 cfg.buffer_index = attrib_to_buffer[i];
1489 cfg.format = so->formats[i];
1490 cfg.offset = src_offset;
1491 }
1492 }
1493
1494 vertex_postfix->attributes = S.gpu;
1495 vertex_postfix->attribute_meta = T.gpu;
1496 }
1497
1498 static mali_ptr
1499 panfrost_emit_varyings(struct panfrost_batch *batch,
1500 struct mali_attribute_buffer_packed *slot,
1501 unsigned stride, unsigned count)
1502 {
1503 unsigned size = stride * count;
1504 mali_ptr ptr = panfrost_pool_alloc_aligned(&batch->invisible_pool, size, 64).gpu;
1505
1506 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1507 cfg.stride = stride;
1508 cfg.size = size;
1509 cfg.pointer = ptr;
1510 }
1511
1512 return ptr;
1513 }
1514
1515 static unsigned
1516 panfrost_streamout_offset(unsigned stride, unsigned offset,
1517 struct pipe_stream_output_target *target)
1518 {
1519 return (target->buffer_offset + (offset * stride * 4)) & 63;
1520 }
1521
1522 static void
1523 panfrost_emit_streamout(struct panfrost_batch *batch,
1524 struct mali_attribute_buffer_packed *slot,
1525 unsigned stride_words, unsigned offset, unsigned count,
1526 struct pipe_stream_output_target *target)
1527 {
1528 unsigned stride = stride_words * 4;
1529 unsigned max_size = target->buffer_size;
1530 unsigned expected_size = stride * count;
1531
1532 /* Grab the BO and bind it to the batch */
1533 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1534
1535 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1536 * the perspective of the TILER and FRAGMENT.
1537 */
1538 panfrost_batch_add_bo(batch, bo,
1539 PAN_BO_ACCESS_SHARED |
1540 PAN_BO_ACCESS_RW |
1541 PAN_BO_ACCESS_VERTEX_TILER |
1542 PAN_BO_ACCESS_FRAGMENT);
1543
1544 /* We will have an offset applied to get alignment */
1545 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * stride);
1546
1547 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1548 cfg.pointer = (addr & ~63);
1549 cfg.stride = stride;
1550 cfg.size = MIN2(max_size, expected_size) + (addr & 63);
1551 }
1552 }
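/*
 * Worked example, assuming the BO itself is 64-byte aligned: with
 * buffer_offset = 100 and offset = 0, addr & 63 = 36, so the record above
 * points 36 bytes before the real start and grows cfg.size by 36, while
 * panfrost_streamout_offset() returns (100 + 0) & 63 = 36 so the varying
 * record's offset lands back on the intended byte.
 */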
1553
1554 static bool
1555 has_point_coord(unsigned mask, gl_varying_slot loc)
1556 {
1557 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1558 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1559 else if (loc == VARYING_SLOT_PNTC)
1560 return (mask & (1 << 8));
1561 else
1562 return false;
1563 }
1564
1565 /* Helpers for manipulating stream out information so we can pack varyings
1566 * accordingly. Compute the src_offset for a given captured varying */
1567
1568 static struct pipe_stream_output *
1569 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1570 {
1571 for (unsigned i = 0; i < info->num_outputs; ++i) {
1572 if (info->output[i].register_index == loc)
1573 return &info->output[i];
1574 }
1575
1576 unreachable("Varying not captured");
1577 }
1578
1579 static unsigned
1580 pan_varying_size(enum mali_format fmt)
1581 {
1582 unsigned type = MALI_EXTRACT_TYPE(fmt);
1583 unsigned chan = MALI_EXTRACT_CHANNELS(fmt);
1584 unsigned bits = MALI_EXTRACT_BITS(fmt);
1585 unsigned bpc = 0;
1586
1587 if (bits == MALI_CHANNEL_FLOAT) {
1588 /* No doubles */
1589 bool fp16 = (type == MALI_FORMAT_SINT);
1590 assert(fp16 || (type == MALI_FORMAT_UNORM));
1591
1592 bpc = fp16 ? 2 : 4;
1593 } else {
1594 assert(type >= MALI_FORMAT_SNORM && type <= MALI_FORMAT_SINT);
1595
1596 /* See the enums */
1597 bits = 1 << bits;
1598 assert(bits >= 8);
1599 bpc = bits / 8;
1600 }
1601
1602 return bpc * chan;
1603 }
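/*
 * In other words, this returns the per-vertex footprint: e.g. an fp16 vec2
 * varying comes out as 2 * 2 = 4 bytes, while a 32-bit four-channel varying
 * comes out as 4 * 4 = 16 bytes.
 */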
1604
1605 /* Indices for named (non-XFB) varyings that are present. These are packed
1606 * tightly so they correspond to a bitfield present (P) indexed by (1 <<
1607 * PAN_VARY_*). This has the nice property that you can look up the buffer index
1608 * of a given special field given a shift S by:
1609 *
1610 * idx = popcount(P & ((1 << S) - 1))
1611 *
1612 * That is... look at all of the varyings that come earlier and count them; that
1613 * count is the new index. Likewise, the total number of special
1614 * buffers required is simply popcount(P)
1615 */
1616
1617 enum pan_special_varying {
1618 PAN_VARY_GENERAL = 0,
1619 PAN_VARY_POSITION = 1,
1620 PAN_VARY_PSIZ = 2,
1621 PAN_VARY_PNTCOORD = 3,
1622 PAN_VARY_FACE = 4,
1623 PAN_VARY_FRAGCOORD = 5,
1624
1625 /* Keep last */
1626 PAN_VARY_MAX,
1627 };
1628
1629 /* Given a varying, figure out which index it corresponds to */
1630
1631 static inline unsigned
1632 pan_varying_index(unsigned present, enum pan_special_varying v)
1633 {
1634 unsigned mask = (1 << v) - 1;
1635 return util_bitcount(present & mask);
1636 }
1637
1638 /* Get the base offset for XFB buffers, which by convention come after
1639 * everything else. Wrapper function for semantic reasons; by construction this
1640 * is just popcount. */
1641
1642 static inline unsigned
1643 pan_xfb_base(unsigned present)
1644 {
1645 return util_bitcount(present);
1646 }
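/*
 * For example, if only the general, position and point size buffers are
 * present then present = 0b111: pan_varying_index(present, PAN_VARY_PSIZ) =
 * popcount(0b011) = 2, so PSIZ is the third buffer, and XFB buffers start at
 * pan_xfb_base(present) = popcount(0b111) = 3.
 */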
1647
1648 /* Computes the present mask for varyings so we can start emitting varying records */
1649
1650 static inline unsigned
1651 pan_varying_present(
1652 struct panfrost_shader_state *vs,
1653 struct panfrost_shader_state *fs,
1654 unsigned quirks)
1655 {
1656 /* At the moment we always emit general and position buffers. Not
1657 * strictly necessary but usually harmless */
1658
1659 unsigned present = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION);
1660
1661 /* Enable special buffers by the shader info */
1662
1663 if (vs->writes_point_size)
1664 present |= (1 << PAN_VARY_PSIZ);
1665
1666 if (fs->reads_point_coord)
1667 present |= (1 << PAN_VARY_PNTCOORD);
1668
1669 if (fs->reads_face)
1670 present |= (1 << PAN_VARY_FACE);
1671
1672 if (fs->reads_frag_coord && !(quirks & IS_BIFROST))
1673 present |= (1 << PAN_VARY_FRAGCOORD);
1674
1675 /* Also, if we have a point sprite, we need a point coord buffer */
1676
1677 for (unsigned i = 0; i < fs->varying_count; i++) {
1678 gl_varying_slot loc = fs->varyings_loc[i];
1679
1680 if (has_point_coord(fs->point_sprite_mask, loc))
1681 present |= (1 << PAN_VARY_PNTCOORD);
1682 }
1683
1684 return present;
1685 }
1686
1687 /* Emitters for varying records */
1688
1689 static void
1690 pan_emit_vary(struct mali_attribute_packed *out,
1691 unsigned present, enum pan_special_varying buf,
1692 unsigned quirks, enum mali_format format,
1693 unsigned offset)
1694 {
1695 unsigned nr_channels = MALI_EXTRACT_CHANNELS(format);
1696 unsigned swizzle = quirks & HAS_SWIZZLES ?
1697 panfrost_get_default_swizzle(nr_channels) :
1698 panfrost_bifrost_swizzle(nr_channels);
1699
1700 pan_pack(out, ATTRIBUTE, cfg) {
1701 cfg.buffer_index = pan_varying_index(present, buf);
1702 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1703 cfg.format = (format << 12) | swizzle;
1704 cfg.offset = offset;
1705 }
1706 }
1707
1708 /* General varying that is unused */
1709
1710 static void
1711 pan_emit_vary_only(struct mali_attribute_packed *out,
1712 unsigned present, unsigned quirks)
1713 {
1714 pan_emit_vary(out, present, 0, quirks, MALI_VARYING_DISCARD, 0);
1715 }
1716
1717 /* Special records */
1718
1719 static const enum mali_format pan_varying_formats[PAN_VARY_MAX] = {
1720 [PAN_VARY_POSITION] = MALI_VARYING_POS,
1721 [PAN_VARY_PSIZ] = MALI_R16F,
1722 [PAN_VARY_PNTCOORD] = MALI_R16F,
1723 [PAN_VARY_FACE] = MALI_R32I,
1724 [PAN_VARY_FRAGCOORD] = MALI_RGBA32F
1725 };
1726
1727 static void
1728 pan_emit_vary_special(struct mali_attribute_packed *out,
1729 unsigned present, enum pan_special_varying buf,
1730 unsigned quirks)
1731 {
1732 assert(buf < PAN_VARY_MAX);
1733 pan_emit_vary(out, present, buf, quirks, pan_varying_formats[buf], 0);
1734 }
1735
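/* Promote a varying's format for transform feedback capture: float formats
 * are captured as 32-bit floats, other base types keep their type but are
 * widened to 32 bits per channel, and the channel count is taken from the
 * stream output info */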
1736 static enum mali_format
1737 pan_xfb_format(enum mali_format format, unsigned nr)
1738 {
1739 if (MALI_EXTRACT_BITS(format) == MALI_CHANNEL_FLOAT)
1740 return MALI_R32F | MALI_NR_CHANNELS(nr);
1741 else
1742 return MALI_EXTRACT_TYPE(format) | MALI_NR_CHANNELS(nr) | MALI_CHANNEL_32;
1743 }
1744
1745 /* Transform feedback records. Note struct pipe_stream_output is (if packed as
1746 * a bitfield) 32-bit, smaller than a 64-bit pointer, so may as well pass by
1747 * value. */
1748
1749 static void
1750 pan_emit_vary_xfb(struct mali_attribute_packed *out,
1751 unsigned present,
1752 unsigned max_xfb,
1753 unsigned *streamout_offsets,
1754 unsigned quirks,
1755 enum mali_format format,
1756 struct pipe_stream_output o)
1757 {
1758 unsigned swizzle = quirks & HAS_SWIZZLES ?
1759 panfrost_get_default_swizzle(o.num_components) :
1760 panfrost_bifrost_swizzle(o.num_components);
1761
1762 pan_pack(out, ATTRIBUTE, cfg) {
1763 /* XFB buffers come after everything else */
1764 cfg.buffer_index = pan_xfb_base(present) + o.output_buffer;
1765 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1766
1767 /* Override number of channels and precision to highp */
1768 cfg.format = (pan_xfb_format(format, o.num_components) << 12) | swizzle;
1769
1770 /* Add the destination offset within the vertex (in dwords) to the buffer's streamout offset */
1771 cfg.offset = (o.dst_offset * 4) /* dwords */
1772 + streamout_offsets[o.output_buffer];
1773 }
1774 }
1775
1776 /* Determine if we should capture a varying for XFB. This requires actually
1777 * having a buffer for it. If we don't capture it, we'll fall back to a general
1778 * varying path (linked or unlinked, possibly discarding the write) */
1779
1780 static bool
1781 panfrost_xfb_captured(struct panfrost_shader_state *xfb,
1782 unsigned loc, unsigned max_xfb)
1783 {
1784 if (!(xfb->so_mask & (1ull << loc)))
1785 return false;
1786
1787 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1788 return o->output_buffer < max_xfb;
1789 }
1790
1791 static void
1792 pan_emit_general_varying(struct mali_attribute_packed *out,
1793 struct panfrost_shader_state *other,
1794 struct panfrost_shader_state *xfb,
1795 gl_varying_slot loc,
1796 enum mali_format format,
1797 unsigned present,
1798 unsigned quirks,
1799 unsigned *gen_offsets,
1800 enum mali_format *gen_formats,
1801 unsigned *gen_stride,
1802 unsigned idx,
1803 bool should_alloc)
1804 {
1805 /* Check if we're linked */
1806 signed other_idx = -1;
1807
1808 for (unsigned j = 0; j < other->varying_count; ++j) {
1809 if (other->varyings_loc[j] == loc) {
1810 other_idx = j;
1811 break;
1812 }
1813 }
1814
1815 if (other_idx < 0) {
1816 pan_emit_vary_only(out, present, quirks);
1817 return;
1818 }
1819
1820 unsigned offset = gen_offsets[other_idx];
1821
1822 if (should_alloc) {
1823 /* We're linked, so allocate space via a watermark allocation */
1824 enum mali_format alt = other->varyings[other_idx];
1825
1826 /* Do interpolation at minimum precision */
1827 unsigned size_main = pan_varying_size(format);
1828 unsigned size_alt = pan_varying_size(alt);
1829 unsigned size = MIN2(size_main, size_alt);
1830
1831 /* If a varying is marked for XFB but not actually captured, we
1832 * should match the format to the format that would otherwise
1833 * be used for XFB, since dEQP checks for invariance here. It's
1834 * unclear if this is required by the spec. */
1835
1836 if (xfb->so_mask & (1ull << loc)) {
1837 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1838 format = pan_xfb_format(format, o->num_components);
1839 size = pan_varying_size(format);
1840 } else if (size == size_alt) {
1841 format = alt;
1842 }
1843
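/* Note the indexing convention: gen_offsets[] is indexed by the allocating
 * (vertex) stage's varying index, while gen_formats[] is indexed by the
 * consuming stage's index, matching the lookups made on the !should_alloc
 * (fragment) pass */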
1844 gen_offsets[idx] = *gen_stride;
1845 gen_formats[other_idx] = format;
1846 offset = *gen_stride;
1847 *gen_stride += size;
1848 }
1849
1850 pan_emit_vary(out, present, PAN_VARY_GENERAL, quirks, format, offset);
1851 }
1852
1853 /* Higher-level wrapper around all of the above, classifying a varying into one
1854 * of the above types */
1855
1856 static void
1857 panfrost_emit_varying(
1858 struct mali_attribute_packed *out,
1859 struct panfrost_shader_state *stage,
1860 struct panfrost_shader_state *other,
1861 struct panfrost_shader_state *xfb,
1862 unsigned present,
1863 unsigned max_xfb,
1864 unsigned *streamout_offsets,
1865 unsigned quirks,
1866 unsigned *gen_offsets,
1867 enum mali_format *gen_formats,
1868 unsigned *gen_stride,
1869 unsigned idx,
1870 bool should_alloc,
1871 bool is_fragment)
1872 {
1873 gl_varying_slot loc = stage->varyings_loc[idx];
1874 enum mali_format format = stage->varyings[idx];
1875
1876 /* Override format to match linkage */
1877 if (!should_alloc && gen_formats[idx])
1878 format = gen_formats[idx];
1879
1880 if (has_point_coord(stage->point_sprite_mask, loc)) {
1881 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1882 } else if (panfrost_xfb_captured(xfb, loc, max_xfb)) {
1883 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1884 pan_emit_vary_xfb(out, present, max_xfb, streamout_offsets, quirks, format, *o);
1885 } else if (loc == VARYING_SLOT_POS) {
1886 if (is_fragment)
1887 pan_emit_vary_special(out, present, PAN_VARY_FRAGCOORD, quirks);
1888 else
1889 pan_emit_vary_special(out, present, PAN_VARY_POSITION, quirks);
1890 } else if (loc == VARYING_SLOT_PSIZ) {
1891 pan_emit_vary_special(out, present, PAN_VARY_PSIZ, quirks);
1892 } else if (loc == VARYING_SLOT_PNTC) {
1893 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1894 } else if (loc == VARYING_SLOT_FACE) {
1895 pan_emit_vary_special(out, present, PAN_VARY_FACE, quirks);
1896 } else {
1897 pan_emit_general_varying(out, other, xfb, loc, format, present,
1898 quirks, gen_offsets, gen_formats, gen_stride,
1899 idx, should_alloc);
1900 }
1901 }
1902
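/* If the given special varying is present, emit its attribute buffer record.
 * Rather than pointing at memory, the record is tagged with a hardware
 * "special" selector (point coord, front facing, fragment coord) */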
1903 static void
1904 pan_emit_special_input(struct mali_attribute_buffer_packed *out,
1905 unsigned present,
1906 enum pan_special_varying v,
1907 unsigned special)
1908 {
1909 if (present & (1 << v)) {
1910 unsigned idx = pan_varying_index(present, v);
1911
1912 pan_pack(out + idx, ATTRIBUTE_BUFFER, cfg) {
1913 cfg.special = special;
1914 cfg.type = 0;
1915 }
1916 }
1917 }
1918
1919 void
1920 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1921 unsigned vertex_count,
1922 struct mali_vertex_tiler_postfix *vertex_postfix,
1923 struct mali_vertex_tiler_postfix *tiler_postfix,
1924 union midgard_primitive_size *primitive_size)
1925 {
1926 /* Load the shaders */
1927 struct panfrost_context *ctx = batch->ctx;
1928 struct panfrost_device *dev = pan_device(ctx->base.screen);
1929 struct panfrost_shader_state *vs, *fs;
1930 size_t vs_size, fs_size;
1931
1932 /* Allocate the varying descriptor */
1933
1934 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1935 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1936 vs_size = MALI_ATTRIBUTE_LENGTH * vs->varying_count;
1937 fs_size = MALI_ATTRIBUTE_LENGTH * fs->varying_count;
1938
1939 struct panfrost_transfer trans = panfrost_pool_alloc_aligned(
1940 &batch->pool, vs_size + fs_size, MALI_ATTRIBUTE_LENGTH);
1941
1942 struct pipe_stream_output_info *so = &vs->stream_output;
1943 unsigned present = pan_varying_present(vs, fs, dev->quirks);
1944
1945 /* Check if this varying is linked by us. This is the case for
1946 * general-purpose, non-captured varyings. If it is, link it. If it's
1947 * not, use the provided stream out information to determine the
1948 * offset, since it was already linked for us. */
1949
1950 unsigned gen_offsets[32];
1951 enum mali_format gen_formats[32];
1952 memset(gen_offsets, 0, sizeof(gen_offsets));
1953 memset(gen_formats, 0, sizeof(gen_formats));
1954
1955 unsigned gen_stride = 0;
1956 assert(vs->varying_count < ARRAY_SIZE(gen_offsets));
1957 assert(fs->varying_count < ARRAY_SIZE(gen_offsets));
1958
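/* Precompute the current write offset of each bound streamout target; these
 * are shared by the XFB varying records and the streamout buffer records
 * emitted below */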
1959 unsigned streamout_offsets[32];
1960
1961 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1962 streamout_offsets[i] = panfrost_streamout_offset(
1963 so->stride[i],
1964 ctx->streamout.offsets[i],
1965 ctx->streamout.targets[i]);
1966 }
1967
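/* Varying records are packed with the vertex shader's entries first, followed
 * immediately by the fragment shader's (mirrored by varying_meta below) */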
1968 struct mali_attribute_packed *ovs = (struct mali_attribute_packed *)trans.cpu;
1969 struct mali_attribute_packed *ofs = ovs + vs->varying_count;
1970
1971 for (unsigned i = 0; i < vs->varying_count; i++) {
1972 panfrost_emit_varying(ovs + i, vs, fs, vs, present,
1973 ctx->streamout.num_targets, streamout_offsets,
1974 dev->quirks,
1975 gen_offsets, gen_formats, &gen_stride, i, true, false);
1976 }
1977
1978 for (unsigned i = 0; i < fs->varying_count; i++) {
1979 panfrost_emit_varying(ofs + i, fs, vs, vs, present,
1980 ctx->streamout.num_targets, streamout_offsets,
1981 dev->quirks,
1982 gen_offsets, gen_formats, &gen_stride, i, false, true);
1983 }
1984
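/* Allocate the attribute buffer records: the general/special buffers occupy
 * the first xfb_base records, with one record per streamout target after
 * them */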
1985 unsigned xfb_base = pan_xfb_base(present);
1986 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1987 MALI_ATTRIBUTE_BUFFER_LENGTH * (xfb_base + ctx->streamout.num_targets),
1988 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
1989 struct mali_attribute_buffer_packed *varyings =
1990 (struct mali_attribute_buffer_packed *) T.cpu;
1991
1992 /* Emit the stream out buffers */
1993
1994 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1995 ctx->vertex_count);
1996
1997 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1998 panfrost_emit_streamout(batch, &varyings[xfb_base + i],
1999 so->stride[i],
2000 ctx->streamout.offsets[i],
2001 out_count,
2002 ctx->streamout.targets[i]);
2003 }
2004
2005 panfrost_emit_varyings(batch,
2006 &varyings[pan_varying_index(present, PAN_VARY_GENERAL)],
2007 gen_stride, vertex_count);
2008
2009 /* fp32 vec4 gl_Position */
2010 tiler_postfix->position_varying = panfrost_emit_varyings(batch,
2011 &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
2012 sizeof(float) * 4, vertex_count);
2013
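/* fp16 gl_PointSize, passed to the tiler as the per-vertex primitive size */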
2014 if (present & (1 << PAN_VARY_PSIZ)) {
2015 primitive_size->pointer = panfrost_emit_varyings(batch,
2016 &varyings[pan_varying_index(present, PAN_VARY_PSIZ)],
2017 2, vertex_count);
2018 }
2019
2020 pan_emit_special_input(varyings, present, PAN_VARY_PNTCOORD, MALI_ATTRIBUTE_SPECIAL_POINT_COORD);
2021 pan_emit_special_input(varyings, present, PAN_VARY_FACE, MALI_ATTRIBUTE_SPECIAL_FRONT_FACING);
2022 pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD, MALI_ATTRIBUTE_SPECIAL_FRAG_COORD);
2023
2024 vertex_postfix->varyings = T.gpu;
2025 tiler_postfix->varyings = T.gpu;
2026
2027 vertex_postfix->varying_meta = trans.gpu;
2028 tiler_postfix->varying_meta = trans.gpu + vs_size;
2029 }
2030
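/* Assemble the final vertex/tiler payloads from the prefix/postfix structures
 * built above (choosing the Bifrost or Midgard layout) and enqueue the vertex
 * job, followed by the dependent tiler job unless rasterizer discard or the
 * wallpaper hack dictates otherwise */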
2031 void
2032 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
2033 struct mali_vertex_tiler_prefix *vertex_prefix,
2034 struct mali_vertex_tiler_postfix *vertex_postfix,
2035 struct mali_vertex_tiler_prefix *tiler_prefix,
2036 struct mali_vertex_tiler_postfix *tiler_postfix,
2037 union midgard_primitive_size *primitive_size)
2038 {
2039 struct panfrost_context *ctx = batch->ctx;
2040 struct panfrost_device *device = pan_device(ctx->base.screen);
2041 bool wallpapering = ctx->wallpaper_batch && batch->scoreboard.tiler_dep;
2042 struct bifrost_payload_vertex bifrost_vertex = {0,};
2043 struct bifrost_payload_tiler bifrost_tiler = {0,};
2044 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
2045 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
2046 void *vp, *tp;
2047 size_t vp_size, tp_size;
2048
2049 if (device->quirks & IS_BIFROST) {
2050 bifrost_vertex.prefix = *vertex_prefix;
2051 bifrost_vertex.postfix = *vertex_postfix;
2052 vp = &bifrost_vertex;
2053 vp_size = sizeof(bifrost_vertex);
2054
2055 bifrost_tiler.prefix = *tiler_prefix;
2056 bifrost_tiler.tiler.primitive_size = *primitive_size;
2057 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
2058 bifrost_tiler.postfix = *tiler_postfix;
2059 tp = &bifrost_tiler;
2060 tp_size = sizeof(bifrost_tiler);
2061 } else {
2062 midgard_vertex.prefix = *vertex_prefix;
2063 midgard_vertex.postfix = *vertex_postfix;
2064 vp = &midgard_vertex;
2065 vp_size = sizeof(midgard_vertex);
2066
2067 midgard_tiler.prefix = *tiler_prefix;
2068 midgard_tiler.postfix = *tiler_postfix;
2069 midgard_tiler.primitive_size = *primitive_size;
2070 tp = &midgard_tiler;
2071 tp_size = sizeof(midgard_tiler);
2072 }
2073
2074 if (wallpapering) {
2075 /* Inject in reverse order, with "predicted" job indices.
2076 * THIS IS A HACK XXX */
2077 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false,
2078 batch->scoreboard.job_index + 2, tp, tp_size, true);
2079 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2080 vp, vp_size, true);
2081 return;
2082 }
2083
2084 /* If rasterizer discard is enabled, only submit the vertex job */
2085
2086 unsigned vertex = panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2087 vp, vp_size, false);
2088
2089 if (ctx->rasterizer->base.rasterizer_discard)
2090 return;
2091
2092 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false, vertex, tp, tp_size,
2093 false);
2094 }
2095
2096 /* TODO: stop hardcoding this */
2097 mali_ptr
2098 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2099 {
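/* Hardcoded sample position table (see TODO above). The entries appear to be
 * (x, y) pairs, presumably in 1/256ths of a pixel so that (128, 128) is the
 * pixel centre */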
2100 uint16_t locations[] = {
2101 128, 128,
2102 0, 256,
2103 0, 256,
2104 0, 256,
2105 0, 256,
2106 0, 256,
2107 0, 256,
2108 0, 256,
2109 0, 256,
2110 0, 256,
2111 0, 256,
2112 0, 256,
2113 0, 256,
2114 0, 256,
2115 0, 256,
2116 0, 256,
2117 0, 256,
2118 0, 256,
2119 0, 256,
2120 0, 256,
2121 0, 256,
2122 0, 256,
2123 0, 256,
2124 0, 256,
2125 0, 256,
2126 0, 256,
2127 0, 256,
2128 0, 256,
2129 0, 256,
2130 0, 256,
2131 0, 256,
2132 0, 256,
2133 128, 128,
2134 0, 0,
2135 0, 0,
2136 0, 0,
2137 0, 0,
2138 0, 0,
2139 0, 0,
2140 0, 0,
2141 0, 0,
2142 0, 0,
2143 0, 0,
2144 0, 0,
2145 0, 0,
2146 0, 0,
2147 0, 0,
2148 0, 0,
2149 };
2150
2151 return panfrost_pool_upload_aligned(&batch->pool, locations, sizeof(locations), 64);
2152 }