panfrost: Pack compute Midgard properties
[mesa.git] / src / gallium / drivers / panfrost / pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_pool.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 struct mali_shared_memory shared = {
62 .shared_workgroup_count = ~0,
63 };
64
65 if (batch->stack_size) {
66 struct panfrost_bo *stack =
67 panfrost_batch_get_scratchpad(batch, batch->stack_size,
68 dev->thread_tls_alloc,
69 dev->core_count);
70
71 shared.stack_shift = panfrost_get_stack_shift(batch->stack_size);
72 shared.scratchpad = stack->gpu;
73 }
74
75 postfix->shared_memory = panfrost_pool_upload_aligned(&batch->pool, &shared, sizeof(shared), 64);
76 }
77
78 static void
79 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
80 struct mali_vertex_tiler_postfix *postfix)
81 {
82 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
83 postfix->shared_memory = panfrost_batch_reserve_framebuffer(batch);
84 }
85
86 static void
87 panfrost_vt_update_rasterizer(struct panfrost_rasterizer *rasterizer,
88 struct mali_vertex_tiler_prefix *prefix,
89 struct mali_vertex_tiler_postfix *postfix)
90 {
91 postfix->gl_enables |= 0x7;
92 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
93 rasterizer->base.front_ccw);
94 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
95 (rasterizer->base.cull_face & PIPE_FACE_FRONT));
96 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
97 (rasterizer->base.cull_face & PIPE_FACE_BACK));
98 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
99 rasterizer->base.flatshade_first);
100 }
101
102 void
103 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
104 struct mali_vertex_tiler_prefix *prefix,
105 union midgard_primitive_size *primitive_size)
106 {
107 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
108
109 if (!panfrost_writes_point_size(ctx)) {
110 float val = (prefix->draw_mode == MALI_DRAW_MODE_POINTS) ?
111 rasterizer->base.point_size :
112 rasterizer->base.line_width;
113
114 primitive_size->constant = val;
115 }
116 }
117
118 static void
119 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
120 struct mali_vertex_tiler_postfix *postfix)
121 {
122 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
123 if (ctx->occlusion_query) {
124 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
125 panfrost_batch_add_bo(ctx->batch, ctx->occlusion_query->bo,
126 PAN_BO_ACCESS_SHARED |
127 PAN_BO_ACCESS_RW |
128 PAN_BO_ACCESS_FRAGMENT);
129 } else {
130 postfix->occlusion_counter = 0;
131 }
132 }
133
134 void
135 panfrost_vt_init(struct panfrost_context *ctx,
136 enum pipe_shader_type stage,
137 struct mali_vertex_tiler_prefix *prefix,
138 struct mali_vertex_tiler_postfix *postfix)
139 {
140 struct panfrost_device *device = pan_device(ctx->base.screen);
141
142 if (!ctx->shader[stage])
143 return;
144
145 memset(prefix, 0, sizeof(*prefix));
146 memset(postfix, 0, sizeof(*postfix));
147
148 if (device->quirks & IS_BIFROST) {
149 postfix->gl_enables = 0x2;
150 panfrost_vt_emit_shared_memory(ctx, postfix);
151 } else {
152 postfix->gl_enables = 0x6;
153 panfrost_vt_attach_framebuffer(ctx, postfix);
154 }
155
156 if (stage == PIPE_SHADER_FRAGMENT) {
157 panfrost_vt_update_occlusion_query(ctx, postfix);
158 panfrost_vt_update_rasterizer(ctx->rasterizer, prefix, postfix);
159 }
160 }
161
162 static unsigned
163 panfrost_translate_index_size(unsigned size)
164 {
165 switch (size) {
166 case 1:
167 return MALI_DRAW_INDEXED_UINT8;
168
169 case 2:
170 return MALI_DRAW_INDEXED_UINT16;
171
172 case 4:
173 return MALI_DRAW_INDEXED_UINT32;
174
175 default:
176 unreachable("Invalid index size");
177 }
178 }
179
180 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
181 * good for the duration of the draw (transient), though it could last longer. Also get
182 * the bounds on the index buffer for the range accessed by the draw. We do
183 * these operations together because there are natural optimizations which
184 * require them to be together. */
185
186 static mali_ptr
187 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
188 const struct pipe_draw_info *info,
189 unsigned *min_index, unsigned *max_index)
190 {
191 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
192 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
193 off_t offset = info->start * info->index_size;
194 bool needs_indices = true;
195 mali_ptr out = 0;
196
197 if (info->max_index != ~0u) {
198 *min_index = info->min_index;
199 *max_index = info->max_index;
200 needs_indices = false;
201 }
202
203 if (!info->has_user_indices) {
204 /* Only resources can be directly mapped */
205 panfrost_batch_add_bo(batch, rsrc->bo,
206 PAN_BO_ACCESS_SHARED |
207 PAN_BO_ACCESS_READ |
208 PAN_BO_ACCESS_VERTEX_TILER);
209 out = rsrc->bo->gpu + offset;
210
211 /* Check the cache */
212 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
213 info->start,
214 info->count,
215 min_index,
216 max_index);
217 } else {
218 /* Otherwise, we need to upload to transient memory */
219 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
220 struct panfrost_transfer T =
221 panfrost_pool_alloc_aligned(&batch->pool,
222 info->count * info->index_size,
223 info->index_size);
224
225 memcpy(T.cpu, ibuf8 + offset, info->count * info->index_size);
226 out = T.gpu;
227 }
228
229 if (needs_indices) {
230 /* Fallback */
231 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
232
233 if (!info->has_user_indices)
234 panfrost_minmax_cache_add(rsrc->index_cache,
235 info->start, info->count,
236 *min_index, *max_index);
237 }
238
239 return out;
240 }
241
242 void
243 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
244 const struct pipe_draw_info *info,
245 enum mali_draw_mode draw_mode,
246 struct mali_vertex_tiler_postfix *vertex_postfix,
247 struct mali_vertex_tiler_prefix *tiler_prefix,
248 struct mali_vertex_tiler_postfix *tiler_postfix,
249 unsigned *vertex_count,
250 unsigned *padded_count)
251 {
252 tiler_prefix->draw_mode = draw_mode;
253
254 unsigned draw_flags = 0;
255
256 if (panfrost_writes_point_size(ctx))
257 draw_flags |= MALI_DRAW_VARYING_SIZE;
258
259 if (info->primitive_restart)
260 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
261
262 /* These don't make much sense */
263
264 draw_flags |= 0x3000;
265
266 if (info->index_size) {
267 unsigned min_index = 0, max_index = 0;
268
269 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
270 info,
271 &min_index,
272 &max_index);
273
274 /* Use the corresponding values */
275 *vertex_count = max_index - min_index + 1;
276 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
277 tiler_prefix->offset_bias_correction = -min_index;
278 tiler_prefix->index_count = MALI_POSITIVE(info->count);
279 draw_flags |= panfrost_translate_index_size(info->index_size);
280 } else {
281 tiler_prefix->indices = 0;
282 *vertex_count = ctx->vertex_count;
283 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
284 tiler_prefix->offset_bias_correction = 0;
285 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
286 }
287
288 tiler_prefix->unknown_draw = draw_flags;
289
290 /* Encode the padded vertex count */
291
292 if (info->instance_count > 1) {
293 *padded_count = panfrost_padded_vertex_count(*vertex_count);
294
295 unsigned shift = __builtin_ctz(ctx->padded_count);
296 unsigned k = ctx->padded_count >> (shift + 1);
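/* Worked example (illustrative): a padded count of 20 = 0b10100 decomposes
 * as (2k + 1) << shift, with shift = ctz(20) = 2 and k = 20 >> 3 = 2,
 * i.e. 20 = 5 << 2, an odd factor times a power of two, which is the form
 * the instance_shift/instance_odd fields encode. */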
297
298 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
299 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
300 } else {
301 *padded_count = *vertex_count;
302
303 /* Reset instancing state */
304 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
305 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
306 }
307 }
308
309 static void
310 panfrost_emit_compute_shader(struct panfrost_context *ctx,
311 enum pipe_shader_type st,
312 struct mali_shader_meta *meta)
313 {
314 const struct panfrost_device *dev = pan_device(ctx->base.screen);
315 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
316
317 memset(meta, 0, sizeof(*meta));
318 meta->shader = ss->shader;
319 meta->attribute_count = ss->attribute_count;
320 meta->varying_count = ss->varying_count;
321 meta->texture_count = ctx->sampler_view_count[st];
322 meta->sampler_count = ctx->sampler_count[st];
323
324 if (dev->quirks & IS_BIFROST) {
325 meta->bifrost1.unk1 = 0x800000;
326 meta->bifrost2.preload_regs = 0xC0;
327 meta->bifrost2.uniform_count = ss->uniform_count;
328 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
329 } else {
330 struct mali_midgard_properties_packed prop;
331
332 pan_pack(&prop, MIDGARD_PROPERTIES, cfg) {
333 cfg.uniform_buffer_count = panfrost_ubo_count(ctx, st);
334 cfg.uniform_count = ss->uniform_count;
335 cfg.work_register_count = ss->work_reg_count;
336 cfg.writes_globals = ss->writes_global;
337 cfg.suppress_inf_nan = true; /* XXX */
338 }
339
340 memcpy(&meta->midgard1, &prop, sizeof(prop));
341 }
342 }
343
344 static unsigned
345 translate_tex_wrap(enum pipe_tex_wrap w)
346 {
347 switch (w) {
348 case PIPE_TEX_WRAP_REPEAT: return MALI_WRAP_MODE_REPEAT;
349 case PIPE_TEX_WRAP_CLAMP: return MALI_WRAP_MODE_CLAMP;
350 case PIPE_TEX_WRAP_CLAMP_TO_EDGE: return MALI_WRAP_MODE_CLAMP_TO_EDGE;
351 case PIPE_TEX_WRAP_CLAMP_TO_BORDER: return MALI_WRAP_MODE_CLAMP_TO_BORDER;
352 case PIPE_TEX_WRAP_MIRROR_REPEAT: return MALI_WRAP_MODE_MIRRORED_REPEAT;
353 case PIPE_TEX_WRAP_MIRROR_CLAMP: return MALI_WRAP_MODE_MIRRORED_CLAMP;
354 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE;
355 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER;
356 default: unreachable("Invalid wrap");
357 }
358 }
359
360 /* The hardware compares in the wrong order, so we have to flip before
361 * encoding. Yes, really. */
362
363 static enum mali_func
364 panfrost_sampler_compare_func(const struct pipe_sampler_state *cso)
365 {
366 if (!cso->compare_mode)
367 return MALI_FUNC_NEVER;
368
369 enum mali_func f = panfrost_translate_compare_func(cso->compare_func);
370 return panfrost_flip_compare_func(f);
371 }
372
373 static enum mali_mipmap_mode
374 pan_pipe_to_mipmode(enum pipe_tex_mipfilter f)
375 {
376 switch (f) {
377 case PIPE_TEX_MIPFILTER_NEAREST: return MALI_MIPMAP_MODE_NEAREST;
378 case PIPE_TEX_MIPFILTER_LINEAR: return MALI_MIPMAP_MODE_TRILINEAR;
379 case PIPE_TEX_MIPFILTER_NONE: return MALI_MIPMAP_MODE_NONE;
380 default: unreachable("Invalid");
381 }
382 }
383
384 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
385 struct mali_midgard_sampler_packed *hw)
386 {
387 pan_pack(hw, MIDGARD_SAMPLER, cfg) {
388 cfg.magnify_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
389 cfg.minify_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
390 cfg.mipmap_mode = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR) ?
391 MALI_MIPMAP_MODE_TRILINEAR : MALI_MIPMAP_MODE_NEAREST;
392 cfg.normalized_coordinates = cso->normalized_coords;
393
394 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
395
396 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
397
398 /* If necessary, we disable mipmapping in the sampler descriptor by
399 * clamping the LOD as tight as possible (from 0 to epsilon,
400 * essentially -- remember these are fixed point numbers, so
401 * epsilon=1/256) */
402
403 cfg.maximum_lod = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) ?
404 cfg.minimum_lod + 1 :
405 FIXED_16(cso->max_lod, false);
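/* Illustrative numbers: with min_lod = 2.0 this clamps the LOD range to
 * roughly [2.0, 2.0 + 1/256], so only mip level 2 is ever sampled
 * (assuming the 8 fractional bits implied by the epsilon note above). */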
406
407 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
408 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
409 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
410
411 cfg.compare_function = panfrost_sampler_compare_func(cso);
412 cfg.seamless_cube_map = cso->seamless_cube_map;
413
414 cfg.border_color_r = cso->border_color.f[0];
415 cfg.border_color_g = cso->border_color.f[1];
416 cfg.border_color_b = cso->border_color.f[2];
417 cfg.border_color_a = cso->border_color.f[3];
418 }
419 }
420
421 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
422 struct mali_bifrost_sampler_packed *hw)
423 {
424 pan_pack(hw, BIFROST_SAMPLER, cfg) {
425 cfg.magnify_linear = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR;
426 cfg.minify_linear = cso->min_img_filter == PIPE_TEX_FILTER_LINEAR;
427 cfg.mipmap_mode = pan_pipe_to_mipmode(cso->min_mip_filter);
428 cfg.normalized_coordinates = cso->normalized_coords;
429
430 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
431 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
432 cfg.maximum_lod = FIXED_16(cso->max_lod, false);
433
434 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
435 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
436 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
437
438 cfg.compare_function = panfrost_sampler_compare_func(cso);
439 cfg.seamless_cube_map = cso->seamless_cube_map;
440 }
441 }
442
443 static bool
444 panfrost_fs_required(
445 struct panfrost_shader_state *fs,
446 struct panfrost_blend_final *blend,
447 unsigned rt_count)
448 {
449 /* If we generally have side effects */
450 if (fs->fs_sidefx)
451 return true;
452
453 /* If colour is written we need to execute */
454 for (unsigned i = 0; i < rt_count; ++i) {
455 if (!blend[i].no_colour)
456 return true;
457 }
458
459 /* If depth is written and not implied we need to execute.
460 * TODO: Predicate on Z/S writes being enabled */
461 return (fs->writes_depth || fs->writes_stencil);
462 }
463
464 static void
465 panfrost_emit_blend(struct panfrost_batch *batch, void *rts,
466 struct panfrost_blend_final *blend)
467 {
468 const struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
469 struct panfrost_shader_state *fs = panfrost_get_shader_state(batch->ctx, PIPE_SHADER_FRAGMENT);
470 unsigned rt_count = batch->key.nr_cbufs;
471
472 struct bifrost_blend_rt *brts = rts;
473 struct midgard_blend_rt *mrts = rts;
474
475 /* Disable blending for depth-only on Bifrost */
476
477 if (rt_count == 0 && dev->quirks & IS_BIFROST)
478 brts[0].unk2 = 0x3;
479
480 for (unsigned i = 0; i < rt_count; ++i) {
481 unsigned flags = 0;
482
483 pan_pack(&flags, BLEND_FLAGS, cfg) {
484 if (blend[i].no_colour) {
485 cfg.enable = false;
486 break;
487 }
488
489 batch->draws |= (PIPE_CLEAR_COLOR0 << i);
490
491 cfg.srgb = util_format_is_srgb(batch->key.cbufs[i]->format);
492 cfg.load_destination = blend[i].load_dest;
493 cfg.dither_disable = !batch->ctx->blend->base.dither;
494
495 if (!(dev->quirks & IS_BIFROST))
496 cfg.midgard_blend_shader = blend[i].is_shader;
497 }
498
499 if (dev->quirks & IS_BIFROST) {
500 brts[i].flags = flags;
501
502 if (blend[i].is_shader) {
503 /* The blend shader's address needs to be at
504 * the same top 32 bits as the fragment shader.
505 * TODO: Ensure that's always the case.
506 */
507 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
508 (fs->bo->gpu & (0xffffffffull << 32)));
509 brts[i].shader = blend[i].shader.gpu;
510 brts[i].unk2 = 0x0;
511 } else {
512 enum pipe_format format = batch->key.cbufs[i]->format;
513 const struct util_format_description *format_desc;
514 format_desc = util_format_description(format);
515
516 brts[i].equation = blend[i].equation.equation;
517
518 /* TODO: this is a bit more complicated */
519 brts[i].constant = blend[i].equation.constant;
520
521 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
522
523 /* 0x19 disables blending and forces REPLACE
524 * mode (equivalent to rgb_mode = alpha_mode =
525 * 0x122, colour mask = 0xF). 0x1a allows
526 * blending. */
527 brts[i].unk2 = blend[i].opaque ? 0x19 : 0x1a;
528
529 brts[i].shader_type = fs->blend_types[i];
530 }
531 } else {
532 memcpy(&mrts[i].flags, &flags, sizeof(flags));
533
534 if (blend[i].is_shader) {
535 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
536 } else {
537 mrts[i].blend.equation = blend[i].equation.equation;
538 mrts[i].blend.constant = blend[i].equation.constant;
539 }
540 }
541 }
542 }
543
544 static void
545 panfrost_emit_frag_shader(struct panfrost_context *ctx,
546 struct mali_shader_meta *fragmeta,
547 struct panfrost_blend_final *blend)
548 {
549 const struct panfrost_device *dev = pan_device(ctx->base.screen);
550 struct panfrost_shader_state *fs;
551
552 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
553
554 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
555 const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
556
557 memset(fragmeta, 0, sizeof(*fragmeta));
558
559 fragmeta->shader = fs->shader;
560 fragmeta->attribute_count = fs->attribute_count;
561 fragmeta->varying_count = fs->varying_count;
562 fragmeta->texture_count = ctx->sampler_view_count[PIPE_SHADER_FRAGMENT];
563 fragmeta->sampler_count = ctx->sampler_count[PIPE_SHADER_FRAGMENT];
564
565 if (dev->quirks & IS_BIFROST) {
566 /* First clause ATEST |= 0x4000000.
567 * Less than 32 regs |= 0x200 */
568 fragmeta->bifrost1.unk1 = 0x950020;
569
570 fragmeta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, PIPE_SHADER_FRAGMENT);
571 fragmeta->bifrost2.preload_regs = 0x1;
572 SET_BIT(fragmeta->bifrost2.preload_regs, 0x10, fs->reads_frag_coord);
573
574 fragmeta->bifrost2.uniform_count = fs->uniform_count;
575 } else {
576 fragmeta->midgard1.uniform_count = fs->uniform_count;
577 fragmeta->midgard1.work_count = fs->work_reg_count;
578
579 /* TODO: This is not conformant on ES3 */
580 fragmeta->midgard1.flags_hi = MALI_SUPPRESS_INF_NAN;
581
582 fragmeta->midgard1.flags_lo = 0x20;
583 fragmeta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, PIPE_SHADER_FRAGMENT);
584
585 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_GLOBAL, fs->writes_global);
586 }
587
588 bool msaa = rast->multisample;
589 fragmeta->coverage_mask = msaa ? ctx->sample_mask : ~0;
590
591 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x10;
592 fragmeta->unknown2_4 = 0x4e0;
593
594 if (dev->quirks & IS_BIFROST) {
595 /* TODO */
596 } else {
597 /* Depending on whether it's legal to do so in the given shader, we try to
598 * enable early-z testing. TODO: respect e-z force */
599
600 SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
601 !fs->can_discard && !fs->writes_global &&
602 !fs->writes_depth && !fs->writes_stencil &&
603 !ctx->blend->base.alpha_to_coverage);
604
605 /* Add the writes Z/S flags if needed. */
606 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
607 SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
608
609 /* Any time texturing is used, derivatives are implicitly calculated,
610 * so we need to enable helper invocations */
611
612 SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
613 fs->helper_invocations);
614
615 /* If discard is enabled, which bit we set to convey this
616 * depends on if depth/stencil is used for the draw or not.
617 * Just one of depth OR stencil is enough to trigger this. */
618
619 bool zs_enabled =
620 fs->writes_depth || fs->writes_stencil ||
621 (zsa->base.depth.enabled && zsa->base.depth.func != PIPE_FUNC_ALWAYS) ||
622 zsa->base.stencil[0].enabled;
623
624 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_TILEBUFFER,
625 fs->outputs_read || (!zs_enabled && fs->can_discard));
626 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_ZS, zs_enabled && fs->can_discard);
627 }
628
629 /* TODO: Sample size */
630 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
631 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
632
633 /* EXT_shader_framebuffer_fetch requires the shader to be run
634 * per-sample when outputs are read. */
635 bool per_sample = ctx->min_samples > 1 || fs->outputs_read;
636 SET_BIT(fragmeta->unknown2_3, MALI_PER_SAMPLE, msaa && per_sample);
637
638 fragmeta->depth_units = rast->offset_units * 2.0f;
639 fragmeta->depth_factor = rast->offset_scale;
640
641 /* XXX: Which bit is which? Does this maybe allow offsetting not-tri? */
642
643 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, rast->offset_tri);
644 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, rast->offset_tri);
645
646 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_NEAR, rast->depth_clip_near);
647 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_FAR, rast->depth_clip_far);
648
649 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
650 zsa->base.stencil[0].enabled);
651
652 fragmeta->stencil_mask_front = zsa->stencil_mask_front;
653 fragmeta->stencil_mask_back = zsa->stencil_mask_back;
654
655 /* Bottom bits for stencil ref, exactly one word */
656 fragmeta->stencil_front.opaque[0] = zsa->stencil_front.opaque[0] | ctx->stencil_ref.ref_value[0];
657
658 /* If back-stencil is not enabled, use the front values */
659
660 if (zsa->base.stencil[1].enabled)
661 fragmeta->stencil_back.opaque[0] = zsa->stencil_back.opaque[0] | ctx->stencil_ref.ref_value[1];
662 else
663 fragmeta->stencil_back = fragmeta->stencil_front;
664
665 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
666 zsa->base.depth.writemask);
667
668 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
669 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(
670 zsa->base.depth.enabled ? zsa->base.depth.func : PIPE_FUNC_ALWAYS));
671
672 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
673 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
674 !ctx->blend->base.dither);
675
676 SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
677
678 SET_BIT(fragmeta->unknown2_4, MALI_ALPHA_TO_COVERAGE,
679 ctx->blend->base.alpha_to_coverage);
680
681 /* Get blending setup */
682 unsigned rt_count = ctx->pipe_framebuffer.nr_cbufs;
683
684 /* Disable shader execution if we can */
685 if (dev->quirks & MIDGARD_SHADERLESS
686 && !panfrost_fs_required(fs, blend, rt_count)) {
687 fragmeta->shader = 0;
688 fragmeta->attribute_count = 0;
689 fragmeta->varying_count = 0;
690 fragmeta->texture_count = 0;
691 fragmeta->sampler_count = 0;
692
693 /* This feature is not known to work on Bifrost */
694 fragmeta->midgard1.work_count = 1;
695 fragmeta->midgard1.uniform_count = 0;
696 fragmeta->midgard1.uniform_buffer_count = 0;
697 }
698
699 /* If there is a blend shader, work registers are shared. We impose 8
700 * work registers as a limit for blend shaders. Should be lower XXX */
701
702 if (!(dev->quirks & IS_BIFROST)) {
703 for (unsigned c = 0; c < rt_count; ++c) {
704 if (blend[c].is_shader) {
705 fragmeta->midgard1.work_count =
706 MAX2(fragmeta->midgard1.work_count, 8);
707 }
708 }
709 }
710
711 if (dev->quirks & MIDGARD_SFBD) {
712 /* On single render target (SFBD) platforms, the blend
713 * information is inside the shader meta itself. We additionally
714 * need to signal CAN_DISCARD for nontrivial blend modes (so
715 * we're able to read back the destination buffer) */
716
717 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
718 blend[0].is_shader);
719
720 if (blend[0].is_shader) {
721 fragmeta->blend.shader = blend[0].shader.gpu |
722 blend[0].shader.first_tag;
723 } else {
724 fragmeta->blend.equation = blend[0].equation.equation;
725 fragmeta->blend.constant = blend[0].equation.constant;
726 }
727
728 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
729 blend[0].load_dest);
730 } else if (!(dev->quirks & IS_BIFROST)) {
731 /* Bug where MRT-capable hw apparently reads the last blend
732 * shader from here instead of the usual location? */
733
734 for (signed rt = ((signed) rt_count - 1); rt >= 0; --rt) {
735 if (!blend[rt].is_shader)
736 continue;
737
738 fragmeta->blend.shader = blend[rt].shader.gpu |
739 blend[rt].shader.first_tag;
740 break;
741 }
742 }
743
744 if (dev->quirks & IS_BIFROST) {
745 bool no_blend = true;
746
747 for (unsigned i = 0; i < rt_count; ++i)
748 no_blend &= (!blend[i].load_dest | blend[i].no_colour);
749
750 SET_BIT(fragmeta->bifrost1.unk1, MALI_BIFROST_EARLY_Z,
751 !fs->can_discard && !fs->writes_depth && no_blend);
752 }
753 }
754
755 void
756 panfrost_emit_shader_meta(struct panfrost_batch *batch,
757 enum pipe_shader_type st,
758 struct mali_vertex_tiler_postfix *postfix)
759 {
760 struct panfrost_context *ctx = batch->ctx;
761 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
762
763 if (!ss) {
764 postfix->shader = 0;
765 return;
766 }
767
768 struct mali_shader_meta meta;
769
770 /* Add the shader BO to the batch. */
771 panfrost_batch_add_bo(batch, ss->bo,
772 PAN_BO_ACCESS_PRIVATE |
773 PAN_BO_ACCESS_READ |
774 panfrost_bo_access_for_stage(st));
775
776 mali_ptr shader_ptr;
777
778 if (st == PIPE_SHADER_FRAGMENT) {
779 struct panfrost_device *dev = pan_device(ctx->base.screen);
780 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
781 size_t desc_size = sizeof(meta);
782 void *rts = NULL;
783 struct panfrost_transfer xfer;
784 unsigned rt_size;
785
786 if (dev->quirks & MIDGARD_SFBD)
787 rt_size = 0;
788 else if (dev->quirks & IS_BIFROST)
789 rt_size = sizeof(struct bifrost_blend_rt);
790 else
791 rt_size = sizeof(struct midgard_blend_rt);
792
793 desc_size += rt_size * rt_count;
794
795 if (rt_size)
796 rts = rzalloc_size(ctx, rt_size * rt_count);
797
798 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
799
800 for (unsigned c = 0; c < ctx->pipe_framebuffer.nr_cbufs; ++c)
801 blend[c] = panfrost_get_blend_for_context(ctx, c);
802
803 panfrost_emit_frag_shader(ctx, &meta, blend);
804
805 if (!(dev->quirks & MIDGARD_SFBD))
806 panfrost_emit_blend(batch, rts, blend);
807 else
808 batch->draws |= PIPE_CLEAR_COLOR0;
809
810 xfer = panfrost_pool_alloc_aligned(&batch->pool, desc_size, sizeof(meta));
811
812 memcpy(xfer.cpu, &meta, sizeof(meta));
813 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
814
815 if (rt_size)
816 ralloc_free(rts);
817
818 shader_ptr = xfer.gpu;
819 } else {
820 panfrost_emit_compute_shader(ctx, st, &meta);
821
822 shader_ptr = panfrost_pool_upload(&batch->pool, &meta,
823 sizeof(meta));
824 }
825
826 postfix->shader = shader_ptr;
827 }
828
829 void
830 panfrost_emit_viewport(struct panfrost_batch *batch,
831 struct mali_vertex_tiler_postfix *tiler_postfix)
832 {
833 struct panfrost_context *ctx = batch->ctx;
834 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
835 const struct pipe_scissor_state *ss = &ctx->scissor;
836 const struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
837 const struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer;
838
839 /* Derive min/max from translate/scale. Note since |x| >= 0 by
840 * definition, we have that -|x| <= |x| hence translate - |scale| <=
841 * translate + |scale|, so the ordering is correct here. */
842 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
843 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
844 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
845 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
846 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
847 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
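/* As a concrete case, a full 800x600 viewport has translate = (400, 300)
 * and scale = (400, 300), giving vp_minx = vp_miny = 0 and
 * (vp_maxx, vp_maxy) = (800, 600) before the clamping below. */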
848
849 /* Scissor to the intersection of viewport and to the scissor, clamped
850 * to the framebuffer */
851
852 unsigned minx = MIN2(fb->width, vp_minx);
853 unsigned maxx = MIN2(fb->width, vp_maxx);
854 unsigned miny = MIN2(fb->height, vp_miny);
855 unsigned maxy = MIN2(fb->height, vp_maxy);
856
857 if (ss && rast->scissor) {
858 minx = MAX2(ss->minx, minx);
859 miny = MAX2(ss->miny, miny);
860 maxx = MIN2(ss->maxx, maxx);
861 maxy = MIN2(ss->maxy, maxy);
862 }
863
864 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool, MALI_VIEWPORT_LENGTH);
865
866 pan_pack(T.cpu, VIEWPORT, cfg) {
867 cfg.scissor_minimum_x = minx;
868 cfg.scissor_minimum_y = miny;
869 cfg.scissor_maximum_x = maxx - 1;
870 cfg.scissor_maximum_y = maxy - 1;
871
872 cfg.minimum_z = rast->depth_clip_near ? minz : -INFINITY;
873 cfg.maximum_z = rast->depth_clip_far ? maxz : INFINITY;
874 }
875
876 tiler_postfix->viewport = T.gpu;
877 panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy);
878 }
879
880 static mali_ptr
881 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
882 enum pipe_shader_type st,
883 struct panfrost_constant_buffer *buf,
884 unsigned index)
885 {
886 struct pipe_constant_buffer *cb = &buf->cb[index];
887 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
888
889 if (rsrc) {
890 panfrost_batch_add_bo(batch, rsrc->bo,
891 PAN_BO_ACCESS_SHARED |
892 PAN_BO_ACCESS_READ |
893 panfrost_bo_access_for_stage(st));
894
895 /* Alignment guaranteed by
896 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
897 return rsrc->bo->gpu + cb->buffer_offset;
898 } else if (cb->user_buffer) {
899 return panfrost_pool_upload_aligned(&batch->pool,
900 cb->user_buffer +
901 cb->buffer_offset,
902 cb->buffer_size, 16);
903 } else {
904 unreachable("No constant buffer");
905 }
906 }
907
908 struct sysval_uniform {
909 union {
910 float f[4];
911 int32_t i[4];
912 uint32_t u[4];
913 uint64_t du[2];
914 };
915 };
916
917 static void
918 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
919 struct sysval_uniform *uniform)
920 {
921 struct panfrost_context *ctx = batch->ctx;
922 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
923
924 uniform->f[0] = vp->scale[0];
925 uniform->f[1] = vp->scale[1];
926 uniform->f[2] = vp->scale[2];
927 }
928
929 static void
930 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
931 struct sysval_uniform *uniform)
932 {
933 struct panfrost_context *ctx = batch->ctx;
934 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
935
936 uniform->f[0] = vp->translate[0];
937 uniform->f[1] = vp->translate[1];
938 uniform->f[2] = vp->translate[2];
939 }
940
941 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
942 enum pipe_shader_type st,
943 unsigned int sysvalid,
944 struct sysval_uniform *uniform)
945 {
946 struct panfrost_context *ctx = batch->ctx;
947 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
948 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
949 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
950 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
951
952 assert(dim);
953 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
954
955 if (dim > 1)
956 uniform->i[1] = u_minify(tex->texture->height0,
957 tex->u.tex.first_level);
958
959 if (dim > 2)
960 uniform->i[2] = u_minify(tex->texture->depth0,
961 tex->u.tex.first_level);
962
963 if (is_array)
964 uniform->i[dim] = tex->texture->array_size;
965 }
966
967 static void
968 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
969 enum pipe_shader_type st,
970 unsigned ssbo_id,
971 struct sysval_uniform *uniform)
972 {
973 struct panfrost_context *ctx = batch->ctx;
974
975 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
976 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
977
978 /* Compute address */
979 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
980
981 panfrost_batch_add_bo(batch, bo,
982 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
983 panfrost_bo_access_for_stage(st));
984
985 /* Upload address and size as sysval */
986 uniform->du[0] = bo->gpu + sb.buffer_offset;
987 uniform->u[2] = sb.buffer_size;
988 }
989
990 static void
991 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
992 enum pipe_shader_type st,
993 unsigned samp_idx,
994 struct sysval_uniform *uniform)
995 {
996 struct panfrost_context *ctx = batch->ctx;
997 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
998
999 uniform->f[0] = sampl->min_lod;
1000 uniform->f[1] = sampl->max_lod;
1001 uniform->f[2] = sampl->lod_bias;
1002
1003 /* Even without any errata, Midgard represents "no mipmapping" as
1004 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1005 * panfrost_create_sampler_state which also explains our choice of
1006 * epsilon value (again to keep behaviour consistent) */
1007
1008 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1009 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1010 }
1011
1012 static void
1013 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1014 struct sysval_uniform *uniform)
1015 {
1016 struct panfrost_context *ctx = batch->ctx;
1017
1018 uniform->u[0] = ctx->compute_grid->grid[0];
1019 uniform->u[1] = ctx->compute_grid->grid[1];
1020 uniform->u[2] = ctx->compute_grid->grid[2];
1021 }
1022
1023 static void
1024 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1025 struct panfrost_shader_state *ss,
1026 enum pipe_shader_type st)
1027 {
1028 struct sysval_uniform *uniforms = (void *)buf;
1029
1030 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1031 int sysval = ss->sysval[i];
1032
1033 switch (PAN_SYSVAL_TYPE(sysval)) {
1034 case PAN_SYSVAL_VIEWPORT_SCALE:
1035 panfrost_upload_viewport_scale_sysval(batch,
1036 &uniforms[i]);
1037 break;
1038 case PAN_SYSVAL_VIEWPORT_OFFSET:
1039 panfrost_upload_viewport_offset_sysval(batch,
1040 &uniforms[i]);
1041 break;
1042 case PAN_SYSVAL_TEXTURE_SIZE:
1043 panfrost_upload_txs_sysval(batch, st,
1044 PAN_SYSVAL_ID(sysval),
1045 &uniforms[i]);
1046 break;
1047 case PAN_SYSVAL_SSBO:
1048 panfrost_upload_ssbo_sysval(batch, st,
1049 PAN_SYSVAL_ID(sysval),
1050 &uniforms[i]);
1051 break;
1052 case PAN_SYSVAL_NUM_WORK_GROUPS:
1053 panfrost_upload_num_work_groups_sysval(batch,
1054 &uniforms[i]);
1055 break;
1056 case PAN_SYSVAL_SAMPLER:
1057 panfrost_upload_sampler_sysval(batch, st,
1058 PAN_SYSVAL_ID(sysval),
1059 &uniforms[i]);
1060 break;
1061 default:
1062 assert(0);
1063 }
1064 }
1065 }
1066
1067 static const void *
1068 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1069 unsigned index)
1070 {
1071 struct pipe_constant_buffer *cb = &buf->cb[index];
1072 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1073
1074 if (rsrc)
1075 return rsrc->bo->cpu;
1076 else if (cb->user_buffer)
1077 return cb->user_buffer;
1078 else
1079 unreachable("No constant buffer");
1080 }
1081
1082 void
1083 panfrost_emit_const_buf(struct panfrost_batch *batch,
1084 enum pipe_shader_type stage,
1085 struct mali_vertex_tiler_postfix *postfix)
1086 {
1087 struct panfrost_context *ctx = batch->ctx;
1088 struct panfrost_shader_variants *all = ctx->shader[stage];
1089
1090 if (!all)
1091 return;
1092
1093 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1094
1095 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1096
1097 /* Uniforms are implicitly UBO #0 */
1098 bool has_uniforms = buf->enabled_mask & (1 << 0);
1099
1100 /* Allocate room for the sysval and the uniforms */
1101 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1102 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1103 size_t size = sys_size + uniform_size;
1104 struct panfrost_transfer transfer =
1105 panfrost_pool_alloc_aligned(&batch->pool, size, 16);
1106
1107 /* Upload sysvals requested by the shader */
1108 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1109
1110 /* Upload uniforms */
1111 if (has_uniforms && uniform_size) {
1112 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1113 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1114 }
1115
1116 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1117 * uploaded */
1118
1119 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1120 assert(ubo_count >= 1);
1121
1122 size_t sz = MALI_UNIFORM_BUFFER_LENGTH * ubo_count;
1123 struct panfrost_transfer ubos =
1124 panfrost_pool_alloc_aligned(&batch->pool, sz,
1125 MALI_UNIFORM_BUFFER_LENGTH);
1126
1127 uint64_t *ubo_ptr = (uint64_t *) ubos.cpu;
1128
1129 /* Upload uniforms as a UBO */
1130
1131 if (size) {
1132 pan_pack(ubo_ptr, UNIFORM_BUFFER, cfg) {
1133 cfg.entries = DIV_ROUND_UP(size, 16);
1134 cfg.pointer = transfer.gpu;
1135 }
1136 } else {
1137 *ubo_ptr = 0;
1138 }
1139
1140 /* The rest are honest-to-goodness UBOs */
1141
1142 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1143 size_t usz = buf->cb[ubo].buffer_size;
1144 bool enabled = buf->enabled_mask & (1 << ubo);
1145 bool empty = usz == 0;
1146
1147 if (!enabled || empty) {
1148 ubo_ptr[ubo] = 0;
1149 continue;
1150 }
1151
1152 pan_pack(ubo_ptr + ubo, UNIFORM_BUFFER, cfg) {
1153 cfg.entries = DIV_ROUND_UP(usz, 16);
1154 cfg.pointer = panfrost_map_constant_buffer_gpu(batch,
1155 stage, buf, ubo);
1156 }
1157 }
1158
1159 postfix->uniforms = transfer.gpu;
1160 postfix->uniform_buffers = ubos.gpu;
1161
1162 buf->dirty_mask = 0;
1163 }
1164
1165 void
1166 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1167 const struct pipe_grid_info *info,
1168 struct midgard_payload_vertex_tiler *vtp)
1169 {
1170 struct panfrost_context *ctx = batch->ctx;
1171 struct panfrost_device *dev = pan_device(ctx->base.screen);
1172 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1173 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1174 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1175 128));
1176
1177 unsigned log2_instances =
1178 util_logbase2_ceil(info->grid[0]) +
1179 util_logbase2_ceil(info->grid[1]) +
1180 util_logbase2_ceil(info->grid[2]);
1181
1182 unsigned shared_size = single_size * (1 << log2_instances) * dev->core_count;
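/* Sizing sketch (example values): ss->shared_size = 1024 and a
 * (10, 10, 1) grid give log2_instances = 4 + 4 + 0 = 8, so on a
 * hypothetical 8-core device shared_size = 1024 * 256 * 8 = 2 MiB. */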
1183 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1184 shared_size,
1185 1);
1186
1187 struct mali_shared_memory shared = {
1188 .shared_memory = bo->gpu,
1189 .shared_workgroup_count = log2_instances,
1190 .shared_shift = util_logbase2(single_size) + 1
1191 };
1192
1193 vtp->postfix.shared_memory = panfrost_pool_upload_aligned(&batch->pool, &shared,
1194 sizeof(shared), 64);
1195 }
1196
1197 static mali_ptr
1198 panfrost_get_tex_desc(struct panfrost_batch *batch,
1199 enum pipe_shader_type st,
1200 struct panfrost_sampler_view *view)
1201 {
1202 if (!view)
1203 return (mali_ptr) 0;
1204
1205 struct pipe_sampler_view *pview = &view->base;
1206 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1207
1208 /* Add the BO to the job so it's retained until the job is done. */
1209
1210 panfrost_batch_add_bo(batch, rsrc->bo,
1211 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1212 panfrost_bo_access_for_stage(st));
1213
1214 panfrost_batch_add_bo(batch, view->bo,
1215 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1216 panfrost_bo_access_for_stage(st));
1217
1218 return view->bo->gpu;
1219 }
1220
1221 static void
1222 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
1223 struct pipe_context *pctx)
1224 {
1225 struct panfrost_resource *rsrc = pan_resource(view->base.texture);
1226 if (view->texture_bo != rsrc->bo->gpu ||
1227 view->modifier != rsrc->modifier) {
1228 panfrost_bo_unreference(view->bo);
1229 panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
1230 }
1231 }
1232
1233 void
1234 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1235 enum pipe_shader_type stage,
1236 struct mali_vertex_tiler_postfix *postfix)
1237 {
1238 struct panfrost_context *ctx = batch->ctx;
1239 struct panfrost_device *device = pan_device(ctx->base.screen);
1240
1241 if (!ctx->sampler_view_count[stage])
1242 return;
1243
1244 if (device->quirks & IS_BIFROST) {
1245 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1246 MALI_BIFROST_TEXTURE_LENGTH *
1247 ctx->sampler_view_count[stage],
1248 MALI_BIFROST_TEXTURE_LENGTH);
1249
1250 struct mali_bifrost_texture_packed *out =
1251 (struct mali_bifrost_texture_packed *) T.cpu;
1252
1253 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1254 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1255 struct pipe_sampler_view *pview = &view->base;
1256 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1257
1258 panfrost_update_sampler_view(view, &ctx->base);
1259 out[i] = view->bifrost_descriptor;
1260
1261 /* Add the BOs to the job so they are retained until the job is done. */
1262
1263 panfrost_batch_add_bo(batch, rsrc->bo,
1264 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1265 panfrost_bo_access_for_stage(stage));
1266
1267 panfrost_batch_add_bo(batch, view->bo,
1268 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1269 panfrost_bo_access_for_stage(stage));
1270 }
1271
1272 postfix->textures = T.gpu;
1273 } else {
1274 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1275
1276 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1277 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1278
1279 panfrost_update_sampler_view(view, &ctx->base);
1280
1281 trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1282 }
1283
1284 postfix->textures = panfrost_pool_upload_aligned(&batch->pool,
1285 trampolines,
1286 sizeof(uint64_t) *
1287 ctx->sampler_view_count[stage],
1288 sizeof(uint64_t));
1289 }
1290 }
1291
1292 void
1293 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1294 enum pipe_shader_type stage,
1295 struct mali_vertex_tiler_postfix *postfix)
1296 {
1297 struct panfrost_context *ctx = batch->ctx;
1298
1299 if (!ctx->sampler_count[stage])
1300 return;
1301
1302 size_t desc_size = MALI_BIFROST_SAMPLER_LENGTH;
1303 assert(MALI_BIFROST_SAMPLER_LENGTH == MALI_MIDGARD_SAMPLER_LENGTH);
1304
1305 size_t sz = desc_size * ctx->sampler_count[stage];
1306 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool, sz, desc_size);
1307 struct mali_midgard_sampler_packed *out = (struct mali_midgard_sampler_packed *) T.cpu;
1308
1309 for (unsigned i = 0; i < ctx->sampler_count[stage]; ++i)
1310 out[i] = ctx->samplers[stage][i]->hw;
1311
1312 postfix->sampler_descriptor = T.gpu;
1313 }
1314
1315 void
1316 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1317 struct mali_vertex_tiler_postfix *vertex_postfix)
1318 {
1319 struct panfrost_context *ctx = batch->ctx;
1320 struct panfrost_vertex_state *so = ctx->vertex;
1321 struct panfrost_shader_state *vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1322
1323 unsigned instance_shift = vertex_postfix->instance_shift;
1324 unsigned instance_odd = vertex_postfix->instance_odd;
1325
1326 /* Worst case: everything is NPOT, which is only possible if instancing
1327 * is enabled. Otherwise a single record is guaranteed */
1328 bool could_npot = instance_shift || instance_odd;
1329
1330 struct panfrost_transfer S = panfrost_pool_alloc_aligned(&batch->pool,
1331 MALI_ATTRIBUTE_BUFFER_LENGTH * vs->attribute_count *
1332 (could_npot ? 2 : 1),
1333 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
1334
1335 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1336 MALI_ATTRIBUTE_LENGTH * vs->attribute_count,
1337 MALI_ATTRIBUTE_LENGTH);
1338
1339 struct mali_attribute_buffer_packed *bufs =
1340 (struct mali_attribute_buffer_packed *) S.cpu;
1341
1342 struct mali_attribute_packed *out =
1343 (struct mali_attribute_packed *) T.cpu;
1344
1345 unsigned attrib_to_buffer[PIPE_MAX_ATTRIBS] = { 0 };
1346 unsigned k = 0;
1347
1348 for (unsigned i = 0; i < so->num_elements; ++i) {
1349 /* We map buffers 1:1 with the attributes, which
1350 * means duplicating some vertex buffers (who cares? aside from
1351 * maybe some caching implications but I somehow doubt that
1352 * matters) */
1353
1354 struct pipe_vertex_element *elem = &so->pipe[i];
1355 unsigned vbi = elem->vertex_buffer_index;
1356 attrib_to_buffer[i] = k;
1357
1358 if (!(ctx->vb_mask & (1 << vbi)))
1359 continue;
1360
1361 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1362 struct panfrost_resource *rsrc;
1363
1364 rsrc = pan_resource(buf->buffer.resource);
1365 if (!rsrc)
1366 continue;
1367
1368 /* Add a dependency of the batch on the vertex buffer */
1369 panfrost_batch_add_bo(batch, rsrc->bo,
1370 PAN_BO_ACCESS_SHARED |
1371 PAN_BO_ACCESS_READ |
1372 PAN_BO_ACCESS_VERTEX_TILER);
1373
1374 /* Mask off lower bits, see offset fixup below */
1375 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1376 mali_ptr addr = raw_addr & ~63;
1377
1378 /* Since we advanced the base pointer, we shrink the buffer
1379 * size, but add the offset we subtracted */
1380 unsigned size = rsrc->base.width0 + (raw_addr - addr)
1381 - buf->buffer_offset;
1382
1383 /* When there is a divisor, the hardware-level divisor is
1384 * the product of the instance divisor and the padded count */
1385 unsigned divisor = elem->instance_divisor;
1386 unsigned hw_divisor = ctx->padded_count * divisor;
1387 unsigned stride = buf->stride;
1388
1389 /* If there's a divisor (=1) but no instancing, we want every
1390 * attribute to be the same */
1391
1392 if (divisor && ctx->instance_count == 1)
1393 stride = 0;
1394
1395 if (!divisor || ctx->instance_count <= 1) {
1396 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1397 if (ctx->instance_count > 1)
1398 cfg.type = MALI_ATTRIBUTE_TYPE_1D_MODULUS;
1399
1400 cfg.pointer = addr;
1401 cfg.stride = stride;
1402 cfg.size = size;
1403 cfg.divisor_r = instance_shift;
1404 cfg.divisor_p = instance_odd;
1405 }
1406 } else if (util_is_power_of_two_or_zero(hw_divisor)) {
1407 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1408 cfg.type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
1409 cfg.pointer = addr;
1410 cfg.stride = stride;
1411 cfg.size = size;
1412 cfg.divisor_r = __builtin_ctz(hw_divisor);
1413 }
1414
1415 } else {
1416 unsigned shift = 0, extra_flags = 0;
1417
1418 unsigned magic_divisor =
1419 panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags);
1420
1421 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1422 cfg.type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
1423 cfg.pointer = addr;
1424 cfg.stride = stride;
1425 cfg.size = size;
1426
1427 cfg.divisor_r = shift;
1428 cfg.divisor_e = extra_flags;
1429 }
1430
1431 pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) {
1432 cfg.divisor_numerator = magic_divisor;
1433 cfg.divisor = divisor;
1434 }
1435
1436 ++k;
1437 }
1438
1439 ++k;
1440 }
1441
1442 /* Add special gl_VertexID/gl_InstanceID buffers */
1443
1444 if (unlikely(vs->attribute_count >= PAN_VERTEX_ID)) {
1445 panfrost_vertex_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1446
1447 pan_pack(out + PAN_VERTEX_ID, ATTRIBUTE, cfg) {
1448 cfg.buffer_index = k++;
1449 cfg.format = so->formats[PAN_VERTEX_ID];
1450 }
1451
1452 panfrost_instance_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1453
1454 pan_pack(out + PAN_INSTANCE_ID, ATTRIBUTE, cfg) {
1455 cfg.buffer_index = k++;
1456 cfg.format = so->formats[PAN_INSTANCE_ID];
1457 }
1458 }
1459
1460 /* Attribute addresses require 64-byte alignment, so let:
1461 *
1462 * base' = base & ~63 = base - (base & 63)
1463 * offset' = offset + (base & 63)
1464 *
1465 * Since base' + offset' = base + offset, these are equivalent
1466 * addressing modes and now base is 64 aligned.
1467 */
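/* Concretely: base = 0x10000043, offset = 8 becomes base' = 0x10000040
 * and offset' = 8 + 3 = 11; the sum is unchanged and base' is now
 * 64-byte aligned. */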
1468
1469 unsigned start = vertex_postfix->offset_start;
1470
1471 for (unsigned i = 0; i < so->num_elements; ++i) {
1472 unsigned vbi = so->pipe[i].vertex_buffer_index;
1473 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1474
1475 /* Adjust by the masked off bits of the offset. Make sure we
1476 * read src_offset from so->hw (which is not GPU visible)
1477 * rather than target (which is) due to caching effects */
1478
1479 unsigned src_offset = so->pipe[i].src_offset;
1480
1481 /* BOs aligned to 4k so guaranteed aligned to 64 */
1482 src_offset += (buf->buffer_offset & 63);
1483
1484 /* Also, somewhat obscurely, per-instance data needs to be
1485 * offset in response to a delayed start in an indexed draw */
1486
1487 if (so->pipe[i].instance_divisor && ctx->instance_count > 1 && start)
1488 src_offset -= buf->stride * start;
1489
1490 pan_pack(out + i, ATTRIBUTE, cfg) {
1491 cfg.buffer_index = attrib_to_buffer[i];
1492 cfg.format = so->formats[i];
1493 cfg.offset = src_offset;
1494 }
1495 }
1496
1497 vertex_postfix->attributes = S.gpu;
1498 vertex_postfix->attribute_meta = T.gpu;
1499 }
1500
1501 static mali_ptr
1502 panfrost_emit_varyings(struct panfrost_batch *batch,
1503 struct mali_attribute_buffer_packed *slot,
1504 unsigned stride, unsigned count)
1505 {
1506 unsigned size = stride * count;
1507 mali_ptr ptr = panfrost_pool_alloc_aligned(&batch->invisible_pool, size, 64).gpu;
1508
1509 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1510 cfg.stride = stride;
1511 cfg.size = size;
1512 cfg.pointer = ptr;
1513 }
1514
1515 return ptr;
1516 }
1517
1518 static unsigned
1519 panfrost_streamout_offset(unsigned stride, unsigned offset,
1520 struct pipe_stream_output_target *target)
1521 {
1522 return (target->buffer_offset + (offset * stride * 4)) & 63;
1523 }
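/* Example: buffer_offset = 0x104, offset = 3, stride = 4 words gives
 * (0x104 + 48) & 63 = 52, the sub-64-byte remainder that ends up in the
 * varying record's offset while the buffer pointer itself stays aligned. */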
1524
1525 static void
1526 panfrost_emit_streamout(struct panfrost_batch *batch,
1527 struct mali_attribute_buffer_packed *slot,
1528 unsigned stride_words, unsigned offset, unsigned count,
1529 struct pipe_stream_output_target *target)
1530 {
1531 unsigned stride = stride_words * 4;
1532 unsigned max_size = target->buffer_size;
1533 unsigned expected_size = stride * count;
1534
1535 /* Grab the BO and bind it to the batch */
1536 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1537
1538 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1539 * the perspective of the TILER and FRAGMENT.
1540 */
1541 panfrost_batch_add_bo(batch, bo,
1542 PAN_BO_ACCESS_SHARED |
1543 PAN_BO_ACCESS_RW |
1544 PAN_BO_ACCESS_VERTEX_TILER |
1545 PAN_BO_ACCESS_FRAGMENT);
1546
1547 /* We will have an offset applied to get alignment */
1548 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * stride);
1549
1550 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1551 cfg.pointer = (addr & ~63);
1552 cfg.stride = stride;
1553 cfg.size = MIN2(max_size, expected_size) + (addr & 63);
1554 }
1555 }
1556
1557 static bool
1558 has_point_coord(unsigned mask, gl_varying_slot loc)
1559 {
1560 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1561 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1562 else if (loc == VARYING_SLOT_PNTC)
1563 return (mask & (1 << 8));
1564 else
1565 return false;
1566 }
1567
1568 /* Helpers for manipulating stream out information so we can pack varyings
1569 * accordingly. Compute the src_offset for a given captured varying */
1570
1571 static struct pipe_stream_output *
1572 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1573 {
1574 for (unsigned i = 0; i < info->num_outputs; ++i) {
1575 if (info->output[i].register_index == loc)
1576 return &info->output[i];
1577 }
1578
1579 unreachable("Varying not captured");
1580 }
1581
1582 static unsigned
1583 pan_varying_size(enum mali_format fmt)
1584 {
1585 unsigned type = MALI_EXTRACT_TYPE(fmt);
1586 unsigned chan = MALI_EXTRACT_CHANNELS(fmt);
1587 unsigned bits = MALI_EXTRACT_BITS(fmt);
1588 unsigned bpc = 0;
1589
1590 if (bits == MALI_CHANNEL_FLOAT) {
1591 /* No doubles */
1592 bool fp16 = (type == MALI_FORMAT_SINT);
1593 assert(fp16 || (type == MALI_FORMAT_UNORM));
1594
1595 bpc = fp16 ? 2 : 4;
1596 } else {
1597 assert(type >= MALI_FORMAT_SNORM && type <= MALI_FORMAT_SINT);
1598
1599 /* See the enums */
1600 bits = 1 << bits;
1601 assert(bits >= 8);
1602 bpc = bits / 8;
1603 }
1604
1605 return bpc * chan;
1606 }
1607
1608 /* Indices for named (non-XFB) varyings that are present. These are packed
1609 * tightly so they correspond to a bitfield present (P) indexed by (1 <<
1610 * PAN_VARY_*). This has the nice property that you can look up the buffer index
1611 * of a given special field given a shift S by:
1612 *
1613 * idx = popcount(P & ((1 << S) - 1))
1614 *
1615 * That is... look at all of the varyings that come earlier and count them;
1616 * that count is this varying's buffer index. Likewise, the total number of
1617 * special buffers required is simply popcount(P)
1618 */
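/* For instance, if only GENERAL, POSITION and PNTCOORD are present then
 * P = 0b1011, the PNTCOORD buffer index is
 * popcount(P & ((1 << PAN_VARY_PNTCOORD) - 1)) = popcount(0b011) = 2,
 * and pan_xfb_base(P) = 3. */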
1619
1620 enum pan_special_varying {
1621 PAN_VARY_GENERAL = 0,
1622 PAN_VARY_POSITION = 1,
1623 PAN_VARY_PSIZ = 2,
1624 PAN_VARY_PNTCOORD = 3,
1625 PAN_VARY_FACE = 4,
1626 PAN_VARY_FRAGCOORD = 5,
1627
1628 /* Keep last */
1629 PAN_VARY_MAX,
1630 };
1631
1632 /* Given a varying, figure out which index it corresponds to */
1633
1634 static inline unsigned
1635 pan_varying_index(unsigned present, enum pan_special_varying v)
1636 {
1637 unsigned mask = (1 << v) - 1;
1638 return util_bitcount(present & mask);
1639 }
1640
1641 /* Get the base offset for XFB buffers, which by convention come after
1642 * everything else. Wrapper function for semantic reasons; by construction this
1643 * is just popcount. */
1644
1645 static inline unsigned
1646 pan_xfb_base(unsigned present)
1647 {
1648 return util_bitcount(present);
1649 }
1650
1651 /* Computes the present mask for varyings so we can start emitting varying records */
1652
1653 static inline unsigned
1654 pan_varying_present(
1655 struct panfrost_shader_state *vs,
1656 struct panfrost_shader_state *fs,
1657 unsigned quirks)
1658 {
1659 /* At the moment we always emit general and position buffers. Not
1660 * strictly necessary but usually harmless */
1661
1662 unsigned present = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION);
1663
1664 /* Enable special buffers by the shader info */
1665
1666 if (vs->writes_point_size)
1667 present |= (1 << PAN_VARY_PSIZ);
1668
1669 if (fs->reads_point_coord)
1670 present |= (1 << PAN_VARY_PNTCOORD);
1671
1672 if (fs->reads_face)
1673 present |= (1 << PAN_VARY_FACE);
1674
1675 if (fs->reads_frag_coord && !(quirks & IS_BIFROST))
1676 present |= (1 << PAN_VARY_FRAGCOORD);
1677
1678 /* Also, if we have a point sprite, we need a point coord buffer */
1679
1680 for (unsigned i = 0; i < fs->varying_count; i++) {
1681 gl_varying_slot loc = fs->varyings_loc[i];
1682
1683 if (has_point_coord(fs->point_sprite_mask, loc))
1684 present |= (1 << PAN_VARY_PNTCOORD);
1685 }
1686
1687 return present;
1688 }
1689
1690 /* Emitters for varying records */
1691
1692 static void
1693 pan_emit_vary(struct mali_attribute_packed *out,
1694 unsigned present, enum pan_special_varying buf,
1695 unsigned quirks, enum mali_format format,
1696 unsigned offset)
1697 {
1698 unsigned nr_channels = MALI_EXTRACT_CHANNELS(format);
1699 unsigned swizzle = quirks & HAS_SWIZZLES ?
1700 panfrost_get_default_swizzle(nr_channels) :
1701 panfrost_bifrost_swizzle(nr_channels);
1702
1703 pan_pack(out, ATTRIBUTE, cfg) {
1704 cfg.buffer_index = pan_varying_index(present, buf);
1705 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1706 cfg.format = (format << 12) | swizzle;
1707 cfg.offset = offset;
1708 }
1709 }
1710
1711 /* General varying that is unused */
1712
1713 static void
1714 pan_emit_vary_only(struct mali_attribute_packed *out,
1715 unsigned present, unsigned quirks)
1716 {
1717 pan_emit_vary(out, present, 0, quirks, MALI_VARYING_DISCARD, 0);
1718 }
1719
1720 /* Special records */
1721
1722 static const enum mali_format pan_varying_formats[PAN_VARY_MAX] = {
1723 [PAN_VARY_POSITION] = MALI_VARYING_POS,
1724 [PAN_VARY_PSIZ] = MALI_R16F,
1725 [PAN_VARY_PNTCOORD] = MALI_R16F,
1726 [PAN_VARY_FACE] = MALI_R32I,
1727 [PAN_VARY_FRAGCOORD] = MALI_RGBA32F
1728 };
1729
1730 static void
1731 pan_emit_vary_special(struct mali_attribute_packed *out,
1732 unsigned present, enum pan_special_varying buf,
1733 unsigned quirks)
1734 {
1735 assert(buf < PAN_VARY_MAX);
1736 pan_emit_vary(out, present, buf, quirks, pan_varying_formats[buf], 0);
1737 }
1738
1739 static enum mali_format
1740 pan_xfb_format(enum mali_format format, unsigned nr)
1741 {
1742 if (MALI_EXTRACT_BITS(format) == MALI_CHANNEL_FLOAT)
1743 return MALI_R32F | MALI_NR_CHANNELS(nr);
1744 else
1745 return MALI_EXTRACT_TYPE(format) | MALI_NR_CHANNELS(nr) | MALI_CHANNEL_32;
1746 }
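
/* For instance, a float varying captured with three components is emitted as
 * MALI_R32F | MALI_NR_CHANNELS(3), while non-float types keep their base type
 * but are widened to 32 bits per channel. (Illustrative restatement of the
 * helper above, not an exhaustive format table.) */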
1747
1748 /* Transform feedback records. Note struct pipe_stream_output is (if packed as
1749 * a bitfield) 32-bit, smaller than a 64-bit pointer, so may as well pass by
1750 * value. */
1751
1752 static void
1753 pan_emit_vary_xfb(struct mali_attribute_packed *out,
1754 unsigned present,
1755 unsigned max_xfb,
1756 unsigned *streamout_offsets,
1757 unsigned quirks,
1758 enum mali_format format,
1759 struct pipe_stream_output o)
1760 {
1761 unsigned swizzle = quirks & HAS_SWIZZLES ?
1762 panfrost_get_default_swizzle(o.num_components) :
1763 panfrost_bifrost_swizzle(o.num_components);
1764
1765 pan_pack(out, ATTRIBUTE, cfg) {
1766 /* XFB buffers come after everything else */
1767 cfg.buffer_index = pan_xfb_base(present) + o.output_buffer;
1768 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1769
1770 /* Override number of channels and precision to highp */
1771 cfg.format = (pan_xfb_format(format, o.num_components) << 12) | swizzle;
1772
1773                 /* Combine the output's destination offset (dwords, converted to bytes) with the streamout buffer offset */
1774 cfg.offset = (o.dst_offset * 4) /* dwords */
1775 + streamout_offsets[o.output_buffer];
1776 }
1777 }
1778
1779 /* Determine if we should capture a varying for XFB. This requires actually
1780  * having a buffer for it. If we don't capture it, we fall back to a general
1781 * varying path (linked or unlinked, possibly discarding the write) */
1782
1783 static bool
1784 panfrost_xfb_captured(struct panfrost_shader_state *xfb,
1785 unsigned loc, unsigned max_xfb)
1786 {
1787 if (!(xfb->so_mask & (1ll << loc)))
1788 return false;
1789
1790 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1791 return o->output_buffer < max_xfb;
1792 }
1793
1794 static void
1795 pan_emit_general_varying(struct mali_attribute_packed *out,
1796 struct panfrost_shader_state *other,
1797 struct panfrost_shader_state *xfb,
1798 gl_varying_slot loc,
1799 enum mali_format format,
1800 unsigned present,
1801 unsigned quirks,
1802 unsigned *gen_offsets,
1803 enum mali_format *gen_formats,
1804 unsigned *gen_stride,
1805 unsigned idx,
1806 bool should_alloc)
1807 {
1808 /* Check if we're linked */
1809 signed other_idx = -1;
1810
1811 for (unsigned j = 0; j < other->varying_count; ++j) {
1812 if (other->varyings_loc[j] == loc) {
1813 other_idx = j;
1814 break;
1815 }
1816 }
1817
1818 if (other_idx < 0) {
1819 pan_emit_vary_only(out, present, quirks);
1820 return;
1821 }
1822
1823 unsigned offset = gen_offsets[other_idx];
1824
1825 if (should_alloc) {
1826                 /* We're linked, so allocate space via a watermark allocation */
1827 enum mali_format alt = other->varyings[other_idx];
1828
1829 /* Do interpolation at minimum precision */
1830 unsigned size_main = pan_varying_size(format);
1831 unsigned size_alt = pan_varying_size(alt);
1832 unsigned size = MIN2(size_main, size_alt);
1833
1834 /* If a varying is marked for XFB but not actually captured, we
1835 * should match the format to the format that would otherwise
1836 * be used for XFB, since dEQP checks for invariance here. It's
1837 * unclear if this is required by the spec. */
1838
1839 if (xfb->so_mask & (1ull << loc)) {
1840 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1841 format = pan_xfb_format(format, o->num_components);
1842 size = pan_varying_size(format);
1843 } else if (size == size_alt) {
1844 format = alt;
1845 }
1846
1847 gen_offsets[idx] = *gen_stride;
1848 gen_formats[other_idx] = format;
1849 offset = *gen_stride;
1850 *gen_stride += size;
1851 }
1852
1853 pan_emit_vary(out, present, PAN_VARY_GENERAL, quirks, format, offset);
1854 }
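
/* A rough worked example of the watermark allocation above, assuming
 * pan_varying_size returns the byte size of a format: a linked fp32 vec4
 * followed by a linked fp16 vec2 would be packed as
 *
 *      offset 0,  size 16 bytes  (fp32 vec4)
 *      offset 16, size 4 bytes   (fp16 vec2)
 *
 * leaving gen_stride = 20 bytes per vertex for the general buffer. */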
1855
1856 /* Higher-level wrapper around the emitters above, classifying a varying as a
1857  * special record, an XFB capture, or a general varying */
1858
1859 static void
1860 panfrost_emit_varying(
1861 struct mali_attribute_packed *out,
1862 struct panfrost_shader_state *stage,
1863 struct panfrost_shader_state *other,
1864 struct panfrost_shader_state *xfb,
1865 unsigned present,
1866 unsigned max_xfb,
1867 unsigned *streamout_offsets,
1868 unsigned quirks,
1869 unsigned *gen_offsets,
1870 enum mali_format *gen_formats,
1871 unsigned *gen_stride,
1872 unsigned idx,
1873 bool should_alloc,
1874 bool is_fragment)
1875 {
1876 gl_varying_slot loc = stage->varyings_loc[idx];
1877 enum mali_format format = stage->varyings[idx];
1878
1879 /* Override format to match linkage */
1880 if (!should_alloc && gen_formats[idx])
1881 format = gen_formats[idx];
1882
1883 if (has_point_coord(stage->point_sprite_mask, loc)) {
1884 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1885 } else if (panfrost_xfb_captured(xfb, loc, max_xfb)) {
1886 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1887 pan_emit_vary_xfb(out, present, max_xfb, streamout_offsets, quirks, format, *o);
1888 } else if (loc == VARYING_SLOT_POS) {
1889 if (is_fragment)
1890 pan_emit_vary_special(out, present, PAN_VARY_FRAGCOORD, quirks);
1891 else
1892 pan_emit_vary_special(out, present, PAN_VARY_POSITION, quirks);
1893 } else if (loc == VARYING_SLOT_PSIZ) {
1894 pan_emit_vary_special(out, present, PAN_VARY_PSIZ, quirks);
1895 } else if (loc == VARYING_SLOT_PNTC) {
1896 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1897 } else if (loc == VARYING_SLOT_FACE) {
1898 pan_emit_vary_special(out, present, PAN_VARY_FACE, quirks);
1899 } else {
1900 pan_emit_general_varying(out, other, xfb, loc, format, present,
1901 quirks, gen_offsets, gen_formats, gen_stride,
1902 idx, should_alloc);
1903 }
1904 }
1905
1906 static void
1907 pan_emit_special_input(struct mali_attribute_buffer_packed *out,
1908 unsigned present,
1909 enum pan_special_varying v,
1910 unsigned special)
1911 {
1912 if (present & (1 << v)) {
1913 unsigned idx = pan_varying_index(present, v);
1914
1915 pan_pack(out + idx, ATTRIBUTE_BUFFER, cfg) {
1916 cfg.special = special;
1917 cfg.type = 0;
1918 }
1919 }
1920 }
1921
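/* Rough sketch of the uploads done by the descriptor emission below; an
 * illustrative summary, not authoritative documentation of the layout:
 *
 *   trans: [ vs->varying_count ATTRIBUTE records | fs->varying_count ATTRIBUTE records ]
 *   T:     [ popcount(present) special/general ATTRIBUTE_BUFFERs | one per streamout target ]
 *
 * vertex_postfix->varying_meta points at the VS records and
 * tiler_postfix->varying_meta at the FS records (trans.gpu + vs_size), while
 * both postfixes share T.gpu for the buffer array. */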
1922 void
1923 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1924 unsigned vertex_count,
1925 struct mali_vertex_tiler_postfix *vertex_postfix,
1926 struct mali_vertex_tiler_postfix *tiler_postfix,
1927 union midgard_primitive_size *primitive_size)
1928 {
1929 /* Load the shaders */
1930 struct panfrost_context *ctx = batch->ctx;
1931 struct panfrost_device *dev = pan_device(ctx->base.screen);
1932 struct panfrost_shader_state *vs, *fs;
1933 size_t vs_size, fs_size;
1934
1935 /* Allocate the varying descriptor */
1936
1937 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1938 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1939 vs_size = MALI_ATTRIBUTE_LENGTH * vs->varying_count;
1940 fs_size = MALI_ATTRIBUTE_LENGTH * fs->varying_count;
1941
1942 struct panfrost_transfer trans = panfrost_pool_alloc_aligned(
1943 &batch->pool, vs_size + fs_size, MALI_ATTRIBUTE_LENGTH);
1944
1945 struct pipe_stream_output_info *so = &vs->stream_output;
1946 unsigned present = pan_varying_present(vs, fs, dev->quirks);
1947
1948 /* Check if this varying is linked by us. This is the case for
1949 * general-purpose, non-captured varyings. If it is, link it. If it's
1950 * not, use the provided stream out information to determine the
1951 * offset, since it was already linked for us. */
1952
1953 unsigned gen_offsets[32];
1954 enum mali_format gen_formats[32];
1955 memset(gen_offsets, 0, sizeof(gen_offsets));
1956 memset(gen_formats, 0, sizeof(gen_formats));
1957
1958 unsigned gen_stride = 0;
1959 assert(vs->varying_count < ARRAY_SIZE(gen_offsets));
1960 assert(fs->varying_count < ARRAY_SIZE(gen_offsets));
1961
1962 unsigned streamout_offsets[32];
1963
1964 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1965 streamout_offsets[i] = panfrost_streamout_offset(
1966 so->stride[i],
1967 ctx->streamout.offsets[i],
1968 ctx->streamout.targets[i]);
1969 }
1970
1971 struct mali_attribute_packed *ovs = (struct mali_attribute_packed *)trans.cpu;
1972 struct mali_attribute_packed *ofs = ovs + vs->varying_count;
1973
1974 for (unsigned i = 0; i < vs->varying_count; i++) {
1975 panfrost_emit_varying(ovs + i, vs, fs, vs, present,
1976 ctx->streamout.num_targets, streamout_offsets,
1977 dev->quirks,
1978 gen_offsets, gen_formats, &gen_stride, i, true, false);
1979 }
1980
1981 for (unsigned i = 0; i < fs->varying_count; i++) {
1982 panfrost_emit_varying(ofs + i, fs, vs, vs, present,
1983 ctx->streamout.num_targets, streamout_offsets,
1984 dev->quirks,
1985 gen_offsets, gen_formats, &gen_stride, i, false, true);
1986 }
1987
1988 unsigned xfb_base = pan_xfb_base(present);
1989 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1990 MALI_ATTRIBUTE_BUFFER_LENGTH * (xfb_base + ctx->streamout.num_targets),
1991 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
1992 struct mali_attribute_buffer_packed *varyings =
1993 (struct mali_attribute_buffer_packed *) T.cpu;
1994
1995 /* Emit the stream out buffers */
1996
1997 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1998 ctx->vertex_count);
1999
2000 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
2001 panfrost_emit_streamout(batch, &varyings[xfb_base + i],
2002 so->stride[i],
2003 ctx->streamout.offsets[i],
2004 out_count,
2005 ctx->streamout.targets[i]);
2006 }
2007
2008 panfrost_emit_varyings(batch,
2009 &varyings[pan_varying_index(present, PAN_VARY_GENERAL)],
2010 gen_stride, vertex_count);
2011
2012 /* fp32 vec4 gl_Position */
2013 tiler_postfix->position_varying = panfrost_emit_varyings(batch,
2014 &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
2015 sizeof(float) * 4, vertex_count);
2016
2017 if (present & (1 << PAN_VARY_PSIZ)) {
2018 primitive_size->pointer = panfrost_emit_varyings(batch,
2019 &varyings[pan_varying_index(present, PAN_VARY_PSIZ)],
2020 2, vertex_count);
2021 }
2022
2023 pan_emit_special_input(varyings, present, PAN_VARY_PNTCOORD, MALI_ATTRIBUTE_SPECIAL_POINT_COORD);
2024 pan_emit_special_input(varyings, present, PAN_VARY_FACE, MALI_ATTRIBUTE_SPECIAL_FRONT_FACING);
2025 pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD, MALI_ATTRIBUTE_SPECIAL_FRAG_COORD);
2026
2027 vertex_postfix->varyings = T.gpu;
2028 tiler_postfix->varyings = T.gpu;
2029
2030 vertex_postfix->varying_meta = trans.gpu;
2031 tiler_postfix->varying_meta = trans.gpu + vs_size;
2032 }
2033
2034 void
2035 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
2036 struct mali_vertex_tiler_prefix *vertex_prefix,
2037 struct mali_vertex_tiler_postfix *vertex_postfix,
2038 struct mali_vertex_tiler_prefix *tiler_prefix,
2039 struct mali_vertex_tiler_postfix *tiler_postfix,
2040 union midgard_primitive_size *primitive_size)
2041 {
2042 struct panfrost_context *ctx = batch->ctx;
2043 struct panfrost_device *device = pan_device(ctx->base.screen);
2044 bool wallpapering = ctx->wallpaper_batch && batch->scoreboard.tiler_dep;
2045 struct bifrost_payload_vertex bifrost_vertex = {0,};
2046 struct bifrost_payload_tiler bifrost_tiler = {0,};
2047 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
2048 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
2049 void *vp, *tp;
2050 size_t vp_size, tp_size;
2051
2052 if (device->quirks & IS_BIFROST) {
2053 bifrost_vertex.prefix = *vertex_prefix;
2054 bifrost_vertex.postfix = *vertex_postfix;
2055 vp = &bifrost_vertex;
2056 vp_size = sizeof(bifrost_vertex);
2057
2058 bifrost_tiler.prefix = *tiler_prefix;
2059 bifrost_tiler.tiler.primitive_size = *primitive_size;
2060 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
2061 bifrost_tiler.postfix = *tiler_postfix;
2062 tp = &bifrost_tiler;
2063 tp_size = sizeof(bifrost_tiler);
2064 } else {
2065 midgard_vertex.prefix = *vertex_prefix;
2066 midgard_vertex.postfix = *vertex_postfix;
2067 vp = &midgard_vertex;
2068 vp_size = sizeof(midgard_vertex);
2069
2070 midgard_tiler.prefix = *tiler_prefix;
2071 midgard_tiler.postfix = *tiler_postfix;
2072 midgard_tiler.primitive_size = *primitive_size;
2073 tp = &midgard_tiler;
2074 tp_size = sizeof(midgard_tiler);
2075 }
2076
2077 if (wallpapering) {
2078 /* Inject in reverse order, with "predicted" job indices.
2079 * THIS IS A HACK XXX */
2080 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false,
2081 batch->scoreboard.job_index + 2, tp, tp_size, true);
2082 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2083 vp, vp_size, true);
2084 return;
2085 }
2086
2087         /* If rasterizer discard is enabled, only submit the vertex job */
2088
2089 unsigned vertex = panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2090 vp, vp_size, false);
2091
2092 if (ctx->rasterizer->base.rasterizer_discard)
2093 return;
2094
2095 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false, vertex, tp, tp_size,
2096 false);
2097 }
2098
2099 /* TODO: stop hardcoding this */
2100 mali_ptr
2101 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2102 {
2103 uint16_t locations[] = {
2104 128, 128,
2105 0, 256,
2106 0, 256,
2107 0, 256,
2108 0, 256,
2109 0, 256,
2110 0, 256,
2111 0, 256,
2112 0, 256,
2113 0, 256,
2114 0, 256,
2115 0, 256,
2116 0, 256,
2117 0, 256,
2118 0, 256,
2119 0, 256,
2120 0, 256,
2121 0, 256,
2122 0, 256,
2123 0, 256,
2124 0, 256,
2125 0, 256,
2126 0, 256,
2127 0, 256,
2128 0, 256,
2129 0, 256,
2130 0, 256,
2131 0, 256,
2132 0, 256,
2133 0, 256,
2134 0, 256,
2135 0, 256,
2136 128, 128,
2137 0, 0,
2138 0, 0,
2139 0, 0,
2140 0, 0,
2141 0, 0,
2142 0, 0,
2143 0, 0,
2144 0, 0,
2145 0, 0,
2146 0, 0,
2147 0, 0,
2148 0, 0,
2149 0, 0,
2150 0, 0,
2151 0, 0,
2152 };
2153
2154 return panfrost_pool_upload_aligned(&batch->pool, locations, 96 * sizeof(uint16_t), 64);
2155 }