panfrost: Don't mask coverage mask to 4-bits
mesa.git / src/gallium/drivers/panfrost/pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_pool.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
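/* Uploads a shared memory descriptor pointing at the batch's scratchpad so
 * jobs have stack space. Used on Bifrost; on Midgard the same postfix slot
 * instead carries the framebuffer pointer (see below). */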
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 unsigned shift = panfrost_get_stack_shift(batch->stack_size);
62 struct mali_shared_memory shared = {
63 .stack_shift = shift,
64 .scratchpad = panfrost_batch_get_scratchpad(batch, shift, dev->thread_tls_alloc, dev->core_count)->gpu,
65 .shared_workgroup_count = ~0,
66 };
67 postfix->shared_memory = panfrost_pool_upload(&batch->pool, &shared, sizeof(shared));
68 }
69
70 static void
71 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
72 struct mali_vertex_tiler_postfix *postfix)
73 {
74 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
75 postfix->shared_memory = panfrost_batch_reserve_framebuffer(batch);
76 }
77
78 static void
79 panfrost_vt_update_rasterizer(struct panfrost_context *ctx,
80 struct mali_vertex_tiler_prefix *prefix,
81 struct mali_vertex_tiler_postfix *postfix)
82 {
83 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
84
85 postfix->gl_enables |= 0x7;
86 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
87 rasterizer && rasterizer->base.front_ccw);
88 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
89 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_FRONT));
90 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
91 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_BACK));
92 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
93 rasterizer && rasterizer->base.flatshade_first);
94 }
95
96 void
97 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
98 struct mali_vertex_tiler_prefix *prefix,
99 union midgard_primitive_size *primitive_size)
100 {
101 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
102
103 if (!panfrost_writes_point_size(ctx)) {
104 bool points = prefix->draw_mode == MALI_DRAW_MODE_POINTS;
105 float val = 0.0f;
106
107 if (rasterizer)
108 val = points ?
109 rasterizer->base.point_size :
110 rasterizer->base.line_width;
111
112 primitive_size->constant = val;
113 }
114 }
115
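/* Points the postfix at the active occlusion query's counter BO (retaining it
 * for the fragment part of the batch), or clears the pointer if no query is
 * bound. */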
116 static void
117 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
118 struct mali_vertex_tiler_postfix *postfix)
119 {
120 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
121 if (ctx->occlusion_query) {
122 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
123 panfrost_batch_add_bo(ctx->batch, ctx->occlusion_query->bo,
124 PAN_BO_ACCESS_SHARED |
125 PAN_BO_ACCESS_RW |
126 PAN_BO_ACCESS_FRAGMENT);
127 } else {
128 postfix->occlusion_counter = 0;
129 }
130 }
131
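/* Common initialization for the vertex/tiler prefix/postfix pair. Note the
 * shared_memory slot means different things on Bifrost (shared memory
 * descriptor) and Midgard (framebuffer descriptor), hence the split below. */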
132 void
133 panfrost_vt_init(struct panfrost_context *ctx,
134 enum pipe_shader_type stage,
135 struct mali_vertex_tiler_prefix *prefix,
136 struct mali_vertex_tiler_postfix *postfix)
137 {
138 struct panfrost_device *device = pan_device(ctx->base.screen);
139
140 if (!ctx->shader[stage])
141 return;
142
143 memset(prefix, 0, sizeof(*prefix));
144 memset(postfix, 0, sizeof(*postfix));
145
146 if (device->quirks & IS_BIFROST) {
147 postfix->gl_enables = 0x2;
148 panfrost_vt_emit_shared_memory(ctx, postfix);
149 } else {
150 postfix->gl_enables = 0x6;
151 panfrost_vt_attach_framebuffer(ctx, postfix);
152 }
153
154 if (stage == PIPE_SHADER_FRAGMENT) {
155 panfrost_vt_update_occlusion_query(ctx, postfix);
156 panfrost_vt_update_rasterizer(ctx, prefix, postfix);
157 }
158 }
159
160 static unsigned
161 panfrost_translate_index_size(unsigned size)
162 {
163 switch (size) {
164 case 1:
165 return MALI_DRAW_INDEXED_UINT8;
166
167 case 2:
168 return MALI_DRAW_INDEXED_UINT16;
169
170 case 4:
171 return MALI_DRAW_INDEXED_UINT32;
172
173 default:
174 unreachable("Invalid index size");
175 }
176 }
177
178 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
179  * good for the duration of the draw (transient), though it may last longer. Also get
180 * the bounds on the index buffer for the range accessed by the draw. We do
181 * these operations together because there are natural optimizations which
182 * require them to be together. */
183
184 static mali_ptr
185 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
186 const struct pipe_draw_info *info,
187 unsigned *min_index, unsigned *max_index)
188 {
189 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
190 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
191 off_t offset = info->start * info->index_size;
192 bool needs_indices = true;
193 mali_ptr out = 0;
194
195 if (info->max_index != ~0u) {
196 *min_index = info->min_index;
197 *max_index = info->max_index;
198 needs_indices = false;
199 }
200
201 if (!info->has_user_indices) {
202 /* Only resources can be directly mapped */
203 panfrost_batch_add_bo(batch, rsrc->bo,
204 PAN_BO_ACCESS_SHARED |
205 PAN_BO_ACCESS_READ |
206 PAN_BO_ACCESS_VERTEX_TILER);
207 out = rsrc->bo->gpu + offset;
208
209 /* Check the cache */
210 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
211 info->start,
212 info->count,
213 min_index,
214 max_index);
215 } else {
216 /* Otherwise, we need to upload to transient memory */
217 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
218 out = panfrost_pool_upload(&batch->pool, ibuf8 + offset,
219 info->count *
220 info->index_size);
221 }
222
223 if (needs_indices) {
224 /* Fallback */
225 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
226
227 if (!info->has_user_indices)
228 panfrost_minmax_cache_add(rsrc->index_cache,
229 info->start, info->count,
230 *min_index, *max_index);
231 }
232
233 return out;
234 }
235
236 void
237 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
238 const struct pipe_draw_info *info,
239 enum mali_draw_mode draw_mode,
240 struct mali_vertex_tiler_postfix *vertex_postfix,
241 struct mali_vertex_tiler_prefix *tiler_prefix,
242 struct mali_vertex_tiler_postfix *tiler_postfix,
243 unsigned *vertex_count,
244 unsigned *padded_count)
245 {
246 tiler_prefix->draw_mode = draw_mode;
247
248 unsigned draw_flags = 0;
249
250 if (panfrost_writes_point_size(ctx))
251 draw_flags |= MALI_DRAW_VARYING_SIZE;
252
253 if (info->primitive_restart)
254 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
255
256 /* These don't make much sense */
257
258 draw_flags |= 0x3000;
259
260 if (info->index_size) {
261 unsigned min_index = 0, max_index = 0;
262
263 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
264 info,
265 &min_index,
266 &max_index);
267
268 /* Use the corresponding values */
269 *vertex_count = max_index - min_index + 1;
270 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
271 tiler_prefix->offset_bias_correction = -min_index;
272 tiler_prefix->index_count = MALI_POSITIVE(info->count);
273 draw_flags |= panfrost_translate_index_size(info->index_size);
274 } else {
275 tiler_prefix->indices = 0;
276 *vertex_count = ctx->vertex_count;
277 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
278 tiler_prefix->offset_bias_correction = 0;
279 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
280 }
281
282 tiler_prefix->unknown_draw = draw_flags;
283
284 /* Encode the padded vertex count */
285
286 if (info->instance_count > 1) {
287 *padded_count = panfrost_padded_vertex_count(*vertex_count);
288
289 unsigned shift = __builtin_ctz(ctx->padded_count);
290 unsigned k = ctx->padded_count >> (shift + 1);
291
292 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
293 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
294 } else {
295 *padded_count = *vertex_count;
296
297 /* Reset instancing state */
298 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
299 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
300 }
301 }
302
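/* Fills out a shader descriptor from the compiled shader state: shader
 * address and first tag, attribute/varying/texture/sampler counts, and the
 * architecture-specific (Midgard vs. Bifrost) control words. */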
303 static void
304 panfrost_shader_meta_init(struct panfrost_context *ctx,
305 enum pipe_shader_type st,
306 struct mali_shader_meta *meta)
307 {
308 const struct panfrost_device *dev = pan_device(ctx->base.screen);
309 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
310
311 memset(meta, 0, sizeof(*meta));
312 meta->shader = (ss->bo ? ss->bo->gpu : 0) | ss->first_tag;
313 meta->attribute_count = ss->attribute_count;
314 meta->varying_count = ss->varying_count;
315 meta->texture_count = ctx->sampler_view_count[st];
316 meta->sampler_count = ctx->sampler_count[st];
317
318 if (dev->quirks & IS_BIFROST) {
319 if (st == PIPE_SHADER_VERTEX)
320 meta->bifrost1.unk1 = 0x800000;
321 else {
322 /* First clause ATEST |= 0x4000000.
323 * Less than 32 regs |= 0x200 */
324 meta->bifrost1.unk1 = 0x950020;
325 }
326
327 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
328 if (st == PIPE_SHADER_VERTEX)
329 meta->bifrost2.preload_regs = 0xC0;
330 else {
331 meta->bifrost2.preload_regs = 0x1;
332 SET_BIT(meta->bifrost2.preload_regs, 0x10, ss->reads_frag_coord);
333 }
334
335 meta->bifrost2.uniform_count = MIN2(ss->uniform_count,
336 ss->uniform_cutoff);
337 } else {
338 meta->midgard1.uniform_count = MIN2(ss->uniform_count,
339 ss->uniform_cutoff);
340 meta->midgard1.work_count = ss->work_reg_count;
341
342 /* TODO: This is not conformant on ES3 */
343 meta->midgard1.flags_hi = MALI_SUPPRESS_INF_NAN;
344
345 meta->midgard1.flags_lo = 0x20;
346 meta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
347
348 SET_BIT(meta->midgard1.flags_hi, MALI_WRITES_GLOBAL, ss->writes_global);
349 }
350 }
351
352 static unsigned
353 translate_tex_wrap(enum pipe_tex_wrap w)
354 {
355 switch (w) {
356 case PIPE_TEX_WRAP_REPEAT:
357 return MALI_WRAP_MODE_REPEAT;
358
359 case PIPE_TEX_WRAP_CLAMP:
360 return MALI_WRAP_MODE_CLAMP;
361
362 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
363 return MALI_WRAP_MODE_CLAMP_TO_EDGE;
364
365 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
366 return MALI_WRAP_MODE_CLAMP_TO_BORDER;
367
368 case PIPE_TEX_WRAP_MIRROR_REPEAT:
369 return MALI_WRAP_MODE_MIRRORED_REPEAT;
370
371 case PIPE_TEX_WRAP_MIRROR_CLAMP:
372 return MALI_WRAP_MODE_MIRRORED_CLAMP;
373
374 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
375 return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE;
376
377 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
378 return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER;
379
380 default:
381 unreachable("Invalid wrap");
382 }
383 }
384
385 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
386 struct mali_sampler_descriptor *hw)
387 {
388 unsigned func = panfrost_translate_compare_func(cso->compare_func);
389 bool min_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
390 bool mag_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
391 bool mip_linear = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR;
392 unsigned min_filter = min_nearest ? MALI_SAMP_MIN_NEAREST : 0;
393 unsigned mag_filter = mag_nearest ? MALI_SAMP_MAG_NEAREST : 0;
394 unsigned mip_filter = mip_linear ?
395 (MALI_SAMP_MIP_LINEAR_1 | MALI_SAMP_MIP_LINEAR_2) : 0;
396 unsigned normalized = cso->normalized_coords ? MALI_SAMP_NORM_COORDS : 0;
397
398 *hw = (struct mali_sampler_descriptor) {
399 .filter_mode = min_filter | mag_filter | mip_filter |
400 normalized,
401 .wrap_s = translate_tex_wrap(cso->wrap_s),
402 .wrap_t = translate_tex_wrap(cso->wrap_t),
403 .wrap_r = translate_tex_wrap(cso->wrap_r),
404 .compare_func = cso->compare_mode ?
405 panfrost_flip_compare_func(func) :
406 MALI_FUNC_NEVER,
407 .border_color = {
408 cso->border_color.f[0],
409 cso->border_color.f[1],
410 cso->border_color.f[2],
411 cso->border_color.f[3]
412 },
413 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
414 .max_lod = FIXED_16(cso->max_lod, false),
415 .lod_bias = FIXED_16(cso->lod_bias, true), /* can be negative */
416 .seamless_cube_map = cso->seamless_cube_map,
417 };
418
419 /* If necessary, we disable mipmapping in the sampler descriptor by
420 * clamping the LOD as tight as possible (from 0 to epsilon,
421 * essentially -- remember these are fixed point numbers, so
422 * epsilon=1/256) */
423
424 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
425 hw->max_lod = hw->min_lod + 1;
426 }
427
428 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
429 struct bifrost_sampler_descriptor *hw)
430 {
431 *hw = (struct bifrost_sampler_descriptor) {
432 .unk1 = 0x1,
433 .wrap_s = translate_tex_wrap(cso->wrap_s),
434 .wrap_t = translate_tex_wrap(cso->wrap_t),
435 .wrap_r = translate_tex_wrap(cso->wrap_r),
436 .unk8 = 0x8,
437 .min_filter = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST,
438 .norm_coords = cso->normalized_coords,
439 .mip_filter = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR,
440 .mag_filter = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR,
441 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
442 .max_lod = FIXED_16(cso->max_lod, false),
443 };
444
445 /* If necessary, we disable mipmapping in the sampler descriptor by
446 * clamping the LOD as tight as possible (from 0 to epsilon,
447 * essentially -- remember these are fixed point numbers, so
448 * epsilon=1/256) */
449
450 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
451 hw->max_lod = hw->min_lod + 1;
452 }
453
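/* Applies rasterizer state (MSAA, depth bias, depth clipping) to the fragment
 * shader descriptor, falling back to defaults when no rasterizer is bound. */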
454 static void
455 panfrost_frag_meta_rasterizer_update(struct panfrost_context *ctx,
456 struct mali_shader_meta *fragmeta)
457 {
458 if (!ctx->rasterizer) {
459 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, true);
460 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, false);
461 fragmeta->depth_units = 0.0f;
462 fragmeta->depth_factor = 0.0f;
463 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, false);
464 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, false);
465 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_NEAR, true);
466 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_FAR, true);
467 return;
468 }
469
470 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
471
472 bool msaa = rast->multisample;
473
474 /* TODO: Sample size */
475 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
476 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
477
478 struct panfrost_shader_state *fs;
479 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
480
481 /* EXT_shader_framebuffer_fetch requires the shader to be run
482 * per-sample when outputs are read. */
483 bool per_sample = ctx->min_samples > 1 || fs->outputs_read;
484 SET_BIT(fragmeta->unknown2_3, MALI_PER_SAMPLE, msaa && per_sample);
485
486 fragmeta->depth_units = rast->offset_units * 2.0f;
487 fragmeta->depth_factor = rast->offset_scale;
488
489 /* XXX: Which bit is which? Does this maybe allow offsetting not-tri? */
490
491 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, rast->offset_tri);
492 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, rast->offset_tri);
493
494 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_NEAR, rast->depth_clip_near);
495 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_FAR, rast->depth_clip_far);
496 }
497
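/* Applies depth/stencil state: stencil masks, the packed stencil words (with
 * the reference value in the bottom bits), the depth write mask and the depth
 * compare function. */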
498 static void
499 panfrost_frag_meta_zsa_update(struct panfrost_context *ctx,
500 struct mali_shader_meta *fragmeta)
501 {
502 const struct panfrost_zsa_state *so = ctx->depth_stencil;
503 int zfunc = PIPE_FUNC_ALWAYS;
504
505 if (!so) {
506 /* If no depth/stencil state is bound, disable stencil testing and depth writes */
507 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST, false);
508 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK, false);
509 } else {
510 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
511 so->base.stencil[0].enabled);
512
513 fragmeta->stencil_mask_front = so->stencil_mask_front;
514 fragmeta->stencil_mask_back = so->stencil_mask_back;
515
516 /* Bottom bits for stencil ref, exactly one word */
517 fragmeta->stencil_front.opaque[0] = so->stencil_front.opaque[0] | ctx->stencil_ref.ref_value[0];
518
519 /* If back-stencil is not enabled, use the front values */
520
521 if (so->base.stencil[1].enabled)
522 fragmeta->stencil_back.opaque[0] = so->stencil_back.opaque[0] | ctx->stencil_ref.ref_value[1];
523 else
524 fragmeta->stencil_back = fragmeta->stencil_front;
525
526 if (so->base.depth.enabled)
527 zfunc = so->base.depth.func;
528
529 /* Depth state (TODO: Refactor) */
530
531 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
532 so->base.depth.writemask);
533 }
534
535 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
536 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(zfunc));
537 }
538
539 static bool
540 panfrost_fs_required(
541 struct panfrost_shader_state *fs,
542 struct panfrost_blend_final *blend,
543 unsigned rt_count)
544 {
545 /* If we generally have side effects */
546 if (fs->fs_sidefx)
547 return true;
548
549 /* If colour is written we need to execute */
550 for (unsigned i = 0; i < rt_count; ++i) {
551 if (!blend[i].no_colour)
552 return true;
553 }
554
555 /* If depth is written and not implied we need to execute.
556 * TODO: Predicate on Z/S writes being enabled */
557 return (fs->writes_depth || fs->writes_stencil);
558 }
559
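/* Configures blending in the fragment shader descriptor and, for MFBD, in the
 * per-render-target blend records that follow it (rts). SFBD keeps RT0's
 * blend state inline; the shader may also be nulled out entirely when the
 * hardware allows it and no fragment work is required. */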
560 static void
561 panfrost_frag_meta_blend_update(struct panfrost_context *ctx,
562 struct mali_shader_meta *fragmeta,
563 void *rts)
564 {
565 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
566 const struct panfrost_device *dev = pan_device(ctx->base.screen);
567 struct panfrost_shader_state *fs;
568 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
569
570 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
571 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
572 !ctx->blend->base.dither);
573
574 SET_BIT(fragmeta->unknown2_4, MALI_ALPHA_TO_COVERAGE,
575 ctx->blend->base.alpha_to_coverage);
576
577 /* Get blending setup */
578 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
579
580 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
581 unsigned shader_offset = 0;
582 struct panfrost_bo *shader_bo = NULL;
583
584 for (unsigned c = 0; c < rt_count; ++c)
585 blend[c] = panfrost_get_blend_for_context(ctx, c, &shader_bo,
586 &shader_offset);
587
588 /* Disable shader execution if we can */
589 if (dev->quirks & MIDGARD_SHADERLESS
590 && !panfrost_fs_required(fs, blend, rt_count)) {
591 fragmeta->shader = 0;
592 fragmeta->attribute_count = 0;
593 fragmeta->varying_count = 0;
594 fragmeta->texture_count = 0;
595 fragmeta->sampler_count = 0;
596
597 /* This feature is not known to work on Bifrost */
598 fragmeta->midgard1.work_count = 1;
599 fragmeta->midgard1.uniform_count = 0;
600 fragmeta->midgard1.uniform_buffer_count = 0;
601 }
602
603 /* If there is a blend shader, work registers are shared. We impose 8
604 * work registers as a limit for blend shaders. Should be lower XXX */
605
606 if (!(dev->quirks & IS_BIFROST)) {
607 for (unsigned c = 0; c < rt_count; ++c) {
608 if (blend[c].is_shader) {
609 fragmeta->midgard1.work_count =
610 MAX2(fragmeta->midgard1.work_count, 8);
611 }
612 }
613 }
614
615 /* Even on MFBD, the shader descriptor gets blend shaders. It's *also*
616 * copied to the blend_meta appended (by convention), but this is the
617 * field actually read by the hardware. (Or maybe both are read...?).
618 * Specify the last RTi with a blend shader. */
619
620 fragmeta->blend.shader = 0;
621
622 for (signed rt = (rt_count - 1); rt >= 0; --rt) {
623 if (!blend[rt].is_shader)
624 continue;
625
626 fragmeta->blend.shader = blend[rt].shader.gpu |
627 blend[rt].shader.first_tag;
628 break;
629 }
630
631 if (dev->quirks & MIDGARD_SFBD) {
632 * On platforms with only a single render target (SFBD), the blend
633 * information is inside the shader meta itself. We additionally
634 * need to signal CAN_DISCARD for nontrivial blend modes (so
635 * we're able to read back the destination buffer) */
636
637 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
638 blend[0].is_shader);
639
640 if (!blend[0].is_shader) {
641 fragmeta->blend.equation = *blend[0].equation.equation;
642 fragmeta->blend.constant = blend[0].equation.constant;
643 }
644
645 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
646 !blend[0].no_blending || fs->can_discard);
647
648 batch->draws |= PIPE_CLEAR_COLOR0;
649 return;
650 }
651
652 if (dev->quirks & IS_BIFROST) {
653 bool no_blend = true;
654
655 for (unsigned i = 0; i < rt_count; ++i)
656 no_blend &= (blend[i].no_blending | blend[i].no_colour);
657
658 SET_BIT(fragmeta->bifrost1.unk1, MALI_BIFROST_EARLY_Z,
659 !fs->can_discard && !fs->writes_depth && no_blend);
660 }
661
662 /* Additional blend descriptor tacked on for jobs using MFBD */
663
664 for (unsigned i = 0; i < rt_count; ++i) {
665 unsigned flags = 0;
666
667 if (ctx->pipe_framebuffer.nr_cbufs > i && !blend[i].no_colour) {
668 flags = 0x200;
669 batch->draws |= (PIPE_CLEAR_COLOR0 << i);
670
671 bool is_srgb = (ctx->pipe_framebuffer.nr_cbufs > i) &&
672 (ctx->pipe_framebuffer.cbufs[i]) &&
673 util_format_is_srgb(ctx->pipe_framebuffer.cbufs[i]->format);
674
675 SET_BIT(flags, MALI_BLEND_MRT_SHADER, blend[i].is_shader);
676 SET_BIT(flags, MALI_BLEND_LOAD_TIB, !blend[i].no_blending);
677 SET_BIT(flags, MALI_BLEND_SRGB, is_srgb);
678 SET_BIT(flags, MALI_BLEND_NO_DITHER, !ctx->blend->base.dither);
679 }
680
681 if (dev->quirks & IS_BIFROST) {
682 struct bifrost_blend_rt *brts = rts;
683
684 brts[i].flags = flags;
685
686 if (blend[i].is_shader) {
687 /* The blend shader's address needs to have
688  * the same top 32 bits as the fragment shader's.
689 * TODO: Ensure that's always the case.
690 */
691 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
692 (fs->bo->gpu & (0xffffffffull << 32)));
693 brts[i].shader = blend[i].shader.gpu;
694 brts[i].unk2 = 0x0;
695 } else if (ctx->pipe_framebuffer.nr_cbufs > i) {
696 enum pipe_format format = ctx->pipe_framebuffer.cbufs[i]->format;
697 const struct util_format_description *format_desc;
698 format_desc = util_format_description(format);
699
700 brts[i].equation = *blend[i].equation.equation;
701
702 /* TODO: this is a bit more complicated */
703 brts[i].constant = blend[i].equation.constant;
704
705 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
706
707 /* 0x19 disables blending and forces REPLACE
708 * mode (equivalent to rgb_mode = alpha_mode =
709 * x122, colour mask = 0xF). 0x1a allows
710 * blending. */
711 brts[i].unk2 = blend[i].no_blending ? 0x19 : 0x1a;
712
713 brts[i].shader_type = fs->blend_types[i];
714 } else {
715 /* Dummy attachment for depth-only */
716 brts[i].unk2 = 0x3;
717 brts[i].shader_type = fs->blend_types[i];
718 }
719 } else {
720 struct midgard_blend_rt *mrts = rts;
721 mrts[i].flags = flags;
722
723 if (blend[i].is_shader) {
724 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
725 } else {
726 mrts[i].blend.equation = *blend[i].equation.equation;
727 mrts[i].blend.constant = blend[i].equation.constant;
728 }
729 }
730 }
731 }
732
733 static void
734 panfrost_frag_shader_meta_init(struct panfrost_context *ctx,
735 struct mali_shader_meta *fragmeta,
736 void *rts)
737 {
738 const struct panfrost_device *dev = pan_device(ctx->base.screen);
739 struct panfrost_shader_state *fs;
740
741 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
742
743 bool msaa = ctx->rasterizer && ctx->rasterizer->base.multisample;
744 fragmeta->coverage_mask = msaa ? ctx->sample_mask : ~0;
745
746 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x10;
747 fragmeta->unknown2_4 = 0x4e0;
748
749 /* unknown2_4 has 0x10 bit set on T6XX and T720. We don't know why this
750 * is required (independent of 32-bit/64-bit descriptors), or why it's
751 * not used on later GPU revisions. Otherwise, all shader jobs fault on
752 * these earlier chips (perhaps this is a chicken bit of some kind).
753 * More investigation is needed. */
754
755 SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
756
757 if (dev->quirks & IS_BIFROST) {
758 /* TODO */
759 } else {
760 /* Depending on whether it's legal in the given shader, we try to
761 * enable early-z testing. TODO: respect e-z force */
762
763 SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
764 !fs->can_discard && !fs->writes_global &&
765 !fs->writes_depth && !fs->writes_stencil &&
766 !ctx->blend->base.alpha_to_coverage);
767
768 /* Add the writes Z/S flags if needed. */
769 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
770 SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
771
772 /* Any time texturing is used, derivatives are implicitly calculated,
773 * so we need to enable helper invocations */
774
775 SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
776 fs->helper_invocations);
777
778 /* If discard is enabled, which bit we set to convey this
779 * depends on whether depth/stencil is used for the draw or not.
780 * Just one of depth OR stencil is enough to trigger this. */
781
782 const struct pipe_depth_stencil_alpha_state *zsa = &ctx->depth_stencil->base;
783 bool zs_enabled = fs->writes_depth || fs->writes_stencil;
784
785 if (zsa) {
786 zs_enabled |= (zsa->depth.enabled && zsa->depth.func != PIPE_FUNC_ALWAYS);
787 zs_enabled |= zsa->stencil[0].enabled;
788 }
789
790 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_TILEBUFFER,
791 fs->outputs_read || (!zs_enabled && fs->can_discard));
792 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_ZS, zs_enabled && fs->can_discard);
793 }
794
795 panfrost_frag_meta_rasterizer_update(ctx, fragmeta);
796 panfrost_frag_meta_zsa_update(ctx, fragmeta);
797 panfrost_frag_meta_blend_update(ctx, fragmeta, rts);
798 }
799
800 void
801 panfrost_emit_shader_meta(struct panfrost_batch *batch,
802 enum pipe_shader_type st,
803 struct mali_vertex_tiler_postfix *postfix)
804 {
805 struct panfrost_context *ctx = batch->ctx;
806 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
807
808 if (!ss) {
809 postfix->shader = 0;
810 return;
811 }
812
813 struct mali_shader_meta meta;
814
815 panfrost_shader_meta_init(ctx, st, &meta);
816
817 /* Add the shader BO to the batch. */
818 panfrost_batch_add_bo(batch, ss->bo,
819 PAN_BO_ACCESS_PRIVATE |
820 PAN_BO_ACCESS_READ |
821 panfrost_bo_access_for_stage(st));
822
823 mali_ptr shader_ptr;
824
825 if (st == PIPE_SHADER_FRAGMENT) {
826 struct panfrost_device *dev = pan_device(ctx->base.screen);
827 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
828 size_t desc_size = sizeof(meta);
829 void *rts = NULL;
830 struct panfrost_transfer xfer;
831 unsigned rt_size;
832
833 if (dev->quirks & MIDGARD_SFBD)
834 rt_size = 0;
835 else if (dev->quirks & IS_BIFROST)
836 rt_size = sizeof(struct bifrost_blend_rt);
837 else
838 rt_size = sizeof(struct midgard_blend_rt);
839
840 desc_size += rt_size * rt_count;
841
842 if (rt_size)
843 rts = rzalloc_size(ctx, rt_size * rt_count);
844
845 panfrost_frag_shader_meta_init(ctx, &meta, rts);
846
847 xfer = panfrost_pool_alloc(&batch->pool, desc_size);
848
849 memcpy(xfer.cpu, &meta, sizeof(meta));
850 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
851
852 if (rt_size)
853 ralloc_free(rts);
854
855 shader_ptr = xfer.gpu;
856 } else {
857 shader_ptr = panfrost_pool_upload(&batch->pool, &meta,
858 sizeof(meta));
859 }
860
861 postfix->shader = shader_ptr;
862 }
863
864 void
865 panfrost_emit_viewport(struct panfrost_batch *batch,
866 struct mali_vertex_tiler_postfix *tiler_postfix)
867 {
868 struct panfrost_context *ctx = batch->ctx;
869 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
870 const struct pipe_scissor_state *ss = &ctx->scissor;
871 const struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
872 const struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer;
873
874 /* Derive min/max from translate/scale. Note since |x| >= 0 by
875 * definition, we have that -|x| <= |x| hence translate - |scale| <=
876 * translate + |scale|, so the ordering is correct here. */
877 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
878 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
879 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
880 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
881 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
882 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
883
884 /* Scissor to the intersection of the viewport and the scissor, clamped
885 * to the framebuffer */
886
887 unsigned minx = MIN2(fb->width, vp_minx);
888 unsigned maxx = MIN2(fb->width, vp_maxx);
889 unsigned miny = MIN2(fb->height, vp_miny);
890 unsigned maxy = MIN2(fb->height, vp_maxy);
891
892 if (ss && rast && rast->scissor) {
893 minx = MAX2(ss->minx, minx);
894 miny = MAX2(ss->miny, miny);
895 maxx = MIN2(ss->maxx, maxx);
896 maxy = MIN2(ss->maxy, maxy);
897 }
898
899 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool, MALI_VIEWPORT_LENGTH);
900
901 pan_pack(T.cpu, VIEWPORT, cfg) {
902 cfg.scissor_minimum_x = minx;
903 cfg.scissor_minimum_y = miny;
904 cfg.scissor_maximum_x = maxx - 1;
905 cfg.scissor_maximum_y = maxy - 1;
906
907 cfg.minimum_z = rast->depth_clip_near ? minz : -INFINITY;
908 cfg.maximum_z = rast->depth_clip_far ? maxz : INFINITY;
909 }
910
911 tiler_postfix->viewport = T.gpu;
912 panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy);
913 }
914
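/* Returns a GPU pointer for a constant buffer: resource-backed UBOs are
 * referenced in place (and retained by the batch), while user buffers are
 * copied to transient memory. */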
915 static mali_ptr
916 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
917 enum pipe_shader_type st,
918 struct panfrost_constant_buffer *buf,
919 unsigned index)
920 {
921 struct pipe_constant_buffer *cb = &buf->cb[index];
922 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
923
924 if (rsrc) {
925 panfrost_batch_add_bo(batch, rsrc->bo,
926 PAN_BO_ACCESS_SHARED |
927 PAN_BO_ACCESS_READ |
928 panfrost_bo_access_for_stage(st));
929
930 /* Alignment guaranteed by
931 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
932 return rsrc->bo->gpu + cb->buffer_offset;
933 } else if (cb->user_buffer) {
934 return panfrost_pool_upload(&batch->pool,
935 cb->user_buffer +
936 cb->buffer_offset,
937 cb->buffer_size);
938 } else {
939 unreachable("No constant buffer");
940 }
941 }
942
943 struct sysval_uniform {
944 union {
945 float f[4];
946 int32_t i[4];
947 uint32_t u[4];
948 uint64_t du[2];
949 };
950 };
951
952 static void
953 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
954 struct sysval_uniform *uniform)
955 {
956 struct panfrost_context *ctx = batch->ctx;
957 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
958
959 uniform->f[0] = vp->scale[0];
960 uniform->f[1] = vp->scale[1];
961 uniform->f[2] = vp->scale[2];
962 }
963
964 static void
965 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
966 struct sysval_uniform *uniform)
967 {
968 struct panfrost_context *ctx = batch->ctx;
969 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
970
971 uniform->f[0] = vp->translate[0];
972 uniform->f[1] = vp->translate[1];
973 uniform->f[2] = vp->translate[2];
974 }
975
976 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
977 enum pipe_shader_type st,
978 unsigned int sysvalid,
979 struct sysval_uniform *uniform)
980 {
981 struct panfrost_context *ctx = batch->ctx;
982 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
983 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
984 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
985 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
986
987 assert(dim);
988 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
989
990 if (dim > 1)
991 uniform->i[1] = u_minify(tex->texture->height0,
992 tex->u.tex.first_level);
993
994 if (dim > 2)
995 uniform->i[2] = u_minify(tex->texture->depth0,
996 tex->u.tex.first_level);
997
998 if (is_array)
999 uniform->i[dim] = tex->texture->array_size;
1000 }
1001
1002 static void
1003 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
1004 enum pipe_shader_type st,
1005 unsigned ssbo_id,
1006 struct sysval_uniform *uniform)
1007 {
1008 struct panfrost_context *ctx = batch->ctx;
1009
1010 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
1011 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
1012
1013 /* Compute address */
1014 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
1015
1016 panfrost_batch_add_bo(batch, bo,
1017 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
1018 panfrost_bo_access_for_stage(st));
1019
1020 /* Upload address and size as sysval */
1021 uniform->du[0] = bo->gpu + sb.buffer_offset;
1022 uniform->u[2] = sb.buffer_size;
1023 }
1024
1025 static void
1026 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1027 enum pipe_shader_type st,
1028 unsigned samp_idx,
1029 struct sysval_uniform *uniform)
1030 {
1031 struct panfrost_context *ctx = batch->ctx;
1032 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1033
1034 uniform->f[0] = sampl->min_lod;
1035 uniform->f[1] = sampl->max_lod;
1036 uniform->f[2] = sampl->lod_bias;
1037
1038 /* Even without any errata, Midgard represents "no mipmapping" as
1039 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1040 * panfrost_create_sampler_state which also explains our choice of
1041 * epsilon value (again to keep behaviour consistent) */
1042
1043 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1044 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1045 }
1046
1047 static void
1048 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1049 struct sysval_uniform *uniform)
1050 {
1051 struct panfrost_context *ctx = batch->ctx;
1052
1053 uniform->u[0] = ctx->compute_grid->grid[0];
1054 uniform->u[1] = ctx->compute_grid->grid[1];
1055 uniform->u[2] = ctx->compute_grid->grid[2];
1056 }
1057
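/* Walks the shader's sysval table and writes one 16-byte slot per sysval at
 * the start of the uniform area, dispatching on the sysval type. */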
1058 static void
1059 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1060 struct panfrost_shader_state *ss,
1061 enum pipe_shader_type st)
1062 {
1063 struct sysval_uniform *uniforms = (void *)buf;
1064
1065 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1066 int sysval = ss->sysval[i];
1067
1068 switch (PAN_SYSVAL_TYPE(sysval)) {
1069 case PAN_SYSVAL_VIEWPORT_SCALE:
1070 panfrost_upload_viewport_scale_sysval(batch,
1071 &uniforms[i]);
1072 break;
1073 case PAN_SYSVAL_VIEWPORT_OFFSET:
1074 panfrost_upload_viewport_offset_sysval(batch,
1075 &uniforms[i]);
1076 break;
1077 case PAN_SYSVAL_TEXTURE_SIZE:
1078 panfrost_upload_txs_sysval(batch, st,
1079 PAN_SYSVAL_ID(sysval),
1080 &uniforms[i]);
1081 break;
1082 case PAN_SYSVAL_SSBO:
1083 panfrost_upload_ssbo_sysval(batch, st,
1084 PAN_SYSVAL_ID(sysval),
1085 &uniforms[i]);
1086 break;
1087 case PAN_SYSVAL_NUM_WORK_GROUPS:
1088 panfrost_upload_num_work_groups_sysval(batch,
1089 &uniforms[i]);
1090 break;
1091 case PAN_SYSVAL_SAMPLER:
1092 panfrost_upload_sampler_sysval(batch, st,
1093 PAN_SYSVAL_ID(sysval),
1094 &uniforms[i]);
1095 break;
1096 default:
1097 assert(0);
1098 }
1099 }
1100 }
1101
1102 static const void *
1103 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1104 unsigned index)
1105 {
1106 struct pipe_constant_buffer *cb = &buf->cb[index];
1107 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1108
1109 if (rsrc)
1110 return rsrc->bo->cpu;
1111 else if (cb->user_buffer)
1112 return cb->user_buffer;
1113 else
1114 unreachable("No constant buffer");
1115 }
1116
1117 void
1118 panfrost_emit_const_buf(struct panfrost_batch *batch,
1119 enum pipe_shader_type stage,
1120 struct mali_vertex_tiler_postfix *postfix)
1121 {
1122 struct panfrost_context *ctx = batch->ctx;
1123 struct panfrost_shader_variants *all = ctx->shader[stage];
1124
1125 if (!all)
1126 return;
1127
1128 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1129
1130 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1131
1132 /* Uniforms are implicitly UBO #0 */
1133 bool has_uniforms = buf->enabled_mask & (1 << 0);
1134
1135 /* Allocate room for the sysval and the uniforms */
1136 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1137 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1138 size_t size = sys_size + uniform_size;
1139 struct panfrost_transfer transfer = panfrost_pool_alloc(&batch->pool,
1140 size);
1141
1142 /* Upload sysvals requested by the shader */
1143 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1144
1145 /* Upload uniforms */
1146 if (has_uniforms && uniform_size) {
1147 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1148 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1149 }
1150
1151 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1152 * uploaded */
1153
1154 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1155 assert(ubo_count >= 1);
1156
1157 size_t sz = MALI_UNIFORM_BUFFER_LENGTH * ubo_count;
1158 struct panfrost_transfer ubos = panfrost_pool_alloc(&batch->pool, sz);
1159 uint64_t *ubo_ptr = (uint64_t *) ubos.cpu;
1160
1161 /* Upload uniforms as a UBO */
1162
1163 if (ss->uniform_count) {
1164 pan_pack(ubo_ptr, UNIFORM_BUFFER, cfg) {
1165 cfg.entries = ss->uniform_count;
1166 cfg.pointer = transfer.gpu;
1167 }
1168 } else {
1169 *ubo_ptr = 0;
1170 }
1171
1172 /* The rest are honest-to-goodness UBOs */
1173
1174 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1175 size_t usz = buf->cb[ubo].buffer_size;
1176 bool enabled = buf->enabled_mask & (1 << ubo);
1177 bool empty = usz == 0;
1178
1179 if (!enabled || empty) {
1180 ubo_ptr[ubo] = 0;
1181 continue;
1182 }
1183
1184 pan_pack(ubo_ptr + ubo, UNIFORM_BUFFER, cfg) {
1185 cfg.entries = DIV_ROUND_UP(usz, 16);
1186 cfg.pointer = panfrost_map_constant_buffer_gpu(batch,
1187 stage, buf, ubo);
1188 }
1189 }
1190
1191 postfix->uniforms = transfer.gpu;
1192 postfix->uniform_buffers = ubos.gpu;
1193
1194 buf->dirty_mask = 0;
1195 }
1196
1197 void
1198 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1199 const struct pipe_grid_info *info,
1200 struct midgard_payload_vertex_tiler *vtp)
1201 {
1202 struct panfrost_context *ctx = batch->ctx;
1203 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1204 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1205 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1206 128));
1207 unsigned shared_size = single_size * info->grid[0] * info->grid[1] *
1208 info->grid[2] * 4;
1209 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1210 shared_size,
1211 1);
1212
1213 struct mali_shared_memory shared = {
1214 .shared_memory = bo->gpu,
1215 .shared_workgroup_count =
1216 util_logbase2_ceil(info->grid[0]) +
1217 util_logbase2_ceil(info->grid[1]) +
1218 util_logbase2_ceil(info->grid[2]),
1219 .shared_unk1 = 0x2,
1220 .shared_shift = util_logbase2(single_size) - 1
1221 };
1222
1223 vtp->postfix.shared_memory = panfrost_pool_upload(&batch->pool, &shared,
1224 sizeof(shared));
1225 }
1226
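/* Returns the GPU address of the (Midgard) texture descriptor for a sampler
 * view, retaining both the texture's BO and the descriptor BO in the batch. */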
1227 static mali_ptr
1228 panfrost_get_tex_desc(struct panfrost_batch *batch,
1229 enum pipe_shader_type st,
1230 struct panfrost_sampler_view *view)
1231 {
1232 if (!view)
1233 return (mali_ptr) 0;
1234
1235 struct pipe_sampler_view *pview = &view->base;
1236 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1237
1238 /* Add the BO to the job so it's retained until the job is done. */
1239
1240 panfrost_batch_add_bo(batch, rsrc->bo,
1241 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1242 panfrost_bo_access_for_stage(st));
1243
1244 panfrost_batch_add_bo(batch, view->bo,
1245 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1246 panfrost_bo_access_for_stage(st));
1247
1248 return view->bo->gpu;
1249 }
1250
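/* Recreates the sampler view's descriptor BO if the backing resource's BO or
 * modifier has changed since the view was created. */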
1251 static void
1252 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
1253 struct pipe_context *pctx)
1254 {
1255 struct panfrost_resource *rsrc = pan_resource(view->base.texture);
1256 if (view->texture_bo != rsrc->bo->gpu ||
1257 view->modifier != rsrc->modifier) {
1258 panfrost_bo_unreference(view->bo);
1259 panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
1260 }
1261 }
1262
1263 void
1264 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1265 enum pipe_shader_type stage,
1266 struct mali_vertex_tiler_postfix *postfix)
1267 {
1268 struct panfrost_context *ctx = batch->ctx;
1269 struct panfrost_device *device = pan_device(ctx->base.screen);
1270
1271 if (!ctx->sampler_view_count[stage])
1272 return;
1273
1274 if (device->quirks & IS_BIFROST) {
1275 struct bifrost_texture_descriptor *descriptors;
1276
1277 descriptors = malloc(sizeof(struct bifrost_texture_descriptor) *
1278 ctx->sampler_view_count[stage]);
1279
1280 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1281 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1282 struct pipe_sampler_view *pview = &view->base;
1283 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1284 panfrost_update_sampler_view(view, &ctx->base);
1285
1286 /* Add the BOs to the job so they are retained until the job is done. */
1287
1288 panfrost_batch_add_bo(batch, rsrc->bo,
1289 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1290 panfrost_bo_access_for_stage(stage));
1291
1292 panfrost_batch_add_bo(batch, view->bo,
1293 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1294 panfrost_bo_access_for_stage(stage));
1295
1296 memcpy(&descriptors[i], view->bifrost_descriptor, sizeof(*view->bifrost_descriptor));
1297 }
1298
1299 postfix->textures = panfrost_pool_upload(&batch->pool,
1300 descriptors,
1301 sizeof(struct bifrost_texture_descriptor) *
1302 ctx->sampler_view_count[stage]);
1303
1304 free(descriptors);
1305 } else {
1306 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1307
1308 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1309 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1310
1311 panfrost_update_sampler_view(view, &ctx->base);
1312
1313 trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1314 }
1315
1316 postfix->textures = panfrost_pool_upload(&batch->pool,
1317 trampolines,
1318 sizeof(uint64_t) *
1319 ctx->sampler_view_count[stage]);
1320 }
1321 }
1322
1323 void
1324 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1325 enum pipe_shader_type stage,
1326 struct mali_vertex_tiler_postfix *postfix)
1327 {
1328 struct panfrost_context *ctx = batch->ctx;
1329 struct panfrost_device *device = pan_device(ctx->base.screen);
1330
1331 if (!ctx->sampler_count[stage])
1332 return;
1333
1334 if (device->quirks & IS_BIFROST) {
1335 size_t desc_size = sizeof(struct bifrost_sampler_descriptor);
1336 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1337 struct panfrost_transfer transfer = panfrost_pool_alloc(&batch->pool,
1338 transfer_size);
1339 struct bifrost_sampler_descriptor *desc = (struct bifrost_sampler_descriptor *)transfer.cpu;
1340
1341 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1342 desc[i] = ctx->samplers[stage][i]->bifrost_hw;
1343
1344 postfix->sampler_descriptor = transfer.gpu;
1345 } else {
1346 size_t desc_size = sizeof(struct mali_sampler_descriptor);
1347 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1348 struct panfrost_transfer transfer = panfrost_pool_alloc(&batch->pool,
1349 transfer_size);
1350 struct mali_sampler_descriptor *desc = (struct mali_sampler_descriptor *)transfer.cpu;
1351
1352 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1353 desc[i] = ctx->samplers[stage][i]->midgard_hw;
1354
1355 postfix->sampler_descriptor = transfer.gpu;
1356 }
1357 }
1358
1359 void
1360 panfrost_emit_vertex_attr_meta(struct panfrost_batch *batch,
1361 struct mali_vertex_tiler_postfix *vertex_postfix)
1362 {
1363 struct panfrost_context *ctx = batch->ctx;
1364
1365 if (!ctx->vertex)
1366 return;
1367
1368 struct panfrost_vertex_state *so = ctx->vertex;
1369
1370 panfrost_vertex_state_upd_attr_offs(ctx, vertex_postfix);
1371 vertex_postfix->attribute_meta = panfrost_pool_upload(&batch->pool, so->hw,
1372 sizeof(*so->hw) *
1373 PAN_MAX_ATTRIBUTE);
1374 }
1375
1376 void
1377 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1378 struct mali_vertex_tiler_postfix *vertex_postfix)
1379 {
1380 struct panfrost_context *ctx = batch->ctx;
1381 struct panfrost_vertex_state *so = ctx->vertex;
1382
1383 /* Staged mali_attr, and index into them. i =/= k, depending on the
1384 * vertex buffer mask and instancing. Twice as much room is allocated,
1385 * for a worst case of NPOT_DIVIDEs which take up an extra slot */
1386 union mali_attr attrs[PIPE_MAX_ATTRIBS * 2];
1387 unsigned k = 0;
1388
1389 for (unsigned i = 0; i < so->num_elements; ++i) {
1390 /* We map a mali_attr to be 1:1 with the mali_attr_meta, which
1391 * means duplicating some vertex buffers (who cares? aside from
1392 * maybe some caching implications but I somehow doubt that
1393 * matters) */
1394
1395 struct pipe_vertex_element *elem = &so->pipe[i];
1396 unsigned vbi = elem->vertex_buffer_index;
1397
1398 /* The exception to 1:1 mapping is that we can have multiple
1399 * entries (NPOT divisors), so we fixup anyways */
1400
1401 so->hw[i].index = k;
1402
1403 if (!(ctx->vb_mask & (1 << vbi)))
1404 continue;
1405
1406 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1407 struct panfrost_resource *rsrc;
1408
1409 rsrc = pan_resource(buf->buffer.resource);
1410 if (!rsrc)
1411 continue;
1412
1413 /* Align to 64 bytes by masking off the lower bits. This
1414 * will be adjusted back when we fixup the src_offset in
1415 * mali_attr_meta */
1416
1417 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1418 mali_ptr addr = raw_addr & ~63;
1419 unsigned chopped_addr = raw_addr - addr;
1420
1421 /* Add a dependency of the batch on the vertex buffer */
1422 panfrost_batch_add_bo(batch, rsrc->bo,
1423 PAN_BO_ACCESS_SHARED |
1424 PAN_BO_ACCESS_READ |
1425 PAN_BO_ACCESS_VERTEX_TILER);
1426
1427 /* Set common fields */
1428 attrs[k].elements = addr;
1429 attrs[k].stride = buf->stride;
1430
1431 /* Since we advanced the base pointer, we shrink the buffer
1432 * size */
1433 attrs[k].size = rsrc->base.width0 - buf->buffer_offset;
1434
1435 /* We need to add the extra size we masked off (for
1436 * correctness) so the data doesn't get clamped away */
1437 attrs[k].size += chopped_addr;
1438
1439 /* For non-instancing make sure we initialize */
1440 attrs[k].shift = attrs[k].extra_flags = 0;
1441
1442 /* Instancing uses a dramatically different code path than
1443 * linear, so dispatch for the actual emission now that the
1444 * common code is finished */
1445
1446 unsigned divisor = elem->instance_divisor;
1447
1448 if (divisor && ctx->instance_count == 1) {
1449 /* Silly corner case where there's a divisor(=1) but
1450 * there's no legitimate instancing. So we want *every*
1451 * attribute to be the same. So set stride to zero so
1452 * we don't go anywhere. */
1453
1454 attrs[k].size = attrs[k].stride + chopped_addr;
1455 attrs[k].stride = 0;
1456 attrs[k++].elements |= MALI_ATTR_LINEAR;
1457 } else if (ctx->instance_count <= 1) {
1458 /* Normal, non-instanced attributes */
1459 attrs[k++].elements |= MALI_ATTR_LINEAR;
1460 } else {
1461 unsigned instance_shift = vertex_postfix->instance_shift;
1462 unsigned instance_odd = vertex_postfix->instance_odd;
1463
1464 k += panfrost_vertex_instanced(ctx->padded_count,
1465 instance_shift,
1466 instance_odd,
1467 divisor, &attrs[k]);
1468 }
1469 }
1470
1471 /* Add special gl_VertexID/gl_InstanceID buffers */
1472
1473 panfrost_vertex_id(ctx->padded_count, &attrs[k]);
1474 so->hw[PAN_VERTEX_ID].index = k++;
1475 panfrost_instance_id(ctx->padded_count, &attrs[k]);
1476 so->hw[PAN_INSTANCE_ID].index = k++;
1477
1478 /* Upload whatever we emitted and go */
1479
1480 vertex_postfix->attributes = panfrost_pool_upload(&batch->pool, attrs,
1481 k * sizeof(*attrs));
1482 }
1483
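/* Allocates transient space for a varying buffer of the given stride and
 * vertex count, fills out its attribute record and returns the GPU address. */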
1484 static mali_ptr
1485 panfrost_emit_varyings(struct panfrost_batch *batch, union mali_attr *slot,
1486 unsigned stride, unsigned count)
1487 {
1488 /* Fill out the descriptor */
1489 slot->stride = stride;
1490 slot->size = stride * count;
1491 slot->shift = slot->extra_flags = 0;
1492
1493 struct panfrost_transfer transfer = panfrost_pool_alloc(&batch->pool,
1494 slot->size);
1495
1496 slot->elements = transfer.gpu | MALI_ATTR_LINEAR;
1497
1498 return transfer.gpu;
1499 }
1500
1501 static unsigned
1502 panfrost_streamout_offset(unsigned stride, unsigned offset,
1503 struct pipe_stream_output_target *target)
1504 {
1505 return (target->buffer_offset + (offset * stride * 4)) & 63;
1506 }
1507
1508 static void
1509 panfrost_emit_streamout(struct panfrost_batch *batch, union mali_attr *slot,
1510 unsigned stride, unsigned offset, unsigned count,
1511 struct pipe_stream_output_target *target)
1512 {
1513 /* Fill out the descriptor */
1514 slot->stride = stride * 4;
1515 slot->shift = slot->extra_flags = 0;
1516
1517 unsigned max_size = target->buffer_size;
1518 unsigned expected_size = slot->stride * count;
1519
1520 /* Grab the BO and bind it to the batch */
1521 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1522
1523 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1524 * the perspective of the TILER and FRAGMENT.
1525 */
1526 panfrost_batch_add_bo(batch, bo,
1527 PAN_BO_ACCESS_SHARED |
1528 PAN_BO_ACCESS_RW |
1529 PAN_BO_ACCESS_VERTEX_TILER |
1530 PAN_BO_ACCESS_FRAGMENT);
1531
1532 /* We will have an offset applied to get alignment */
1533 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * slot->stride);
1534 slot->elements = (addr & ~63) | MALI_ATTR_LINEAR;
1535 slot->size = MIN2(max_size, expected_size) + (addr & 63);
1536 }
1537
1538 static bool
1539 has_point_coord(unsigned mask, gl_varying_slot loc)
1540 {
1541 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1542 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1543 else if (loc == VARYING_SLOT_PNTC)
1544 return (mask & (1 << 8));
1545 else
1546 return false;
1547 }
1548
1549 /* Helpers for manipulating stream out information so we can pack varyings
1550 * accordingly. Compute the src_offset for a given captured varying */
1551
1552 static struct pipe_stream_output *
1553 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1554 {
1555 for (unsigned i = 0; i < info->num_outputs; ++i) {
1556 if (info->output[i].register_index == loc)
1557 return &info->output[i];
1558 }
1559
1560 unreachable("Varying not captured");
1561 }
1562
1563 static unsigned
1564 pan_varying_size(enum mali_format fmt)
1565 {
1566 unsigned type = MALI_EXTRACT_TYPE(fmt);
1567 unsigned chan = MALI_EXTRACT_CHANNELS(fmt);
1568 unsigned bits = MALI_EXTRACT_BITS(fmt);
1569 unsigned bpc = 0;
1570
1571 if (bits == MALI_CHANNEL_FLOAT) {
1572 /* No doubles */
1573 bool fp16 = (type == MALI_FORMAT_SINT);
1574 assert(fp16 || (type == MALI_FORMAT_UNORM));
1575
1576 bpc = fp16 ? 2 : 4;
1577 } else {
1578 assert(type >= MALI_FORMAT_SNORM && type <= MALI_FORMAT_SINT);
1579
1580 /* See the enums */
1581 bits = 1 << bits;
1582 assert(bits >= 8);
1583 bpc = bits / 8;
1584 }
1585
1586 return bpc * chan;
1587 }
1588
1589 /* Indices for named (non-XFB) varyings that are present. These are packed
1590 * tightly so they correspond to a bitfield present (P) indexed by (1 <<
1591 * PAN_VARY_*). This has the nice property that you can lookup the buffer index
1592 * of a given special field given a shift S by:
1593 *
1594 * idx = popcount(P & ((1 << S) - 1))
1595 *
1596 * That is, look at all of the varyings that come earlier and count them; the
1597  * count is the index of the new one. Likewise, the total number of special
1598 * buffers required is simply popcount(P)
1599 */
1600
1601 enum pan_special_varying {
1602 PAN_VARY_GENERAL = 0,
1603 PAN_VARY_POSITION = 1,
1604 PAN_VARY_PSIZ = 2,
1605 PAN_VARY_PNTCOORD = 3,
1606 PAN_VARY_FACE = 4,
1607 PAN_VARY_FRAGCOORD = 5,
1608
1609 /* Keep last */
1610 PAN_VARY_MAX,
1611 };
1612
1613 /* Given a varying, figure out which index it corresponds to */
1614
1615 static inline unsigned
1616 pan_varying_index(unsigned present, enum pan_special_varying v)
1617 {
1618 unsigned mask = (1 << v) - 1;
1619 return util_bitcount(present & mask);
1620 }
1621
1622 /* Get the base offset for XFB buffers, which by convention come after
1623 * everything else. Wrapper function for semantic reasons; by construction this
1624 * is just popcount. */
1625
1626 static inline unsigned
1627 pan_xfb_base(unsigned present)
1628 {
1629 return util_bitcount(present);
1630 }
1631
1632 /* Computes the present mask for varyings so we can start emitting varying records */
1633
1634 static inline unsigned
1635 pan_varying_present(
1636 struct panfrost_shader_state *vs,
1637 struct panfrost_shader_state *fs,
1638 unsigned quirks)
1639 {
1640 /* At the moment we always emit general and position buffers. Not
1641 * strictly necessary but usually harmless */
1642
1643 unsigned present = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION);
1644
1645 /* Enable special buffers by the shader info */
1646
1647 if (vs->writes_point_size)
1648 present |= (1 << PAN_VARY_PSIZ);
1649
1650 if (fs->reads_point_coord)
1651 present |= (1 << PAN_VARY_PNTCOORD);
1652
1653 if (fs->reads_face)
1654 present |= (1 << PAN_VARY_FACE);
1655
1656 if (fs->reads_frag_coord && !(quirks & IS_BIFROST))
1657 present |= (1 << PAN_VARY_FRAGCOORD);
1658
1659 /* Also, if we have a point sprite, we need a point coord buffer */
1660
1661 for (unsigned i = 0; i < fs->varying_count; i++) {
1662 gl_varying_slot loc = fs->varyings_loc[i];
1663
1664 if (has_point_coord(fs->point_sprite_mask, loc))
1665 present |= (1 << PAN_VARY_PNTCOORD);
1666 }
1667
1668 return present;
1669 }
1670
1671 /* Emitters for varying records */
1672
1673 static struct mali_attr_meta
1674 pan_emit_vary(unsigned present, enum pan_special_varying buf,
1675 unsigned quirks, enum mali_format format,
1676 unsigned offset)
1677 {
1678 unsigned nr_channels = MALI_EXTRACT_CHANNELS(format);
1679
1680 struct mali_attr_meta meta = {
1681 .index = pan_varying_index(present, buf),
1682 .unknown1 = quirks & IS_BIFROST ? 0x0 : 0x2,
1683 .swizzle = quirks & HAS_SWIZZLES ?
1684 panfrost_get_default_swizzle(nr_channels) :
1685 panfrost_bifrost_swizzle(nr_channels),
1686 .format = format,
1687 .src_offset = offset
1688 };
1689
1690 return meta;
1691 }
1692
1693 /* General varying that is unused */
1694
1695 static struct mali_attr_meta
1696 pan_emit_vary_only(unsigned present, unsigned quirks)
1697 {
1698 return pan_emit_vary(present, 0, quirks, MALI_VARYING_DISCARD, 0);
1699 }
1700
1701 /* Special records */
1702
1703 static const enum mali_format pan_varying_formats[PAN_VARY_MAX] = {
1704 [PAN_VARY_POSITION] = MALI_VARYING_POS,
1705 [PAN_VARY_PSIZ] = MALI_R16F,
1706 [PAN_VARY_PNTCOORD] = MALI_R16F,
1707 [PAN_VARY_FACE] = MALI_R32I,
1708 [PAN_VARY_FRAGCOORD] = MALI_RGBA32F
1709 };
1710
1711 static struct mali_attr_meta
1712 pan_emit_vary_special(unsigned present, enum pan_special_varying buf,
1713 unsigned quirks)
1714 {
1715 assert(buf < PAN_VARY_MAX);
1716 return pan_emit_vary(present, buf, quirks, pan_varying_formats[buf], 0);
1717 }
1718
1719 static enum mali_format
1720 pan_xfb_format(enum mali_format format, unsigned nr)
1721 {
1722 if (MALI_EXTRACT_BITS(format) == MALI_CHANNEL_FLOAT)
1723 return MALI_R32F | MALI_NR_CHANNELS(nr);
1724 else
1725 return MALI_EXTRACT_TYPE(format) | MALI_NR_CHANNELS(nr) | MALI_CHANNEL_32;
1726 }
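
/* Illustrative example: capturing two components of a MALI_RGBA32F varying
 * gives a 2-channel, 32-bit-per-channel float record; either branch above
 * forces 32-bit channels, since transform feedback always stores at full
 * precision regardless of the varying's own format. */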
1727
1728 /* Transform feedback records. Note that struct pipe_stream_output packs into
1729  * a 32-bit bitfield, smaller than a 64-bit pointer, so we may as well pass it
1730  * by value. */
1731
1732 static struct mali_attr_meta
1733 pan_emit_vary_xfb(unsigned present,
1734 unsigned max_xfb,
1735 unsigned *streamout_offsets,
1736 unsigned quirks,
1737 enum mali_format format,
1738 struct pipe_stream_output o)
1739 {
1740         /* Construct a transform feedback record for this output */
1741 struct mali_attr_meta meta = {
1742 /* XFB buffers come after everything else */
1743 .index = pan_xfb_base(present) + o.output_buffer,
1744
1745 /* As usual unknown bit */
1746 .unknown1 = quirks & IS_BIFROST ? 0x0 : 0x2,
1747
1748 /* Override swizzle with number of channels */
1749 .swizzle = quirks & HAS_SWIZZLES ?
1750 panfrost_get_default_swizzle(o.num_components) :
1751 panfrost_bifrost_swizzle(o.num_components),
1752
1753 /* Override number of channels and precision to highp */
1754 .format = pan_xfb_format(format, o.num_components),
1755
1756 /* Apply given offsets together */
1757 .src_offset = (o.dst_offset * 4) /* dwords */
1758 + streamout_offsets[o.output_buffer]
1759 };
1760
1761 return meta;
1762 }
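
/* Offset example (illustrative): pipe_stream_output::dst_offset is counted in
 * dwords, so an output with dst_offset = 2 bound to buffer 1 lands at
 * src_offset = 2 * 4 + streamout_offsets[1] bytes into that buffer. */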
1763
1764 /* Determine if we should capture a varying for XFB. This requires actually
1765  * having a buffer for it. If we don't capture it, we'll fall back to a general
1766 * varying path (linked or unlinked, possibly discarding the write) */
1767
1768 static bool
1769 panfrost_xfb_captured(struct panfrost_shader_state *xfb,
1770 unsigned loc, unsigned max_xfb)
1771 {
1772 if (!(xfb->so_mask & (1ll << loc)))
1773 return false;
1774
1775 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1776 return o->output_buffer < max_xfb;
1777 }
1778
1779 /* Higher-level wrapper around all of the above, classifying a varying into one
1780 * of the above types */
1781
1782 static struct mali_attr_meta
1783 panfrost_emit_varying(
1784 struct panfrost_shader_state *stage,
1785 struct panfrost_shader_state *other,
1786 struct panfrost_shader_state *xfb,
1787 unsigned present,
1788 unsigned max_xfb,
1789 unsigned *streamout_offsets,
1790 unsigned quirks,
1791 unsigned *gen_offsets,
1792 enum mali_format *gen_formats,
1793 unsigned *gen_stride,
1794 unsigned idx,
1795 bool should_alloc,
1796 bool is_fragment)
1797 {
1798 gl_varying_slot loc = stage->varyings_loc[idx];
1799 enum mali_format format = stage->varyings[idx];
1800
1801 /* Override format to match linkage */
1802 if (!should_alloc && gen_formats[idx])
1803 format = gen_formats[idx];
1804
1805 if (has_point_coord(stage->point_sprite_mask, loc)) {
1806 return pan_emit_vary_special(present, PAN_VARY_PNTCOORD, quirks);
1807 } else if (panfrost_xfb_captured(xfb, loc, max_xfb)) {
1808 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1809 return pan_emit_vary_xfb(present, max_xfb, streamout_offsets, quirks, format, *o);
1810 } else if (loc == VARYING_SLOT_POS) {
1811 if (is_fragment)
1812 return pan_emit_vary_special(present, PAN_VARY_FRAGCOORD, quirks);
1813 else
1814 return pan_emit_vary_special(present, PAN_VARY_POSITION, quirks);
1815 } else if (loc == VARYING_SLOT_PSIZ) {
1816 return pan_emit_vary_special(present, PAN_VARY_PSIZ, quirks);
1817 } else if (loc == VARYING_SLOT_PNTC) {
1818 return pan_emit_vary_special(present, PAN_VARY_PNTCOORD, quirks);
1819 } else if (loc == VARYING_SLOT_FACE) {
1820 return pan_emit_vary_special(present, PAN_VARY_FACE, quirks);
1821 }
1822
1823 /* We've exhausted special cases, so it's otherwise a general varying. Check if we're linked */
1824 signed other_idx = -1;
1825
1826 for (unsigned j = 0; j < other->varying_count; ++j) {
1827 if (other->varyings_loc[j] == loc) {
1828 other_idx = j;
1829 break;
1830 }
1831 }
1832
1833 if (other_idx < 0)
1834 return pan_emit_vary_only(present, quirks);
1835
1836 unsigned offset = gen_offsets[other_idx];
1837
1838 if (should_alloc) {
1839 /* We're linked, so allocate a space via a watermark allocation */
1840 enum mali_format alt = other->varyings[other_idx];
1841
1842 /* Do interpolation at minimum precision */
1843 unsigned size_main = pan_varying_size(format);
1844 unsigned size_alt = pan_varying_size(alt);
1845 unsigned size = MIN2(size_main, size_alt);
1846
1847 /* If a varying is marked for XFB but not actually captured, we
1848 * should match the format to the format that would otherwise
1849 * be used for XFB, since dEQP checks for invariance here. It's
1850 * unclear if this is required by the spec. */
1851
1852 if (xfb->so_mask & (1ull << loc)) {
1853 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1854 format = pan_xfb_format(format, o->num_components);
1855 size = pan_varying_size(format);
1856 } else if (size == size_alt) {
1857 format = alt;
1858 }
1859
1860 gen_offsets[idx] = *gen_stride;
1861 gen_formats[other_idx] = format;
1862 offset = *gen_stride;
1863 *gen_stride += size;
1864 }
1865
1866 return pan_emit_vary(present, PAN_VARY_GENERAL,
1867 quirks, format, offset);
1868 }
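
/* Linkage note (describing the two passes in panfrost_emit_varying_descriptor
 * below): this helper runs once per varying for the vertex shader with
 * should_alloc = true, assigning offsets in the general buffer and bumping
 * *gen_stride, and once for the fragment shader with should_alloc = false,
 * reusing the offsets and formats recorded in gen_offsets/gen_formats so both
 * stages agree on the packed layout. */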
1869
1870 static void
1871 pan_emit_special_input(union mali_attr *varyings,
1872 unsigned present,
1873 enum pan_special_varying v,
1874 mali_ptr addr)
1875 {
1876 if (present & (1 << v)) {
1877 /* Ensure we write exactly once for performance and with fields
1878 * zeroed appropriately to avoid flakes */
1879
1880 union mali_attr s = {
1881 .elements = addr
1882 };
1883
1884 varyings[pan_varying_index(present, v)] = s;
1885 }
1886 }
1887
1888 void
1889 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1890 unsigned vertex_count,
1891 struct mali_vertex_tiler_postfix *vertex_postfix,
1892 struct mali_vertex_tiler_postfix *tiler_postfix,
1893 union midgard_primitive_size *primitive_size)
1894 {
1895 /* Load the shaders */
1896 struct panfrost_context *ctx = batch->ctx;
1897 struct panfrost_device *dev = pan_device(ctx->base.screen);
1898 struct panfrost_shader_state *vs, *fs;
1899 size_t vs_size, fs_size;
1900
1901 /* Allocate the varying descriptor */
1902
1903 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1904 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1905 vs_size = sizeof(struct mali_attr_meta) * vs->varying_count;
1906 fs_size = sizeof(struct mali_attr_meta) * fs->varying_count;
1907
1908 struct panfrost_transfer trans = panfrost_pool_alloc(&batch->pool,
1909 vs_size +
1910 fs_size);
1911
1912 struct pipe_stream_output_info *so = &vs->stream_output;
1913 unsigned present = pan_varying_present(vs, fs, dev->quirks);
1914
1915 /* Check if this varying is linked by us. This is the case for
1916 * general-purpose, non-captured varyings. If it is, link it. If it's
1917 * not, use the provided stream out information to determine the
1918 * offset, since it was already linked for us. */
1919
1920 unsigned gen_offsets[32];
1921 enum mali_format gen_formats[32];
1922 memset(gen_offsets, 0, sizeof(gen_offsets));
1923 memset(gen_formats, 0, sizeof(gen_formats));
1924
1925 unsigned gen_stride = 0;
1926 assert(vs->varying_count < ARRAY_SIZE(gen_offsets));
1927 assert(fs->varying_count < ARRAY_SIZE(gen_offsets));
1928
1929 unsigned streamout_offsets[32];
1930
1931 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1932 streamout_offsets[i] = panfrost_streamout_offset(
1933 so->stride[i],
1934 ctx->streamout.offsets[i],
1935 ctx->streamout.targets[i]);
1936 }
1937
1938 struct mali_attr_meta *ovs = (struct mali_attr_meta *)trans.cpu;
1939 struct mali_attr_meta *ofs = ovs + vs->varying_count;
1940
1941 for (unsigned i = 0; i < vs->varying_count; i++) {
1942 ovs[i] = panfrost_emit_varying(vs, fs, vs, present,
1943 ctx->streamout.num_targets, streamout_offsets,
1944 dev->quirks,
1945 gen_offsets, gen_formats, &gen_stride, i, true, false);
1946 }
1947
1948 for (unsigned i = 0; i < fs->varying_count; i++) {
1949 ofs[i] = panfrost_emit_varying(fs, vs, vs, present,
1950 ctx->streamout.num_targets, streamout_offsets,
1951 dev->quirks,
1952 gen_offsets, gen_formats, &gen_stride, i, false, true);
1953 }
1954
1955 unsigned xfb_base = pan_xfb_base(present);
1956 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool,
1957 sizeof(union mali_attr) * (xfb_base + ctx->streamout.num_targets));
1958 union mali_attr *varyings = (union mali_attr *) T.cpu;
1959
1960 /* Emit the stream out buffers */
1961
1962 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1963 ctx->vertex_count);
1964
1965 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1966 panfrost_emit_streamout(batch, &varyings[xfb_base + i],
1967 so->stride[i],
1968 ctx->streamout.offsets[i],
1969 out_count,
1970 ctx->streamout.targets[i]);
1971 }
1972
1973 panfrost_emit_varyings(batch,
1974 &varyings[pan_varying_index(present, PAN_VARY_GENERAL)],
1975 gen_stride, vertex_count);
1976
1977 /* fp32 vec4 gl_Position */
1978 tiler_postfix->position_varying = panfrost_emit_varyings(batch,
1979 &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
1980 sizeof(float) * 4, vertex_count);
1981
1982 if (present & (1 << PAN_VARY_PSIZ)) {
1983 primitive_size->pointer = panfrost_emit_varyings(batch,
1984 &varyings[pan_varying_index(present, PAN_VARY_PSIZ)],
1985 2, vertex_count);
1986 }
1987
1988 pan_emit_special_input(varyings, present, PAN_VARY_PNTCOORD, MALI_VARYING_POINT_COORD);
1989 pan_emit_special_input(varyings, present, PAN_VARY_FACE, MALI_VARYING_FRONT_FACING);
1990 pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD, MALI_VARYING_FRAG_COORD);
1991
1992 vertex_postfix->varyings = T.gpu;
1993 tiler_postfix->varyings = T.gpu;
1994
1995 vertex_postfix->varying_meta = trans.gpu;
1996 tiler_postfix->varying_meta = trans.gpu + vs_size;
1997 }
1998
1999 void
2000 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
2001 struct mali_vertex_tiler_prefix *vertex_prefix,
2002 struct mali_vertex_tiler_postfix *vertex_postfix,
2003 struct mali_vertex_tiler_prefix *tiler_prefix,
2004 struct mali_vertex_tiler_postfix *tiler_postfix,
2005 union midgard_primitive_size *primitive_size)
2006 {
2007 struct panfrost_context *ctx = batch->ctx;
2008 struct panfrost_device *device = pan_device(ctx->base.screen);
2009 bool wallpapering = ctx->wallpaper_batch && batch->scoreboard.tiler_dep;
2010 struct bifrost_payload_vertex bifrost_vertex = {0,};
2011 struct bifrost_payload_tiler bifrost_tiler = {0,};
2012 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
2013 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
2014 void *vp, *tp;
2015 size_t vp_size, tp_size;
2016
2017 if (device->quirks & IS_BIFROST) {
2018 bifrost_vertex.prefix = *vertex_prefix;
2019 bifrost_vertex.postfix = *vertex_postfix;
2020 vp = &bifrost_vertex;
2021 vp_size = sizeof(bifrost_vertex);
2022
2023 bifrost_tiler.prefix = *tiler_prefix;
2024 bifrost_tiler.tiler.primitive_size = *primitive_size;
2025 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
2026 bifrost_tiler.postfix = *tiler_postfix;
2027 tp = &bifrost_tiler;
2028 tp_size = sizeof(bifrost_tiler);
2029 } else {
2030 midgard_vertex.prefix = *vertex_prefix;
2031 midgard_vertex.postfix = *vertex_postfix;
2032 vp = &midgard_vertex;
2033 vp_size = sizeof(midgard_vertex);
2034
2035 midgard_tiler.prefix = *tiler_prefix;
2036 midgard_tiler.postfix = *tiler_postfix;
2037 midgard_tiler.primitive_size = *primitive_size;
2038 tp = &midgard_tiler;
2039 tp_size = sizeof(midgard_tiler);
2040 }
2041
2042 if (wallpapering) {
2043 /* Inject in reverse order, with "predicted" job indices.
2044 * THIS IS A HACK XXX */
2045 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false,
2046 batch->scoreboard.job_index + 2, tp, tp_size, true);
2047 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2048 vp, vp_size, true);
2049 return;
2050 }
2051
2052         /* If rasterizer discard is enabled, only submit the vertex job */
2053
2054 bool rasterizer_discard = ctx->rasterizer &&
2055 ctx->rasterizer->base.rasterizer_discard;
2056
2057 unsigned vertex = panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2058 vp, vp_size, false);
2059
2060 if (rasterizer_discard)
2061 return;
2062
2063 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false, vertex, tp, tp_size,
2064 false);
2065 }
2066
2067 /* TODO: stop hardcoding this */
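/* The table below holds 96 uint16_t values, i.e. 48 (x, y) pairs, matching
 * the 96 * sizeof(uint16_t) upload below. The coordinates appear to be in
 * 1/256ths of a pixel with (128, 128) at the pixel centre, but that is an
 * observation rather than a documented layout. */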
2068 mali_ptr
2069 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2070 {
2071 uint16_t locations[] = {
2072 128, 128,
2073 0, 256,
2074 0, 256,
2075 0, 256,
2076 0, 256,
2077 0, 256,
2078 0, 256,
2079 0, 256,
2080 0, 256,
2081 0, 256,
2082 0, 256,
2083 0, 256,
2084 0, 256,
2085 0, 256,
2086 0, 256,
2087 0, 256,
2088 0, 256,
2089 0, 256,
2090 0, 256,
2091 0, 256,
2092 0, 256,
2093 0, 256,
2094 0, 256,
2095 0, 256,
2096 0, 256,
2097 0, 256,
2098 0, 256,
2099 0, 256,
2100 0, 256,
2101 0, 256,
2102 0, 256,
2103 0, 256,
2104 128, 128,
2105 0, 0,
2106 0, 0,
2107 0, 0,
2108 0, 0,
2109 0, 0,
2110 0, 0,
2111 0, 0,
2112 0, 0,
2113 0, 0,
2114 0, 0,
2115 0, 0,
2116 0, 0,
2117 0, 0,
2118 0, 0,
2119 0, 0,
2120 };
2121
2122 return panfrost_pool_upload(&batch->pool, locations, 96 * sizeof(uint16_t));
2123 }