panfrost: Decontextualize rasterizer
[mesa.git] / src / gallium / drivers / panfrost / pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_pool.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 unsigned shift = panfrost_get_stack_shift(batch->stack_size);
62 struct mali_shared_memory shared = {
63 .stack_shift = shift,
64 .scratchpad = panfrost_batch_get_scratchpad(batch, shift, dev->thread_tls_alloc, dev->core_count)->gpu,
65 .shared_workgroup_count = ~0,
66 };
67 postfix->shared_memory = panfrost_pool_upload(&batch->pool, &shared, sizeof(shared));
68 }
69
70 static void
71 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
72 struct mali_vertex_tiler_postfix *postfix)
73 {
74 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
75 postfix->shared_memory = panfrost_batch_reserve_framebuffer(batch);
76 }
77
78 static void
79 panfrost_vt_update_rasterizer(struct panfrost_rasterizer *rasterizer,
80 struct mali_vertex_tiler_prefix *prefix,
81 struct mali_vertex_tiler_postfix *postfix)
82 {
83 postfix->gl_enables |= 0x7;
84 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
85 rasterizer && rasterizer->base.front_ccw);
86 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
87 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_FRONT));
88 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
89 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_BACK));
90 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
91 rasterizer && rasterizer->base.flatshade_first);
92 }
93
94 void
95 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
96 struct mali_vertex_tiler_prefix *prefix,
97 union midgard_primitive_size *primitive_size)
98 {
99 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
100
101 if (!panfrost_writes_point_size(ctx)) {
102 bool points = prefix->draw_mode == MALI_DRAW_MODE_POINTS;
103 float val = 0.0f;
104
105 if (rasterizer)
106 val = points ?
107 rasterizer->base.point_size :
108 rasterizer->base.line_width;
109
110 primitive_size->constant = val;
111 }
112 }
113
114 static void
115 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
116 struct mali_vertex_tiler_postfix *postfix)
117 {
118 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
119 if (ctx->occlusion_query) {
120 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
121 panfrost_batch_add_bo(ctx->batch, ctx->occlusion_query->bo,
122 PAN_BO_ACCESS_SHARED |
123 PAN_BO_ACCESS_RW |
124 PAN_BO_ACCESS_FRAGMENT);
125 } else {
126 postfix->occlusion_counter = 0;
127 }
128 }
129
130 void
131 panfrost_vt_init(struct panfrost_context *ctx,
132 enum pipe_shader_type stage,
133 struct mali_vertex_tiler_prefix *prefix,
134 struct mali_vertex_tiler_postfix *postfix)
135 {
136 struct panfrost_device *device = pan_device(ctx->base.screen);
137
138 if (!ctx->shader[stage])
139 return;
140
141 memset(prefix, 0, sizeof(*prefix));
142 memset(postfix, 0, sizeof(*postfix));
143
144 if (device->quirks & IS_BIFROST) {
145 postfix->gl_enables = 0x2;
146 panfrost_vt_emit_shared_memory(ctx, postfix);
147 } else {
148 postfix->gl_enables = 0x6;
149 panfrost_vt_attach_framebuffer(ctx, postfix);
150 }
151
152 if (stage == PIPE_SHADER_FRAGMENT) {
153 panfrost_vt_update_occlusion_query(ctx, postfix);
154 panfrost_vt_update_rasterizer(ctx->rasterizer, prefix, postfix);
155 }
156 }
157
158 static unsigned
159 panfrost_translate_index_size(unsigned size)
160 {
161 switch (size) {
162 case 1:
163 return MALI_DRAW_INDEXED_UINT8;
164
165 case 2:
166 return MALI_DRAW_INDEXED_UINT16;
167
168 case 4:
169 return MALI_DRAW_INDEXED_UINT32;
170
171 default:
172 unreachable("Invalid index size");
173 }
174 }
175
176 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
177 * good for the duration of the draw (transient), though it could last longer. Also get
178 * the bounds on the index buffer for the range accessed by the draw. We do
179 * these operations together because there are natural optimizations which
180 * require them to be together. */
181
182 static mali_ptr
183 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
184 const struct pipe_draw_info *info,
185 unsigned *min_index, unsigned *max_index)
186 {
187 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
188 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
189 off_t offset = info->start * info->index_size;
190 bool needs_indices = true;
191 mali_ptr out = 0;
192
193 if (info->max_index != ~0u) {
194 *min_index = info->min_index;
195 *max_index = info->max_index;
196 needs_indices = false;
197 }
198
199 if (!info->has_user_indices) {
200 /* Only resources can be directly mapped */
201 panfrost_batch_add_bo(batch, rsrc->bo,
202 PAN_BO_ACCESS_SHARED |
203 PAN_BO_ACCESS_READ |
204 PAN_BO_ACCESS_VERTEX_TILER);
205 out = rsrc->bo->gpu + offset;
206
207 /* Check the cache */
208 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
209 info->start,
210 info->count,
211 min_index,
212 max_index);
213 } else {
214 /* Otherwise, we need to upload to transient memory */
215 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
216 out = panfrost_pool_upload(&batch->pool, ibuf8 + offset,
217 info->count *
218 info->index_size);
219 }
220
221 if (needs_indices) {
222 /* Fallback */
223 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
224
225 if (!info->has_user_indices)
226 panfrost_minmax_cache_add(rsrc->index_cache,
227 info->start, info->count,
228 *min_index, *max_index);
229 }
230
231 return out;
232 }
233
234 void
235 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
236 const struct pipe_draw_info *info,
237 enum mali_draw_mode draw_mode,
238 struct mali_vertex_tiler_postfix *vertex_postfix,
239 struct mali_vertex_tiler_prefix *tiler_prefix,
240 struct mali_vertex_tiler_postfix *tiler_postfix,
241 unsigned *vertex_count,
242 unsigned *padded_count)
243 {
244 tiler_prefix->draw_mode = draw_mode;
245
246 unsigned draw_flags = 0;
247
248 if (panfrost_writes_point_size(ctx))
249 draw_flags |= MALI_DRAW_VARYING_SIZE;
250
251 if (info->primitive_restart)
252 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
253
254 /* These don't make much sense */
255
256 draw_flags |= 0x3000;
257
258 if (info->index_size) {
259 unsigned min_index = 0, max_index = 0;
260
261 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
262 info,
263 &min_index,
264 &max_index);
265
266 /* Use the corresponding values */
267 *vertex_count = max_index - min_index + 1;
268 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
269 tiler_prefix->offset_bias_correction = -min_index;
270 tiler_prefix->index_count = MALI_POSITIVE(info->count);
271 draw_flags |= panfrost_translate_index_size(info->index_size);
272 } else {
273 tiler_prefix->indices = 0;
274 *vertex_count = ctx->vertex_count;
275 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
276 tiler_prefix->offset_bias_correction = 0;
277 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
278 }
279
280 tiler_prefix->unknown_draw = draw_flags;
281
282 /* Encode the padded vertex count */
283
284 if (info->instance_count > 1) {
285 *padded_count = panfrost_padded_vertex_count(*vertex_count);
286
287 unsigned shift = __builtin_ctz(ctx->padded_count);
288 unsigned k = ctx->padded_count >> (shift + 1);
289
290 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
291 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
292 } else {
293 *padded_count = *vertex_count;
294
295 /* Reset instancing state */
296 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
297 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
298 }
299 }
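
/* Editorial sketch, not part of the original file: the instance_shift /
 * instance_odd encoding above relies on every positive integer factoring
 * uniquely as an odd number times a power of two. With shift = ctz(padded)
 * and k = padded >> (shift + 1), we get padded == (2k + 1) << shift; e.g.
 * padded = 12 (0b1100) gives shift = 2, k = 1, and (2*1 + 1) << 2 == 12.
 * Minimal illustrative check (hypothetical helper, not used by the driver): */

static inline bool
pan_example_check_odd_pot_split(unsigned padded)
{
        unsigned shift = __builtin_ctz(padded);
        unsigned k = padded >> (shift + 1);

        /* Reconstructing the count from the two encoded fields round-trips */
        return ((2 * k + 1) << shift) == padded;
}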
300
301 static void
302 panfrost_shader_meta_init(struct panfrost_context *ctx,
303 enum pipe_shader_type st,
304 struct mali_shader_meta *meta)
305 {
306 const struct panfrost_device *dev = pan_device(ctx->base.screen);
307 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
308
309 memset(meta, 0, sizeof(*meta));
310 meta->shader = (ss->bo ? ss->bo->gpu : 0) | ss->first_tag;
311 meta->attribute_count = ss->attribute_count;
312 meta->varying_count = ss->varying_count;
313 meta->texture_count = ctx->sampler_view_count[st];
314 meta->sampler_count = ctx->sampler_count[st];
315
316 if (dev->quirks & IS_BIFROST) {
317 if (st == PIPE_SHADER_VERTEX)
318 meta->bifrost1.unk1 = 0x800000;
319 else {
320 /* First clause ATEST |= 0x4000000.
321 * Less than 32 regs |= 0x200 */
322 meta->bifrost1.unk1 = 0x950020;
323 }
324
325 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
326 if (st == PIPE_SHADER_VERTEX)
327 meta->bifrost2.preload_regs = 0xC0;
328 else {
329 meta->bifrost2.preload_regs = 0x1;
330 SET_BIT(meta->bifrost2.preload_regs, 0x10, ss->reads_frag_coord);
331 }
332
333 meta->bifrost2.uniform_count = MIN2(ss->uniform_count,
334 ss->uniform_cutoff);
335 } else {
336 meta->midgard1.uniform_count = MIN2(ss->uniform_count,
337 ss->uniform_cutoff);
338 meta->midgard1.work_count = ss->work_reg_count;
339
340 /* TODO: This is not conformant on ES3 */
341 meta->midgard1.flags_hi = MALI_SUPPRESS_INF_NAN;
342
343 meta->midgard1.flags_lo = 0x20;
344 meta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
345
346 SET_BIT(meta->midgard1.flags_hi, MALI_WRITES_GLOBAL, ss->writes_global);
347 }
348 }
349
350 static unsigned
351 translate_tex_wrap(enum pipe_tex_wrap w)
352 {
353 switch (w) {
354 case PIPE_TEX_WRAP_REPEAT: return MALI_WRAP_MODE_REPEAT;
355 case PIPE_TEX_WRAP_CLAMP: return MALI_WRAP_MODE_CLAMP;
356 case PIPE_TEX_WRAP_CLAMP_TO_EDGE: return MALI_WRAP_MODE_CLAMP_TO_EDGE;
357 case PIPE_TEX_WRAP_CLAMP_TO_BORDER: return MALI_WRAP_MODE_CLAMP_TO_BORDER;
358 case PIPE_TEX_WRAP_MIRROR_REPEAT: return MALI_WRAP_MODE_MIRRORED_REPEAT;
359 case PIPE_TEX_WRAP_MIRROR_CLAMP: return MALI_WRAP_MODE_MIRRORED_CLAMP;
360 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE;
361 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER;
362 default: unreachable("Invalid wrap");
363 }
364 }
365
366 /* The hardware compares in the wrong order, so we have to flip before
367 * encoding. Yes, really. */
368
369 static enum mali_func
370 panfrost_sampler_compare_func(const struct pipe_sampler_state *cso)
371 {
372 if (!cso->compare_mode)
373 return MALI_FUNC_NEVER;
374
375 enum mali_func f = panfrost_translate_compare_func(cso->compare_func);
376 return panfrost_flip_compare_func(f);
377 }
378
379 static enum mali_mipmap_mode
380 pan_pipe_to_mipmode(enum pipe_tex_mipfilter f)
381 {
382 switch (f) {
383 case PIPE_TEX_MIPFILTER_NEAREST: return MALI_MIPMAP_MODE_NEAREST;
384 case PIPE_TEX_MIPFILTER_LINEAR: return MALI_MIPMAP_MODE_TRILINEAR;
385 case PIPE_TEX_MIPFILTER_NONE: return MALI_MIPMAP_MODE_NONE;
386 default: unreachable("Invalid");
387 }
388 }
389
390 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
391 struct mali_midgard_sampler_packed *hw)
392 {
393 pan_pack(hw, MIDGARD_SAMPLER, cfg) {
394 cfg.magnify_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
395 cfg.minify_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
396 cfg.mipmap_mode = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR) ?
397 MALI_MIPMAP_MODE_TRILINEAR : MALI_MIPMAP_MODE_NEAREST;
398 cfg.normalized_coordinates = cso->normalized_coords;
399
400 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
401
402 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
403
404 /* If necessary, we disable mipmapping in the sampler descriptor by
405 * clamping the LOD as tight as possible (from 0 to epsilon,
406 * essentially -- remember these are fixed point numbers, so
407 * epsilon=1/256) */
408
409 cfg.maximum_lod = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) ?
410 cfg.minimum_lod + 1 :
411 FIXED_16(cso->max_lod, false);
412
413 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
414 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
415 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
416
417 cfg.compare_function = panfrost_sampler_compare_func(cso);
418 cfg.seamless_cube_map = cso->seamless_cube_map;
419
420 cfg.border_color_r = cso->border_color.f[0];
421 cfg.border_color_g = cso->border_color.f[1];
422 cfg.border_color_b = cso->border_color.f[2];
423 cfg.border_color_a = cso->border_color.f[3];
424 }
425 }
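
/* Editorial sketch, not part of the original file: assuming FIXED_16 stores
 * LOD in 1/256 steps (the epsilon mentioned above), disabling mipmapping is
 * just a one-ulp clamp. For example, min_lod = 0.0 packs as minimum_lod = 0
 * and maximum_lod = 1, i.e. LOD is pinned to [0, 1/256] and only the base
 * level is ever sampled. Hypothetical helper, illustrative only: */

static inline void
pan_example_mip_off_clamp(float min_lod, uint16_t *lo, uint16_t *hi)
{
        /* Presumed 8.8 fixed point: 1 ulp == 1/256 of a mip level */
        *lo = (uint16_t)(min_lod * 256.0f);
        *hi = *lo + 1;
}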
426
427 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
428 struct mali_bifrost_sampler_packed *hw)
429 {
430 pan_pack(hw, BIFROST_SAMPLER, cfg) {
431 cfg.magnify_linear = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR;
432 cfg.minify_linear = cso->min_img_filter == PIPE_TEX_FILTER_LINEAR;
433 cfg.mipmap_mode = pan_pipe_to_mipmode(cso->min_mip_filter);
434 cfg.normalized_coordinates = cso->normalized_coords;
435
436 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
437 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
438 cfg.maximum_lod = FIXED_16(cso->max_lod, false);
439
440 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
441 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
442 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
443
444 cfg.compare_function = panfrost_sampler_compare_func(cso);
445 cfg.seamless_cube_map = cso->seamless_cube_map;
446 }
447 }
448
449 static void
450 panfrost_frag_meta_rasterizer_update(struct panfrost_context *ctx,
451 struct mali_shader_meta *fragmeta)
452 {
453 if (!ctx->rasterizer) {
454 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, true);
455 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, false);
456 fragmeta->depth_units = 0.0f;
457 fragmeta->depth_factor = 0.0f;
458 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, false);
459 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, false);
460 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_NEAR, true);
461 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_FAR, true);
462 return;
463 }
464
465 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
466
467 bool msaa = rast->multisample;
468
469 /* TODO: Sample size */
470 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
471 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
472
473 struct panfrost_shader_state *fs;
474 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
475
476 /* EXT_shader_framebuffer_fetch requires the shader to be run
477 * per-sample when outputs are read. */
478 bool per_sample = ctx->min_samples > 1 || fs->outputs_read;
479 SET_BIT(fragmeta->unknown2_3, MALI_PER_SAMPLE, msaa && per_sample);
480
481 fragmeta->depth_units = rast->offset_units * 2.0f;
482 fragmeta->depth_factor = rast->offset_scale;
483
484 /* XXX: Which bit is which? Does this maybe allow offsetting not-tri? */
485
486 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, rast->offset_tri);
487 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, rast->offset_tri);
488
489 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_NEAR, rast->depth_clip_near);
490 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_FAR, rast->depth_clip_far);
491 }
492
493 static void
494 panfrost_frag_meta_zsa_update(struct panfrost_context *ctx,
495 struct mali_shader_meta *fragmeta)
496 {
497 const struct panfrost_zsa_state *so = ctx->depth_stencil;
498 int zfunc = PIPE_FUNC_ALWAYS;
499
500 if (!so) {
501 /* If stenciling is disabled, the state is irrelevant */
502 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST, false);
503 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK, false);
504 } else {
505 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
506 so->base.stencil[0].enabled);
507
508 fragmeta->stencil_mask_front = so->stencil_mask_front;
509 fragmeta->stencil_mask_back = so->stencil_mask_back;
510
511 /* Bottom bits for stencil ref, exactly one word */
512 fragmeta->stencil_front.opaque[0] = so->stencil_front.opaque[0] | ctx->stencil_ref.ref_value[0];
513
514 /* If back-stencil is not enabled, use the front values */
515
516 if (so->base.stencil[1].enabled)
517 fragmeta->stencil_back.opaque[0] = so->stencil_back.opaque[0] | ctx->stencil_ref.ref_value[1];
518 else
519 fragmeta->stencil_back = fragmeta->stencil_front;
520
521 if (so->base.depth.enabled)
522 zfunc = so->base.depth.func;
523
524 /* Depth state (TODO: Refactor) */
525
526 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
527 so->base.depth.writemask);
528 }
529
530 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
531 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(zfunc));
532 }
533
534 static bool
535 panfrost_fs_required(
536 struct panfrost_shader_state *fs,
537 struct panfrost_blend_final *blend,
538 unsigned rt_count)
539 {
540 /* If we generally have side effects */
541 if (fs->fs_sidefx)
542 return true;
543
544 /* If colour is written we need to execute */
545 for (unsigned i = 0; i < rt_count; ++i) {
546 if (!blend[i].no_colour)
547 return true;
548 }
549
550 /* If depth is written and not implied we need to execute.
551 * TODO: Predicate on Z/S writes being enabled */
552 return (fs->writes_depth || fs->writes_stencil);
553 }
554
555 static void
556 panfrost_frag_meta_blend_update(struct panfrost_context *ctx,
557 struct mali_shader_meta *fragmeta,
558 void *rts)
559 {
560 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
561 const struct panfrost_device *dev = pan_device(ctx->base.screen);
562 struct panfrost_shader_state *fs;
563 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
564
565 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
566 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
567 !ctx->blend->base.dither);
568
569 SET_BIT(fragmeta->unknown2_4, MALI_ALPHA_TO_COVERAGE,
570 ctx->blend->base.alpha_to_coverage);
571
572 /* Get blending setup */
573 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
574
575 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
576 unsigned shader_offset = 0;
577 struct panfrost_bo *shader_bo = NULL;
578
579 for (unsigned c = 0; c < rt_count; ++c)
580 blend[c] = panfrost_get_blend_for_context(ctx, c, &shader_bo,
581 &shader_offset);
582
583 /* Disable shader execution if we can */
584 if (dev->quirks & MIDGARD_SHADERLESS
585 && !panfrost_fs_required(fs, blend, rt_count)) {
586 fragmeta->shader = 0;
587 fragmeta->attribute_count = 0;
588 fragmeta->varying_count = 0;
589 fragmeta->texture_count = 0;
590 fragmeta->sampler_count = 0;
591
592 /* This feature is not known to work on Bifrost */
593 fragmeta->midgard1.work_count = 1;
594 fragmeta->midgard1.uniform_count = 0;
595 fragmeta->midgard1.uniform_buffer_count = 0;
596 }
597
598 /* If there is a blend shader, work registers are shared. We impose 8
599 * work registers as a limit for blend shaders. Should be lower XXX */
600
601 if (!(dev->quirks & IS_BIFROST)) {
602 for (unsigned c = 0; c < rt_count; ++c) {
603 if (blend[c].is_shader) {
604 fragmeta->midgard1.work_count =
605 MAX2(fragmeta->midgard1.work_count, 8);
606 }
607 }
608 }
609
610 /* Even on MFBD, the shader descriptor gets blend shaders. It's *also*
611 * copied to the blend_meta appended (by convention), but this is the
612 * field actually read by the hardware. (Or maybe both are read...?).
613 * Specify the last RTi with a blend shader. */
614
615 fragmeta->blend.shader = 0;
616
617 for (signed rt = (rt_count - 1); rt >= 0; --rt) {
618 if (!blend[rt].is_shader)
619 continue;
620
621 fragmeta->blend.shader = blend[rt].shader.gpu |
622 blend[rt].shader.first_tag;
623 break;
624 }
625
626 if (dev->quirks & MIDGARD_SFBD) {
627 /* On single render target (SFBD) platforms, the blend
628 * information is inside the shader meta itself. We additionally
629 * need to signal CAN_DISCARD for nontrivial blend modes (so
630 * we're able to read back the destination buffer) */
631
632 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
633 blend[0].is_shader);
634
635 if (!blend[0].is_shader) {
636 fragmeta->blend.equation = *blend[0].equation.equation;
637 fragmeta->blend.constant = blend[0].equation.constant;
638 }
639
640 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
641 !blend[0].no_blending || fs->can_discard);
642
643 batch->draws |= PIPE_CLEAR_COLOR0;
644 return;
645 }
646
647 if (dev->quirks & IS_BIFROST) {
648 bool no_blend = true;
649
650 for (unsigned i = 0; i < rt_count; ++i)
651 no_blend &= (blend[i].no_blending | blend[i].no_colour);
652
653 SET_BIT(fragmeta->bifrost1.unk1, MALI_BIFROST_EARLY_Z,
654 !fs->can_discard && !fs->writes_depth && no_blend);
655 }
656
657 /* Additional blend descriptor tacked on for jobs using MFBD */
658
659 for (unsigned i = 0; i < rt_count; ++i) {
660 unsigned flags = 0;
661
662 if (ctx->pipe_framebuffer.nr_cbufs > i && !blend[i].no_colour) {
663 flags = 0x200;
664 batch->draws |= (PIPE_CLEAR_COLOR0 << i);
665
666 bool is_srgb = (ctx->pipe_framebuffer.nr_cbufs > i) &&
667 (ctx->pipe_framebuffer.cbufs[i]) &&
668 util_format_is_srgb(ctx->pipe_framebuffer.cbufs[i]->format);
669
670 SET_BIT(flags, MALI_BLEND_MRT_SHADER, blend[i].is_shader);
671 SET_BIT(flags, MALI_BLEND_LOAD_TIB, !blend[i].no_blending);
672 SET_BIT(flags, MALI_BLEND_SRGB, is_srgb);
673 SET_BIT(flags, MALI_BLEND_NO_DITHER, !ctx->blend->base.dither);
674 }
675
676 if (dev->quirks & IS_BIFROST) {
677 struct bifrost_blend_rt *brts = rts;
678
679 brts[i].flags = flags;
680
681 if (blend[i].is_shader) {
682 /* The blend shader's address needs to be at
683 * the same top 32 bits as the fragment shader.
684 * TODO: Ensure that's always the case.
685 */
686 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
687 (fs->bo->gpu & (0xffffffffull << 32)));
688 brts[i].shader = blend[i].shader.gpu;
689 brts[i].unk2 = 0x0;
690 } else if (ctx->pipe_framebuffer.nr_cbufs > i) {
691 enum pipe_format format = ctx->pipe_framebuffer.cbufs[i]->format;
692 const struct util_format_description *format_desc;
693 format_desc = util_format_description(format);
694
695 brts[i].equation = *blend[i].equation.equation;
696
697 /* TODO: this is a bit more complicated */
698 brts[i].constant = blend[i].equation.constant;
699
700 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
701
702 /* 0x19 disables blending and forces REPLACE
703 * mode (equivalent to rgb_mode = alpha_mode =
704 * x122, colour mask = 0xF). 0x1a allows
705 * blending. */
706 brts[i].unk2 = blend[i].no_blending ? 0x19 : 0x1a;
707
708 brts[i].shader_type = fs->blend_types[i];
709 } else {
710 /* Dummy attachment for depth-only */
711 brts[i].unk2 = 0x3;
712 brts[i].shader_type = fs->blend_types[i];
713 }
714 } else {
715 struct midgard_blend_rt *mrts = rts;
716 mrts[i].flags = flags;
717
718 if (blend[i].is_shader) {
719 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
720 } else {
721 mrts[i].blend.equation = *blend[i].equation.equation;
722 mrts[i].blend.constant = blend[i].equation.constant;
723 }
724 }
725 }
726 }
727
728 static void
729 panfrost_frag_shader_meta_init(struct panfrost_context *ctx,
730 struct mali_shader_meta *fragmeta,
731 void *rts)
732 {
733 const struct panfrost_device *dev = pan_device(ctx->base.screen);
734 struct panfrost_shader_state *fs;
735
736 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
737
738 bool msaa = ctx->rasterizer && ctx->rasterizer->base.multisample;
739 fragmeta->coverage_mask = msaa ? ctx->sample_mask : ~0;
740
741 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x10;
742 fragmeta->unknown2_4 = 0x4e0;
743
744 /* unknown2_4 has 0x10 bit set on T6XX and T720. We don't know why this
745 * is required (independent of 32-bit/64-bit descriptors), or why it's
746 * not used on later GPU revisions. Otherwise, all shader jobs fault on
747 * these earlier chips (perhaps this is a chicken bit of some kind).
748 * More investigation is needed. */
749
750 SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
751
752 if (dev->quirks & IS_BIFROST) {
753 /* TODO */
754 } else {
755 /* Depending on whether it's legal in the given shader, we try to
756 * enable early-z testing. TODO: respect e-z force */
757
758 SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
759 !fs->can_discard && !fs->writes_global &&
760 !fs->writes_depth && !fs->writes_stencil &&
761 !ctx->blend->base.alpha_to_coverage);
762
763 /* Add the writes Z/S flags if needed. */
764 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
765 SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
766
767 /* Any time texturing is used, derivatives are implicitly calculated,
768 * so we need to enable helper invocations */
769
770 SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
771 fs->helper_invocations);
772
773 /* If discard is enabled, which bit we set to convey this
774 * depends on if depth/stencil is used for the draw or not.
775 * Just one of depth OR stencil is enough to trigger this. */
776
777 const struct pipe_depth_stencil_alpha_state *zsa = &ctx->depth_stencil->base;
778 bool zs_enabled = fs->writes_depth || fs->writes_stencil;
779
780 if (zsa) {
781 zs_enabled |= (zsa->depth.enabled && zsa->depth.func != PIPE_FUNC_ALWAYS);
782 zs_enabled |= zsa->stencil[0].enabled;
783 }
784
785 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_TILEBUFFER,
786 fs->outputs_read || (!zs_enabled && fs->can_discard));
787 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_ZS, zs_enabled && fs->can_discard);
788 }
789
790 panfrost_frag_meta_rasterizer_update(ctx, fragmeta);
791 panfrost_frag_meta_zsa_update(ctx, fragmeta);
792 panfrost_frag_meta_blend_update(ctx, fragmeta, rts);
793 }
794
795 void
796 panfrost_emit_shader_meta(struct panfrost_batch *batch,
797 enum pipe_shader_type st,
798 struct mali_vertex_tiler_postfix *postfix)
799 {
800 struct panfrost_context *ctx = batch->ctx;
801 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
802
803 if (!ss) {
804 postfix->shader = 0;
805 return;
806 }
807
808 struct mali_shader_meta meta;
809
810 panfrost_shader_meta_init(ctx, st, &meta);
811
812 /* Add the shader BO to the batch. */
813 panfrost_batch_add_bo(batch, ss->bo,
814 PAN_BO_ACCESS_PRIVATE |
815 PAN_BO_ACCESS_READ |
816 panfrost_bo_access_for_stage(st));
817
818 mali_ptr shader_ptr;
819
820 if (st == PIPE_SHADER_FRAGMENT) {
821 struct panfrost_device *dev = pan_device(ctx->base.screen);
822 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
823 size_t desc_size = sizeof(meta);
824 void *rts = NULL;
825 struct panfrost_transfer xfer;
826 unsigned rt_size;
827
828 if (dev->quirks & MIDGARD_SFBD)
829 rt_size = 0;
830 else if (dev->quirks & IS_BIFROST)
831 rt_size = sizeof(struct bifrost_blend_rt);
832 else
833 rt_size = sizeof(struct midgard_blend_rt);
834
835 desc_size += rt_size * rt_count;
836
837 if (rt_size)
838 rts = rzalloc_size(ctx, rt_size * rt_count);
839
840 panfrost_frag_shader_meta_init(ctx, &meta, rts);
841
842 xfer = panfrost_pool_alloc(&batch->pool, desc_size);
843
844 memcpy(xfer.cpu, &meta, sizeof(meta));
845 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
846
847 if (rt_size)
848 ralloc_free(rts);
849
850 shader_ptr = xfer.gpu;
851 } else {
852 shader_ptr = panfrost_pool_upload(&batch->pool, &meta,
853 sizeof(meta));
854 }
855
856 postfix->shader = shader_ptr;
857 }
858
859 void
860 panfrost_emit_viewport(struct panfrost_batch *batch,
861 struct mali_vertex_tiler_postfix *tiler_postfix)
862 {
863 struct panfrost_context *ctx = batch->ctx;
864 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
865 const struct pipe_scissor_state *ss = &ctx->scissor;
866 const struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
867 const struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer;
868
869 /* Derive min/max from translate/scale. Note since |x| >= 0 by
870 * definition, we have that -|x| <= |x| hence translate - |scale| <=
871 * translate + |scale|, so the ordering is correct here. */
872 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
873 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
874 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
875 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
876 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
877 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
878
879 /* Scissor to the intersection of the viewport and the scissor, clamped
880 * to the framebuffer */
881
882 unsigned minx = MIN2(fb->width, vp_minx);
883 unsigned maxx = MIN2(fb->width, vp_maxx);
884 unsigned miny = MIN2(fb->height, vp_miny);
885 unsigned maxy = MIN2(fb->height, vp_maxy);
886
887 if (ss && rast && rast->scissor) {
888 minx = MAX2(ss->minx, minx);
889 miny = MAX2(ss->miny, miny);
890 maxx = MIN2(ss->maxx, maxx);
891 maxy = MIN2(ss->maxy, maxy);
892 }
893
894 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool, MALI_VIEWPORT_LENGTH);
895
896 pan_pack(T.cpu, VIEWPORT, cfg) {
897 cfg.scissor_minimum_x = minx;
898 cfg.scissor_minimum_y = miny;
899 cfg.scissor_maximum_x = maxx - 1;
900 cfg.scissor_maximum_y = maxy - 1;
901
902 cfg.minimum_z = rast->depth_clip_near ? minz : -INFINITY;
903 cfg.maximum_z = rast->depth_clip_far ? maxz : INFINITY;
904 }
905
906 tiler_postfix->viewport = T.gpu;
907 panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy);
908 }
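
/* Editorial sketch, not part of the original file: with Gallium's viewport
 * transform (screen = ndc * scale + translate), a full-framebuffer viewport
 * for a 1920x1080 target typically has scale = (960, 540, 0.5) and
 * translate = (960, 540, 0.5), so the math above yields minx/maxx = 0/1920,
 * miny/maxy = 0/1080, minz/maxz = 0/1, and the packed scissor covers
 * [0, 1919] x [0, 1079] since maximum_x/y are inclusive. Illustrative only: */

static inline void
pan_example_viewport_bounds(const float translate[3], const float scale[3],
                            float *minv, float *maxv)
{
        for (unsigned i = 0; i < 3; ++i) {
                minv[i] = translate[i] - fabsf(scale[i]);
                maxv[i] = translate[i] + fabsf(scale[i]);
        }
}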
909
910 static mali_ptr
911 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
912 enum pipe_shader_type st,
913 struct panfrost_constant_buffer *buf,
914 unsigned index)
915 {
916 struct pipe_constant_buffer *cb = &buf->cb[index];
917 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
918
919 if (rsrc) {
920 panfrost_batch_add_bo(batch, rsrc->bo,
921 PAN_BO_ACCESS_SHARED |
922 PAN_BO_ACCESS_READ |
923 panfrost_bo_access_for_stage(st));
924
925 /* Alignment guaranteed by
926 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
927 return rsrc->bo->gpu + cb->buffer_offset;
928 } else if (cb->user_buffer) {
929 return panfrost_pool_upload(&batch->pool,
930 cb->user_buffer +
931 cb->buffer_offset,
932 cb->buffer_size);
933 } else {
934 unreachable("No constant buffer");
935 }
936 }
937
938 struct sysval_uniform {
939 union {
940 float f[4];
941 int32_t i[4];
942 uint32_t u[4];
943 uint64_t du[2];
944 };
945 };
946
947 static void
948 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
949 struct sysval_uniform *uniform)
950 {
951 struct panfrost_context *ctx = batch->ctx;
952 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
953
954 uniform->f[0] = vp->scale[0];
955 uniform->f[1] = vp->scale[1];
956 uniform->f[2] = vp->scale[2];
957 }
958
959 static void
960 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
961 struct sysval_uniform *uniform)
962 {
963 struct panfrost_context *ctx = batch->ctx;
964 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
965
966 uniform->f[0] = vp->translate[0];
967 uniform->f[1] = vp->translate[1];
968 uniform->f[2] = vp->translate[2];
969 }
970
971 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
972 enum pipe_shader_type st,
973 unsigned int sysvalid,
974 struct sysval_uniform *uniform)
975 {
976 struct panfrost_context *ctx = batch->ctx;
977 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
978 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
979 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
980 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
981
982 assert(dim);
983 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
984
985 if (dim > 1)
986 uniform->i[1] = u_minify(tex->texture->height0,
987 tex->u.tex.first_level);
988
989 if (dim > 2)
990 uniform->i[2] = u_minify(tex->texture->depth0,
991 tex->u.tex.first_level);
992
993 if (is_array)
994 uniform->i[dim] = tex->texture->array_size;
995 }
996
997 static void
998 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
999 enum pipe_shader_type st,
1000 unsigned ssbo_id,
1001 struct sysval_uniform *uniform)
1002 {
1003 struct panfrost_context *ctx = batch->ctx;
1004
1005 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
1006 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
1007
1008 /* Compute address */
1009 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
1010
1011 panfrost_batch_add_bo(batch, bo,
1012 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
1013 panfrost_bo_access_for_stage(st));
1014
1015 /* Upload address and size as sysval */
1016 uniform->du[0] = bo->gpu + sb.buffer_offset;
1017 uniform->u[2] = sb.buffer_size;
1018 }
1019
1020 static void
1021 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1022 enum pipe_shader_type st,
1023 unsigned samp_idx,
1024 struct sysval_uniform *uniform)
1025 {
1026 struct panfrost_context *ctx = batch->ctx;
1027 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1028
1029 uniform->f[0] = sampl->min_lod;
1030 uniform->f[1] = sampl->max_lod;
1031 uniform->f[2] = sampl->lod_bias;
1032
1033 /* Even without any errata, Midgard represents "no mipmapping" as
1034 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1035 * panfrost_create_sampler_state which also explains our choice of
1036 * epsilon value (again to keep behaviour consistent) */
1037
1038 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1039 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1040 }
1041
1042 static void
1043 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1044 struct sysval_uniform *uniform)
1045 {
1046 struct panfrost_context *ctx = batch->ctx;
1047
1048 uniform->u[0] = ctx->compute_grid->grid[0];
1049 uniform->u[1] = ctx->compute_grid->grid[1];
1050 uniform->u[2] = ctx->compute_grid->grid[2];
1051 }
1052
1053 static void
1054 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1055 struct panfrost_shader_state *ss,
1056 enum pipe_shader_type st)
1057 {
1058 struct sysval_uniform *uniforms = (void *)buf;
1059
1060 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1061 int sysval = ss->sysval[i];
1062
1063 switch (PAN_SYSVAL_TYPE(sysval)) {
1064 case PAN_SYSVAL_VIEWPORT_SCALE:
1065 panfrost_upload_viewport_scale_sysval(batch,
1066 &uniforms[i]);
1067 break;
1068 case PAN_SYSVAL_VIEWPORT_OFFSET:
1069 panfrost_upload_viewport_offset_sysval(batch,
1070 &uniforms[i]);
1071 break;
1072 case PAN_SYSVAL_TEXTURE_SIZE:
1073 panfrost_upload_txs_sysval(batch, st,
1074 PAN_SYSVAL_ID(sysval),
1075 &uniforms[i]);
1076 break;
1077 case PAN_SYSVAL_SSBO:
1078 panfrost_upload_ssbo_sysval(batch, st,
1079 PAN_SYSVAL_ID(sysval),
1080 &uniforms[i]);
1081 break;
1082 case PAN_SYSVAL_NUM_WORK_GROUPS:
1083 panfrost_upload_num_work_groups_sysval(batch,
1084 &uniforms[i]);
1085 break;
1086 case PAN_SYSVAL_SAMPLER:
1087 panfrost_upload_sampler_sysval(batch, st,
1088 PAN_SYSVAL_ID(sysval),
1089 &uniforms[i]);
1090 break;
1091 default:
1092 assert(0);
1093 }
1094 }
1095 }
1096
1097 static const void *
1098 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1099 unsigned index)
1100 {
1101 struct pipe_constant_buffer *cb = &buf->cb[index];
1102 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1103
1104 if (rsrc)
1105 return rsrc->bo->cpu;
1106 else if (cb->user_buffer)
1107 return cb->user_buffer;
1108 else
1109 unreachable("No constant buffer");
1110 }
1111
1112 void
1113 panfrost_emit_const_buf(struct panfrost_batch *batch,
1114 enum pipe_shader_type stage,
1115 struct mali_vertex_tiler_postfix *postfix)
1116 {
1117 struct panfrost_context *ctx = batch->ctx;
1118 struct panfrost_shader_variants *all = ctx->shader[stage];
1119
1120 if (!all)
1121 return;
1122
1123 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1124
1125 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1126
1127 /* Uniforms are implicitly UBO #0 */
1128 bool has_uniforms = buf->enabled_mask & (1 << 0);
1129
1130 /* Allocate room for the sysval and the uniforms */
1131 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1132 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1133 size_t size = sys_size + uniform_size;
1134 struct panfrost_transfer transfer = panfrost_pool_alloc(&batch->pool,
1135 size);
1136
1137 /* Upload sysvals requested by the shader */
1138 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1139
1140 /* Upload uniforms */
1141 if (has_uniforms && uniform_size) {
1142 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1143 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1144 }
1145
1146 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1147 * uploaded */
1148
1149 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1150 assert(ubo_count >= 1);
1151
1152 size_t sz = MALI_UNIFORM_BUFFER_LENGTH * ubo_count;
1153 struct panfrost_transfer ubos = panfrost_pool_alloc(&batch->pool, sz);
1154 uint64_t *ubo_ptr = (uint64_t *) ubos.cpu;
1155
1156 /* Upload uniforms as a UBO */
1157
1158 if (ss->uniform_count) {
1159 pan_pack(ubo_ptr, UNIFORM_BUFFER, cfg) {
1160 cfg.entries = ss->uniform_count;
1161 cfg.pointer = transfer.gpu;
1162 }
1163 } else {
1164 *ubo_ptr = 0;
1165 }
1166
1167 /* The rest are honest-to-goodness UBOs */
1168
1169 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1170 size_t usz = buf->cb[ubo].buffer_size;
1171 bool enabled = buf->enabled_mask & (1 << ubo);
1172 bool empty = usz == 0;
1173
1174 if (!enabled || empty) {
1175 ubo_ptr[ubo] = 0;
1176 continue;
1177 }
1178
1179 pan_pack(ubo_ptr + ubo, UNIFORM_BUFFER, cfg) {
1180 cfg.entries = DIV_ROUND_UP(usz, 16);
1181 cfg.pointer = panfrost_map_constant_buffer_gpu(batch,
1182 stage, buf, ubo);
1183 }
1184 }
1185
1186 postfix->uniforms = transfer.gpu;
1187 postfix->uniform_buffers = ubos.gpu;
1188
1189 buf->dirty_mask = 0;
1190 }
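
/* Editorial sketch, not part of the original file: the transient upload built
 * above is laid out as [sysvals][uniforms], one 16-byte vec4 slot per sysval.
 * For instance, a shader with 3 sysvals and a 64-byte UBO 0 produces a
 * 112-byte upload with the sysvals at bytes [0, 48) and the user uniforms at
 * [48, 112). Hypothetical helper for the uniform offset: */

static inline size_t
pan_example_uniform_offset(unsigned sysval_count)
{
        /* Uniforms follow the sysvals, which occupy a vec4 (16 bytes) each */
        return sizeof(float) * 4 * sysval_count;
}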
1191
1192 void
1193 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1194 const struct pipe_grid_info *info,
1195 struct midgard_payload_vertex_tiler *vtp)
1196 {
1197 struct panfrost_context *ctx = batch->ctx;
1198 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1199 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1200 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1201 128));
1202 unsigned shared_size = single_size * info->grid[0] * info->grid[1] *
1203 info->grid[2] * 4;
1204 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1205 shared_size,
1206 1);
1207
1208 struct mali_shared_memory shared = {
1209 .shared_memory = bo->gpu,
1210 .shared_workgroup_count =
1211 util_logbase2_ceil(info->grid[0]) +
1212 util_logbase2_ceil(info->grid[1]) +
1213 util_logbase2_ceil(info->grid[2]),
1214 .shared_unk1 = 0x2,
1215 .shared_shift = util_logbase2(single_size) - 1
1216 };
1217
1218 vtp->postfix.shared_memory = panfrost_pool_upload(&batch->pool, &shared,
1219 sizeof(shared));
1220 }
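
/* Editorial note, not part of the original file: a worked example of the
 * sizing above. For a (4, 4, 1) grid with ss->shared_size = 200 bytes,
 * single_size rounds up to 256, the allocation is 256 * 4 * 4 * 1 * 4 =
 * 16384 bytes, shared_shift = log2(256) - 1 = 7, and shared_workgroup_count
 * = ceil(log2(4)) + ceil(log2(4)) + ceil(log2(1)) = 2 + 2 + 0 = 4. */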
1221
1222 static mali_ptr
1223 panfrost_get_tex_desc(struct panfrost_batch *batch,
1224 enum pipe_shader_type st,
1225 struct panfrost_sampler_view *view)
1226 {
1227 if (!view)
1228 return (mali_ptr) 0;
1229
1230 struct pipe_sampler_view *pview = &view->base;
1231 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1232
1233 /* Add the BO to the job so it's retained until the job is done. */
1234
1235 panfrost_batch_add_bo(batch, rsrc->bo,
1236 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1237 panfrost_bo_access_for_stage(st));
1238
1239 panfrost_batch_add_bo(batch, view->bo,
1240 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1241 panfrost_bo_access_for_stage(st));
1242
1243 return view->bo->gpu;
1244 }
1245
1246 static void
1247 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
1248 struct pipe_context *pctx)
1249 {
1250 struct panfrost_resource *rsrc = pan_resource(view->base.texture);
1251 if (view->texture_bo != rsrc->bo->gpu ||
1252 view->modifier != rsrc->modifier) {
1253 panfrost_bo_unreference(view->bo);
1254 panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
1255 }
1256 }
1257
1258 void
1259 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1260 enum pipe_shader_type stage,
1261 struct mali_vertex_tiler_postfix *postfix)
1262 {
1263 struct panfrost_context *ctx = batch->ctx;
1264 struct panfrost_device *device = pan_device(ctx->base.screen);
1265
1266 if (!ctx->sampler_view_count[stage])
1267 return;
1268
1269 if (device->quirks & IS_BIFROST) {
1270 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool,
1271 MALI_BIFROST_TEXTURE_LENGTH *
1272 ctx->sampler_view_count[stage]);
1273
1274 struct mali_bifrost_texture_packed *out =
1275 (struct mali_bifrost_texture_packed *) T.cpu;
1276
1277 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1278 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1279 struct pipe_sampler_view *pview = &view->base;
1280 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1281
1282 panfrost_update_sampler_view(view, &ctx->base);
1283 out[i] = view->bifrost_descriptor;
1284
1285 /* Add the BOs to the job so they are retained until the job is done. */
1286
1287 panfrost_batch_add_bo(batch, rsrc->bo,
1288 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1289 panfrost_bo_access_for_stage(stage));
1290
1291 panfrost_batch_add_bo(batch, view->bo,
1292 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1293 panfrost_bo_access_for_stage(stage));
1294 }
1295
1296 postfix->textures = T.gpu;
1297 } else {
1298 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1299
1300 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1301 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1302
1303 panfrost_update_sampler_view(view, &ctx->base);
1304
1305 trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1306 }
1307
1308 postfix->textures = panfrost_pool_upload(&batch->pool,
1309 trampolines,
1310 sizeof(uint64_t) *
1311 ctx->sampler_view_count[stage]);
1312 }
1313 }
1314
1315 void
1316 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1317 enum pipe_shader_type stage,
1318 struct mali_vertex_tiler_postfix *postfix)
1319 {
1320 struct panfrost_context *ctx = batch->ctx;
1321
1322 if (!ctx->sampler_count[stage])
1323 return;
1324
1325 size_t desc_size = MALI_BIFROST_SAMPLER_LENGTH;
1326 assert(MALI_BIFROST_SAMPLER_LENGTH == MALI_MIDGARD_SAMPLER_LENGTH);
1327
1328 size_t sz = desc_size * ctx->sampler_count[stage];
1329 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool, sz);
1330 struct mali_midgard_sampler_packed *out = (struct mali_midgard_sampler_packed *) T.cpu;
1331
1332 for (unsigned i = 0; i < ctx->sampler_count[stage]; ++i)
1333 out[i] = ctx->samplers[stage][i]->hw;
1334
1335 postfix->sampler_descriptor = T.gpu;
1336 }
1337
1338 void
1339 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1340 struct mali_vertex_tiler_postfix *vertex_postfix)
1341 {
1342 struct panfrost_context *ctx = batch->ctx;
1343 struct panfrost_vertex_state *so = ctx->vertex;
1344
1345 unsigned instance_shift = vertex_postfix->instance_shift;
1346 unsigned instance_odd = vertex_postfix->instance_odd;
1347
1348 /* Worst case: everything is NPOT */
1349
1350 struct panfrost_transfer S = panfrost_pool_alloc(&batch->pool,
1351 MALI_ATTRIBUTE_LENGTH * PIPE_MAX_ATTRIBS * 2);
1352
1353 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool,
1354 MALI_ATTRIBUTE_LENGTH * (PAN_INSTANCE_ID + 1));
1355
1356 struct mali_attribute_buffer_packed *bufs =
1357 (struct mali_attribute_buffer_packed *) S.cpu;
1358
1359 struct mali_attribute_packed *out =
1360 (struct mali_attribute_packed *) T.cpu;
1361
1362 unsigned attrib_to_buffer[PIPE_MAX_ATTRIBS] = { 0 };
1363 unsigned k = 0;
1364
1365 for (unsigned i = 0; i < so->num_elements; ++i) {
1366 /* We map buffers 1:1 with the attributes, which
1367 * means duplicating some vertex buffers (who cares? aside from
1368 * maybe some caching implications but I somehow doubt that
1369 * matters) */
1370
1371 struct pipe_vertex_element *elem = &so->pipe[i];
1372 unsigned vbi = elem->vertex_buffer_index;
1373 attrib_to_buffer[i] = k;
1374
1375 if (!(ctx->vb_mask & (1 << vbi)))
1376 continue;
1377
1378 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1379 struct panfrost_resource *rsrc;
1380
1381 rsrc = pan_resource(buf->buffer.resource);
1382 if (!rsrc)
1383 continue;
1384
1385 /* Add a dependency of the batch on the vertex buffer */
1386 panfrost_batch_add_bo(batch, rsrc->bo,
1387 PAN_BO_ACCESS_SHARED |
1388 PAN_BO_ACCESS_READ |
1389 PAN_BO_ACCESS_VERTEX_TILER);
1390
1391 /* Mask off lower bits, see offset fixup below */
1392 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1393 mali_ptr addr = raw_addr & ~63;
1394
1395 /* Since we advanced the base pointer, we shrink the buffer
1396 * size, but add the offset we subtracted */
1397 unsigned size = rsrc->base.width0 + (raw_addr - addr)
1398 - buf->buffer_offset;
1399
1400 /* When there is a divisor, the hardware-level divisor is
1401 * the product of the instance divisor and the padded count */
1402 unsigned divisor = elem->instance_divisor;
1403 unsigned hw_divisor = ctx->padded_count * divisor;
1404 unsigned stride = buf->stride;
1405
1406 /* If there's a divisor(=1) but no instancing, we want every
1407 * attribute to be the same */
1408
1409 if (divisor && ctx->instance_count == 1)
1410 stride = 0;
1411
1412 if (!divisor || ctx->instance_count <= 1) {
1413 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1414 if (ctx->instance_count > 1)
1415 cfg.type = MALI_ATTRIBUTE_TYPE_1D_MODULUS;
1416
1417 cfg.pointer = addr;
1418 cfg.stride = stride;
1419 cfg.size = size;
1420 cfg.divisor_r = instance_shift;
1421 cfg.divisor_p = instance_odd;
1422 }
1423 } else if (util_is_power_of_two_or_zero(hw_divisor)) {
1424 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1425 cfg.type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
1426 cfg.pointer = addr;
1427 cfg.stride = stride;
1428 cfg.size = size;
1429 cfg.divisor_r = __builtin_ctz(hw_divisor);
1430 }
1431
1432 } else {
1433 unsigned shift = 0, extra_flags = 0;
1434
1435 unsigned magic_divisor =
1436 panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags);
1437
1438 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1439 cfg.type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
1440 cfg.pointer = addr;
1441 cfg.stride = stride;
1442 cfg.size = size;
1443
1444 cfg.divisor_r = shift;
1445 cfg.divisor_e = extra_flags;
1446 }
1447
1448 pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) {
1449 cfg.divisor_numerator = magic_divisor;
1450 cfg.divisor = divisor;
1451 }
1452
1453 ++k;
1454 }
1455
1456 ++k;
1457 }
1458
1459 /* Add special gl_VertexID/gl_InstanceID buffers */
1460
1461 panfrost_vertex_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1462
1463 pan_pack(out + PAN_VERTEX_ID, ATTRIBUTE, cfg) {
1464 cfg.buffer_index = k++;
1465 cfg.format = so->formats[PAN_VERTEX_ID];
1466 }
1467
1468 panfrost_instance_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1469
1470 pan_pack(out + PAN_INSTANCE_ID, ATTRIBUTE, cfg) {
1471 cfg.buffer_index = k++;
1472 cfg.format = so->formats[PAN_INSTANCE_ID];
1473 }
1474
1475 /* Attribute addresses require 64-byte alignment, so let:
1476 *
1477 * base' = base & ~63 = base - (base & 63)
1478 * offset' = offset + (base & 63)
1479 *
1480 * Since base' + offset' = base + offset, these are equivalent
1481 * addressing modes and now base is 64 aligned.
1482 */
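
/* Editorial note, not part of the original file: for example, a BO mapped at
 * 0x1000000 (4k aligned, hence 64-byte aligned) with buffer_offset = 100
 * gives base = 0x1000064, base & 63 = 36, base' = 0x1000040, and
 * offset' = src_offset + 36; the sum is unchanged, but the pointer written to
 * the attribute buffer descriptor is now 64-byte aligned. */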
1483
1484 unsigned start = vertex_postfix->offset_start;
1485
1486 for (unsigned i = 0; i < so->num_elements; ++i) {
1487 unsigned vbi = so->pipe[i].vertex_buffer_index;
1488 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1489
1490 /* Adjust by the masked off bits of the offset. Make sure we
1491 * read src_offset from so->hw (which is not GPU visible)
1492 * rather than target (which is) due to caching effects */
1493
1494 unsigned src_offset = so->pipe[i].src_offset;
1495
1496 /* BOs aligned to 4k so guaranteed aligned to 64 */
1497 src_offset += (buf->buffer_offset & 63);
1498
1499 /* Also, somewhat obscurely, per-instance data needs to be
1500 * offset in response to a delayed start in an indexed draw */
1501
1502 if (so->pipe[i].instance_divisor && ctx->instance_count > 1 && start)
1503 src_offset -= buf->stride * start;
1504
1505 pan_pack(out + i, ATTRIBUTE, cfg) {
1506 cfg.buffer_index = attrib_to_buffer[i];
1507 cfg.format = so->formats[i];
1508 cfg.offset = src_offset;
1509 }
1510 }
1511
1512 vertex_postfix->attributes = S.gpu;
1513 vertex_postfix->attribute_meta = T.gpu;
1514 }
1515
1516 static mali_ptr
1517 panfrost_emit_varyings(struct panfrost_batch *batch,
1518 struct mali_attribute_buffer_packed *slot,
1519 unsigned stride, unsigned count)
1520 {
1521 unsigned size = stride * count;
1522 mali_ptr ptr = panfrost_pool_alloc(&batch->pool, size).gpu;
1523
1524 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1525 cfg.stride = stride;
1526 cfg.size = size;
1527 cfg.pointer = ptr;
1528 }
1529
1530 return ptr;
1531 }
1532
1533 static unsigned
1534 panfrost_streamout_offset(unsigned stride, unsigned offset,
1535 struct pipe_stream_output_target *target)
1536 {
1537 return (target->buffer_offset + (offset * stride * 4)) & 63;
1538 }
1539
1540 static void
1541 panfrost_emit_streamout(struct panfrost_batch *batch,
1542 struct mali_attribute_buffer_packed *slot,
1543 unsigned stride_words, unsigned offset, unsigned count,
1544 struct pipe_stream_output_target *target)
1545 {
1546 unsigned stride = stride_words * 4;
1547 unsigned max_size = target->buffer_size;
1548 unsigned expected_size = stride * count;
1549
1550 /* Grab the BO and bind it to the batch */
1551 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1552
1553 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1554 * the perspective of the TILER and FRAGMENT.
1555 */
1556 panfrost_batch_add_bo(batch, bo,
1557 PAN_BO_ACCESS_SHARED |
1558 PAN_BO_ACCESS_RW |
1559 PAN_BO_ACCESS_VERTEX_TILER |
1560 PAN_BO_ACCESS_FRAGMENT);
1561
1562 /* We will have an offset applied to get alignment */
1563 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * stride);
1564
1565 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1566 cfg.pointer = (addr & ~63);
1567 cfg.stride = stride;
1568 cfg.size = MIN2(max_size, expected_size) + (addr & 63);
1569 }
1570 }
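
/* Editorial note, not part of the original file: this is the same 64-byte
 * alignment trick used for vertex attribute buffers. E.g. with
 * buffer_offset = 100, offset = 3 and stride_words = 4 (a 16-byte stride),
 * the raw address is bo->gpu + 148; the descriptor stores (addr & ~63) and
 * pads the size by addr & 63 = 20 bytes, while panfrost_streamout_offset()
 * above returns the same remainder (20), presumably consumed as the matching
 * record-level offset. */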
1571
1572 static bool
1573 has_point_coord(unsigned mask, gl_varying_slot loc)
1574 {
1575 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1576 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1577 else if (loc == VARYING_SLOT_PNTC)
1578 return (mask & (1 << 8));
1579 else
1580 return false;
1581 }
1582
1583 /* Helpers for manipulating stream out information so we can pack varyings
1584 * accordingly. Compute the src_offset for a given captured varying */
1585
1586 static struct pipe_stream_output *
1587 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1588 {
1589 for (unsigned i = 0; i < info->num_outputs; ++i) {
1590 if (info->output[i].register_index == loc)
1591 return &info->output[i];
1592 }
1593
1594 unreachable("Varying not captured");
1595 }
1596
1597 static unsigned
1598 pan_varying_size(enum mali_format fmt)
1599 {
1600 unsigned type = MALI_EXTRACT_TYPE(fmt);
1601 unsigned chan = MALI_EXTRACT_CHANNELS(fmt);
1602 unsigned bits = MALI_EXTRACT_BITS(fmt);
1603 unsigned bpc = 0;
1604
1605 if (bits == MALI_CHANNEL_FLOAT) {
1606 /* No doubles */
1607 bool fp16 = (type == MALI_FORMAT_SINT);
1608 assert(fp16 || (type == MALI_FORMAT_UNORM));
1609
1610 bpc = fp16 ? 2 : 4;
1611 } else {
1612 assert(type >= MALI_FORMAT_SNORM && type <= MALI_FORMAT_SINT);
1613
1614 /* See the enums */
1615 bits = 1 << bits;
1616 assert(bits >= 8);
1617 bpc = bits / 8;
1618 }
1619
1620 return bpc * chan;
1621 }
1622
1623 /* Indices for named (non-XFB) varyings that are present. These are packed
1624 * tightly so they correspond to a bitfield present (P) indexed by (1 <<
1625 * PAN_VARY_*). This has the nice property that you can lookup the buffer index
1626 * of a given special field given a shift S by:
1627 *
1628 * idx = popcount(P & ((1 << S) - 1))
1629 *
1630 * That is... look at all of the varyings that come earlier and count them; the
1631 * count is the new index. Likewise, the total number of special
1632 * buffers required is simply popcount(P)
1633 */
1634
1635 enum pan_special_varying {
1636 PAN_VARY_GENERAL = 0,
1637 PAN_VARY_POSITION = 1,
1638 PAN_VARY_PSIZ = 2,
1639 PAN_VARY_PNTCOORD = 3,
1640 PAN_VARY_FACE = 4,
1641 PAN_VARY_FRAGCOORD = 5,
1642
1643 /* Keep last */
1644 PAN_VARY_MAX,
1645 };
1646
1647 /* Given a varying, figure out which index it corresponds to */
1648
1649 static inline unsigned
1650 pan_varying_index(unsigned present, enum pan_special_varying v)
1651 {
1652 unsigned mask = (1 << v) - 1;
1653 return util_bitcount(present & mask);
1654 }
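
/* Editorial sketch, not part of the original file: with
 * present = GENERAL | POSITION | PSIZ | PNTCOORD = 0b1111, PNTCOORD (bit 3)
 * lands at index popcount(0b1111 & 0b0111) = 3; drop PSIZ (present = 0b1011)
 * and PNTCOORD compacts down to popcount(0b1011 & 0b0111) = 2. A minimal
 * illustrative check (hypothetical, not used by the driver): */

static inline bool
pan_example_pntcoord_index_is_2(void)
{
        unsigned present = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION) |
                           (1 << PAN_VARY_PNTCOORD);

        /* With PSIZ absent, PNTCOORD compacts down to buffer index 2 */
        return pan_varying_index(present, PAN_VARY_PNTCOORD) == 2;
}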
1655
1656 /* Get the base offset for XFB buffers, which by convention come after
1657 * everything else. Wrapper function for semantic reasons; by construction this
1658 * is just popcount. */
1659
1660 static inline unsigned
1661 pan_xfb_base(unsigned present)
1662 {
1663 return util_bitcount(present);
1664 }
1665
1666 /* Computes the present mask for varyings so we can start emitting varying records */
1667
1668 static inline unsigned
1669 pan_varying_present(
1670 struct panfrost_shader_state *vs,
1671 struct panfrost_shader_state *fs,
1672 unsigned quirks)
1673 {
1674 /* At the moment we always emit general and position buffers. Not
1675 * strictly necessary but usually harmless */
1676
1677 unsigned present = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION);
1678
1679 /* Enable special buffers by the shader info */
1680
1681 if (vs->writes_point_size)
1682 present |= (1 << PAN_VARY_PSIZ);
1683
1684 if (fs->reads_point_coord)
1685 present |= (1 << PAN_VARY_PNTCOORD);
1686
1687 if (fs->reads_face)
1688 present |= (1 << PAN_VARY_FACE);
1689
1690 if (fs->reads_frag_coord && !(quirks & IS_BIFROST))
1691 present |= (1 << PAN_VARY_FRAGCOORD);
1692
1693 /* Also, if we have a point sprite, we need a point coord buffer */
1694
1695 for (unsigned i = 0; i < fs->varying_count; i++) {
1696 gl_varying_slot loc = fs->varyings_loc[i];
1697
1698 if (has_point_coord(fs->point_sprite_mask, loc))
1699 present |= (1 << PAN_VARY_PNTCOORD);
1700 }
1701
1702 return present;
1703 }
1704
1705 /* Emitters for varying records */
1706
1707 static void
1708 pan_emit_vary(struct mali_attribute_packed *out,
1709 unsigned present, enum pan_special_varying buf,
1710 unsigned quirks, enum mali_format format,
1711 unsigned offset)
1712 {
1713 unsigned nr_channels = MALI_EXTRACT_CHANNELS(format);
1714 unsigned swizzle = quirks & HAS_SWIZZLES ?
1715 panfrost_get_default_swizzle(nr_channels) :
1716 panfrost_bifrost_swizzle(nr_channels);
1717
1718 pan_pack(out, ATTRIBUTE, cfg) {
1719 cfg.buffer_index = pan_varying_index(present, buf);
1720 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1721 cfg.format = (format << 12) | swizzle;
1722 cfg.offset = offset;
1723 }
1724 }
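
/* Usage sketch (illustrative): the special-varying emitters below reduce to
 * calls of the form
 *
 *    pan_emit_vary(out, present, PAN_VARY_POSITION, quirks, MALI_VARYING_POS, 0);
 *
 * i.e. point the record at the buffer slot computed from the present mask,
 * with the format enum packed above the 12-bit swizzle field. */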
1725
1726 /* General varying that is unused */
1727
1728 static void
1729 pan_emit_vary_only(struct mali_attribute_packed *out,
1730 unsigned present, unsigned quirks)
1731 {
1732         pan_emit_vary(out, present, PAN_VARY_GENERAL, quirks, MALI_VARYING_DISCARD, 0);
1733 }
1734
1735 /* Special records */
1736
1737 static const enum mali_format pan_varying_formats[PAN_VARY_MAX] = {
1738 [PAN_VARY_POSITION] = MALI_VARYING_POS,
1739 [PAN_VARY_PSIZ] = MALI_R16F,
1740 [PAN_VARY_PNTCOORD] = MALI_R16F,
1741 [PAN_VARY_FACE] = MALI_R32I,
1742 [PAN_VARY_FRAGCOORD] = MALI_RGBA32F
1743 };
1744
1745 static void
1746 pan_emit_vary_special(struct mali_attribute_packed *out,
1747 unsigned present, enum pan_special_varying buf,
1748 unsigned quirks)
1749 {
1750 assert(buf < PAN_VARY_MAX);
1751 pan_emit_vary(out, present, buf, quirks, pan_varying_formats[buf], 0);
1752 }
1753
1754 static enum mali_format
1755 pan_xfb_format(enum mali_format format, unsigned nr)
1756 {
1757 if (MALI_EXTRACT_BITS(format) == MALI_CHANNEL_FLOAT)
1758 return MALI_R32F | MALI_NR_CHANNELS(nr);
1759 else
1760 return MALI_EXTRACT_TYPE(format) | MALI_NR_CHANNELS(nr) | MALI_CHANNEL_32;
1761 }
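
/* Example (illustrative): a half-float two-component varying captured with
 * num_components = 2 is promoted to MALI_R32F | MALI_NR_CHANNELS(2), i.e. two
 * full fp32 channels, since transform feedback always writes 32-bit channels;
 * non-float formats keep their type but are widened via MALI_CHANNEL_32. */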
1762
1763 /* Transform feedback records. Note struct pipe_stream_output is (if packed as
1764 * a bitfield) 32-bit, smaller than a 64-bit pointer, so may as well pass by
1765 * value. */
1766
1767 static void
1768 pan_emit_vary_xfb(struct mali_attribute_packed *out,
1769 unsigned present,
1770 unsigned max_xfb,
1771 unsigned *streamout_offsets,
1772 unsigned quirks,
1773 enum mali_format format,
1774 struct pipe_stream_output o)
1775 {
1776 unsigned swizzle = quirks & HAS_SWIZZLES ?
1777 panfrost_get_default_swizzle(o.num_components) :
1778 panfrost_bifrost_swizzle(o.num_components);
1779
1780 pan_pack(out, ATTRIBUTE, cfg) {
1781 /* XFB buffers come after everything else */
1782 cfg.buffer_index = pan_xfb_base(present) + o.output_buffer;
1783 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1784
1785 /* Override number of channels and precision to highp */
1786 cfg.format = (pan_xfb_format(format, o.num_components) << 12) | swizzle;
1787
1788                 /* Add the dword destination offset to the streamout buffer's byte offset */
1789 cfg.offset = (o.dst_offset * 4) /* dwords */
1790 + streamout_offsets[o.output_buffer];
1791 }
1792 }
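
/* Offset arithmetic example (numbers illustrative): an output with
 * dst_offset = 3 dwords, landing in a streamout buffer whose accumulated
 * offset is 256 bytes, gets cfg.offset = 3 * 4 + 256 = 268 bytes. */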
1793
1794 /* Determine if we should capture a varying for XFB. This requires actually
1795  * having a buffer for it. If we don't capture it, we'll fall back to a general
1796 * varying path (linked or unlinked, possibly discarding the write) */
1797
1798 static bool
1799 panfrost_xfb_captured(struct panfrost_shader_state *xfb,
1800 unsigned loc, unsigned max_xfb)
1801 {
1802         if (!(xfb->so_mask & (1ull << loc)))
1803 return false;
1804
1805 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1806 return o->output_buffer < max_xfb;
1807 }
1808
1809 static void
1810 pan_emit_general_varying(struct mali_attribute_packed *out,
1811 struct panfrost_shader_state *other,
1812 struct panfrost_shader_state *xfb,
1813 gl_varying_slot loc,
1814 enum mali_format format,
1815 unsigned present,
1816 unsigned quirks,
1817 unsigned *gen_offsets,
1818 enum mali_format *gen_formats,
1819 unsigned *gen_stride,
1820 unsigned idx,
1821 bool should_alloc)
1822 {
1823 /* Check if we're linked */
1824 signed other_idx = -1;
1825
1826 for (unsigned j = 0; j < other->varying_count; ++j) {
1827 if (other->varyings_loc[j] == loc) {
1828 other_idx = j;
1829 break;
1830 }
1831 }
1832
1833 if (other_idx < 0) {
1834 pan_emit_vary_only(out, present, quirks);
1835 return;
1836 }
1837
1838 unsigned offset = gen_offsets[other_idx];
1839
1840 if (should_alloc) {
1841                 /* We're linked, so allocate space via a watermark allocation */
1842 enum mali_format alt = other->varyings[other_idx];
1843
1844 /* Do interpolation at minimum precision */
1845 unsigned size_main = pan_varying_size(format);
1846 unsigned size_alt = pan_varying_size(alt);
1847 unsigned size = MIN2(size_main, size_alt);
1848
1849 /* If a varying is marked for XFB but not actually captured, we
1850 * should match the format to the format that would otherwise
1851 * be used for XFB, since dEQP checks for invariance here. It's
1852 * unclear if this is required by the spec. */
1853
1854 if (xfb->so_mask & (1ull << loc)) {
1855 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1856 format = pan_xfb_format(format, o->num_components);
1857 size = pan_varying_size(format);
1858 } else if (size == size_alt) {
1859 format = alt;
1860 }
1861
1862 gen_offsets[idx] = *gen_stride;
1863 gen_formats[other_idx] = format;
1864 offset = *gen_stride;
1865 *gen_stride += size;
1866 }
1867
1868 pan_emit_vary(out, present, PAN_VARY_GENERAL, quirks, format, offset);
1869 }
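
/* Linkage example (illustrative, assuming the varying is not captured for
 * XFB): if this stage declares a varying as MALI_RGBA32F (16 bytes) but the
 * other stage declares it MALI_RGBA16F (8 bytes), interpolation happens at
 * the smaller size, the fp16 format is adopted, and the watermark
 * (*gen_stride) advances by 8 bytes; the next linked varying starts there. */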
1870
1871 /* Higher-level wrapper around all of the above, classifying a varying as a
1872  * special, an XFB capture, or a general (linked or unlinked) varying */
1873
1874 static void
1875 panfrost_emit_varying(
1876 struct mali_attribute_packed *out,
1877 struct panfrost_shader_state *stage,
1878 struct panfrost_shader_state *other,
1879 struct panfrost_shader_state *xfb,
1880 unsigned present,
1881 unsigned max_xfb,
1882 unsigned *streamout_offsets,
1883 unsigned quirks,
1884 unsigned *gen_offsets,
1885 enum mali_format *gen_formats,
1886 unsigned *gen_stride,
1887 unsigned idx,
1888 bool should_alloc,
1889 bool is_fragment)
1890 {
1891 gl_varying_slot loc = stage->varyings_loc[idx];
1892 enum mali_format format = stage->varyings[idx];
1893
1894 /* Override format to match linkage */
1895 if (!should_alloc && gen_formats[idx])
1896 format = gen_formats[idx];
1897
1898 if (has_point_coord(stage->point_sprite_mask, loc)) {
1899 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1900 } else if (panfrost_xfb_captured(xfb, loc, max_xfb)) {
1901 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1902 pan_emit_vary_xfb(out, present, max_xfb, streamout_offsets, quirks, format, *o);
1903 } else if (loc == VARYING_SLOT_POS) {
1904 if (is_fragment)
1905 pan_emit_vary_special(out, present, PAN_VARY_FRAGCOORD, quirks);
1906 else
1907 pan_emit_vary_special(out, present, PAN_VARY_POSITION, quirks);
1908 } else if (loc == VARYING_SLOT_PSIZ) {
1909 pan_emit_vary_special(out, present, PAN_VARY_PSIZ, quirks);
1910 } else if (loc == VARYING_SLOT_PNTC) {
1911 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1912 } else if (loc == VARYING_SLOT_FACE) {
1913 pan_emit_vary_special(out, present, PAN_VARY_FACE, quirks);
1914 } else {
1915 pan_emit_general_varying(out, other, xfb, loc, format, present,
1916 quirks, gen_offsets, gen_formats, gen_stride,
1917 idx, should_alloc);
1918 }
1919 }
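
/* Note the precedence implied by the chain above: a point-sprite override
 * wins over XFB capture, which wins over the named specials (position, point
 * size, point coord, face), with the general linked/unlinked path as the
 * fallback. */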
1920
1921 static void
1922 pan_emit_special_input(struct mali_attribute_buffer_packed *out,
1923 unsigned present,
1924 enum pan_special_varying v,
1925 unsigned special)
1926 {
1927 if (present & (1 << v)) {
1928 unsigned idx = pan_varying_index(present, v);
1929
1930 pan_pack(out + idx, ATTRIBUTE_BUFFER, cfg) {
1931 cfg.special = special;
1932 cfg.type = 0;
1933 }
1934 }
1935 }
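
/* E.g. when PAN_VARY_PNTCOORD is present, the attribute buffer at its
 * computed slot is tagged MALI_ATTRIBUTE_SPECIAL_POINT_COORD, so the value is
 * (presumably) synthesized by the hardware rather than fetched from memory. */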
1936
1937 void
1938 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1939 unsigned vertex_count,
1940 struct mali_vertex_tiler_postfix *vertex_postfix,
1941 struct mali_vertex_tiler_postfix *tiler_postfix,
1942 union midgard_primitive_size *primitive_size)
1943 {
1944 /* Load the shaders */
1945 struct panfrost_context *ctx = batch->ctx;
1946 struct panfrost_device *dev = pan_device(ctx->base.screen);
1947 struct panfrost_shader_state *vs, *fs;
1948 size_t vs_size, fs_size;
1949
1950 /* Allocate the varying descriptor */
1951
1952 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1953 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1954 vs_size = MALI_ATTRIBUTE_LENGTH * vs->varying_count;
1955 fs_size = MALI_ATTRIBUTE_LENGTH * fs->varying_count;
1956
1957 struct panfrost_transfer trans = panfrost_pool_alloc(&batch->pool,
1958 vs_size +
1959 fs_size);
1960
1961 struct pipe_stream_output_info *so = &vs->stream_output;
1962 unsigned present = pan_varying_present(vs, fs, dev->quirks);
1963
1964 /* Check if this varying is linked by us. This is the case for
1965 * general-purpose, non-captured varyings. If it is, link it. If it's
1966 * not, use the provided stream out information to determine the
1967 * offset, since it was already linked for us. */
1968
1969 unsigned gen_offsets[32];
1970 enum mali_format gen_formats[32];
1971 memset(gen_offsets, 0, sizeof(gen_offsets));
1972 memset(gen_formats, 0, sizeof(gen_formats));
1973
1974 unsigned gen_stride = 0;
1975 assert(vs->varying_count < ARRAY_SIZE(gen_offsets));
1976 assert(fs->varying_count < ARRAY_SIZE(gen_offsets));
1977
1978 unsigned streamout_offsets[32];
1979
1980 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1981 streamout_offsets[i] = panfrost_streamout_offset(
1982 so->stride[i],
1983 ctx->streamout.offsets[i],
1984 ctx->streamout.targets[i]);
1985 }
1986
1987 struct mali_attribute_packed *ovs = (struct mali_attribute_packed *)trans.cpu;
1988 struct mali_attribute_packed *ofs = ovs + vs->varying_count;
1989
1990 for (unsigned i = 0; i < vs->varying_count; i++) {
1991 panfrost_emit_varying(ovs + i, vs, fs, vs, present,
1992 ctx->streamout.num_targets, streamout_offsets,
1993 dev->quirks,
1994 gen_offsets, gen_formats, &gen_stride, i, true, false);
1995 }
1996
1997 for (unsigned i = 0; i < fs->varying_count; i++) {
1998 panfrost_emit_varying(ofs + i, fs, vs, vs, present,
1999 ctx->streamout.num_targets, streamout_offsets,
2000 dev->quirks,
2001 gen_offsets, gen_formats, &gen_stride, i, false, true);
2002 }
2003
2004 unsigned xfb_base = pan_xfb_base(present);
2005 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool,
2006 MALI_ATTRIBUTE_BUFFER_LENGTH * (xfb_base + ctx->streamout.num_targets));
2007 struct mali_attribute_buffer_packed *varyings =
2008 (struct mali_attribute_buffer_packed *) T.cpu;
2009
2010 /* Emit the stream out buffers */
2011
2012 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
2013 ctx->vertex_count);
2014
2015 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
2016 panfrost_emit_streamout(batch, &varyings[xfb_base + i],
2017 so->stride[i],
2018 ctx->streamout.offsets[i],
2019 out_count,
2020 ctx->streamout.targets[i]);
2021 }
2022
2023 panfrost_emit_varyings(batch,
2024 &varyings[pan_varying_index(present, PAN_VARY_GENERAL)],
2025 gen_stride, vertex_count);
2026
2027 /* fp32 vec4 gl_Position */
2028 tiler_postfix->position_varying = panfrost_emit_varyings(batch,
2029 &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
2030 sizeof(float) * 4, vertex_count);
2031
2032 if (present & (1 << PAN_VARY_PSIZ)) {
2033 primitive_size->pointer = panfrost_emit_varyings(batch,
2034 &varyings[pan_varying_index(present, PAN_VARY_PSIZ)],
2035 2, vertex_count);
2036 }
2037
2038 pan_emit_special_input(varyings, present, PAN_VARY_PNTCOORD, MALI_ATTRIBUTE_SPECIAL_POINT_COORD);
2039 pan_emit_special_input(varyings, present, PAN_VARY_FACE, MALI_ATTRIBUTE_SPECIAL_FRONT_FACING);
2040 pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD, MALI_ATTRIBUTE_SPECIAL_FRAG_COORD);
2041
2042 vertex_postfix->varyings = T.gpu;
2043 tiler_postfix->varyings = T.gpu;
2044
2045 vertex_postfix->varying_meta = trans.gpu;
2046 tiler_postfix->varying_meta = trans.gpu + vs_size;
2047 }
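
/* Resulting layout (as constructed above): 'trans' packs vs->varying_count
 * ATTRIBUTE records followed by fs->varying_count records, referenced by the
 * two varying_meta pointers; 'T' packs popcount(present) ATTRIBUTE_BUFFER
 * records followed by one per streamout target, and is shared by both
 * postfixes through 'varyings'. */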
2048
2049 void
2050 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
2051 struct mali_vertex_tiler_prefix *vertex_prefix,
2052 struct mali_vertex_tiler_postfix *vertex_postfix,
2053 struct mali_vertex_tiler_prefix *tiler_prefix,
2054 struct mali_vertex_tiler_postfix *tiler_postfix,
2055 union midgard_primitive_size *primitive_size)
2056 {
2057 struct panfrost_context *ctx = batch->ctx;
2058 struct panfrost_device *device = pan_device(ctx->base.screen);
2059 bool wallpapering = ctx->wallpaper_batch && batch->scoreboard.tiler_dep;
2060 struct bifrost_payload_vertex bifrost_vertex = {0,};
2061 struct bifrost_payload_tiler bifrost_tiler = {0,};
2062 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
2063 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
2064 void *vp, *tp;
2065 size_t vp_size, tp_size;
2066
2067 if (device->quirks & IS_BIFROST) {
2068 bifrost_vertex.prefix = *vertex_prefix;
2069 bifrost_vertex.postfix = *vertex_postfix;
2070 vp = &bifrost_vertex;
2071 vp_size = sizeof(bifrost_vertex);
2072
2073 bifrost_tiler.prefix = *tiler_prefix;
2074 bifrost_tiler.tiler.primitive_size = *primitive_size;
2075 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
2076 bifrost_tiler.postfix = *tiler_postfix;
2077 tp = &bifrost_tiler;
2078 tp_size = sizeof(bifrost_tiler);
2079 } else {
2080 midgard_vertex.prefix = *vertex_prefix;
2081 midgard_vertex.postfix = *vertex_postfix;
2082 vp = &midgard_vertex;
2083 vp_size = sizeof(midgard_vertex);
2084
2085 midgard_tiler.prefix = *tiler_prefix;
2086 midgard_tiler.postfix = *tiler_postfix;
2087 midgard_tiler.primitive_size = *primitive_size;
2088 tp = &midgard_tiler;
2089 tp_size = sizeof(midgard_tiler);
2090 }
2091
2092 if (wallpapering) {
2093 /* Inject in reverse order, with "predicted" job indices.
2094 * THIS IS A HACK XXX */
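                /* (Presumably the "prediction" is that the tiler, injected
                 * first, must depend on the vertex job injected right after
                 * it, which will receive an index two past the current
                 * scoreboard job_index.) */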
2095 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false,
2096 batch->scoreboard.job_index + 2, tp, tp_size, true);
2097 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2098 vp, vp_size, true);
2099 return;
2100 }
2101
2102         /* If rasterizer discard is enabled, only submit the vertex job */
2103
2104 bool rasterizer_discard = ctx->rasterizer &&
2105 ctx->rasterizer->base.rasterizer_discard;
2106
2107 unsigned vertex = panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2108 vp, vp_size, false);
2109
2110 if (rasterizer_discard)
2111 return;
2112
2113 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false, vertex, tp, tp_size,
2114 false);
2115 }
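
/* In the common path above, the vertex job is created first and its returned
 * index is passed as the tiler job's dependency, so tiling never starts before
 * shading; with rasterizer discard the tiler job is simply never created. */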
2116
2117 /* TODO: stop hardcoding this */
2118 mali_ptr
2119 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2120 {
2121 uint16_t locations[] = {
2122 128, 128,
2123 0, 256,
2124 0, 256,
2125 0, 256,
2126 0, 256,
2127 0, 256,
2128 0, 256,
2129 0, 256,
2130 0, 256,
2131 0, 256,
2132 0, 256,
2133 0, 256,
2134 0, 256,
2135 0, 256,
2136 0, 256,
2137 0, 256,
2138 0, 256,
2139 0, 256,
2140 0, 256,
2141 0, 256,
2142 0, 256,
2143 0, 256,
2144 0, 256,
2145 0, 256,
2146 0, 256,
2147 0, 256,
2148 0, 256,
2149 0, 256,
2150 0, 256,
2151 0, 256,
2152 0, 256,
2153 0, 256,
2154 128, 128,
2155 0, 0,
2156 0, 0,
2157 0, 0,
2158 0, 0,
2159 0, 0,
2160 0, 0,
2161 0, 0,
2162 0, 0,
2163 0, 0,
2164 0, 0,
2165 0, 0,
2166 0, 0,
2167 0, 0,
2168 0, 0,
2169 0, 0,
2170 };
2171
2172 return panfrost_pool_upload(&batch->pool, locations, 96 * sizeof(uint16_t));
2173 }