panfrost: Set MALI_BIFROST_EARLY_Z as necessary
[mesa.git] src/gallium/drivers/panfrost/pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_allocate.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 unsigned shift = panfrost_get_stack_shift(batch->stack_size);
62 struct mali_shared_memory shared = {
63 .stack_shift = shift,
64 .scratchpad = panfrost_batch_get_scratchpad(batch, shift, dev->thread_tls_alloc, dev->core_count)->gpu,
65 .shared_workgroup_count = ~0,
66 };
67 postfix->shared_memory = panfrost_upload_transient(batch, &shared, sizeof(shared));
68 }
69
70 static void
71 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
72 struct mali_vertex_tiler_postfix *postfix)
73 {
74 struct panfrost_device *dev = pan_device(ctx->base.screen);
75 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
76
77 /* If we haven't already, reserve space for the framebuffer */
78
79 if (!batch->framebuffer.gpu) {
80 unsigned size = (dev->quirks & MIDGARD_SFBD) ?
81 sizeof(struct mali_single_framebuffer) :
82 sizeof(struct mali_framebuffer);
83
84 batch->framebuffer = panfrost_allocate_transient(batch, size);
85
86 /* Tag the pointer */
87 if (!(dev->quirks & MIDGARD_SFBD))
88 batch->framebuffer.gpu |= MALI_MFBD;
89 }
90
91 postfix->shared_memory = batch->framebuffer.gpu;
92 }
93
94 static void
95 panfrost_vt_update_rasterizer(struct panfrost_context *ctx,
96 struct mali_vertex_tiler_prefix *prefix,
97 struct mali_vertex_tiler_postfix *postfix)
98 {
99 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
100
101 postfix->gl_enables |= 0x7;
102 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
103 rasterizer && rasterizer->base.front_ccw);
104 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
105 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_FRONT));
106 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
107 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_BACK));
108 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
109 rasterizer && rasterizer->base.flatshade_first);
110 }
111
112 void
113 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
114 struct mali_vertex_tiler_prefix *prefix,
115 union midgard_primitive_size *primitive_size)
116 {
117 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
118
119 if (!panfrost_writes_point_size(ctx)) {
120 bool points = prefix->draw_mode == MALI_POINTS;
121 float val = 0.0f;
122
123 if (rasterizer)
124 val = points ?
125 rasterizer->base.point_size :
126 rasterizer->base.line_width;
127
128 primitive_size->constant = val;
129 }
130 }
131
132 static void
133 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
134 struct mali_vertex_tiler_postfix *postfix)
135 {
136 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
137 if (ctx->occlusion_query)
138 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
139 else
140 postfix->occlusion_counter = 0;
141 }
142
143 void
144 panfrost_vt_init(struct panfrost_context *ctx,
145 enum pipe_shader_type stage,
146 struct mali_vertex_tiler_prefix *prefix,
147 struct mali_vertex_tiler_postfix *postfix)
148 {
149 struct panfrost_device *device = pan_device(ctx->base.screen);
150
151 if (!ctx->shader[stage])
152 return;
153
154 memset(prefix, 0, sizeof(*prefix));
155 memset(postfix, 0, sizeof(*postfix));
156
157 if (device->quirks & IS_BIFROST) {
158 postfix->gl_enables = 0x2;
159 panfrost_vt_emit_shared_memory(ctx, postfix);
160 } else {
161 postfix->gl_enables = 0x6;
162 panfrost_vt_attach_framebuffer(ctx, postfix);
163 }
164
165 if (stage == PIPE_SHADER_FRAGMENT) {
166 panfrost_vt_update_occlusion_query(ctx, postfix);
167 panfrost_vt_update_rasterizer(ctx, prefix, postfix);
168 }
169 }
170
171 static unsigned
172 panfrost_translate_index_size(unsigned size)
173 {
174 switch (size) {
175 case 1:
176 return MALI_DRAW_INDEXED_UINT8;
177
178 case 2:
179 return MALI_DRAW_INDEXED_UINT16;
180
181 case 4:
182 return MALI_DRAW_INDEXED_UINT32;
183
184 default:
185 unreachable("Invalid index size");
186 }
187 }
188
189 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
190 * good for the duration of the draw (transient), though it could last longer. Also get
191 * the bounds on the index buffer for the range accessed by the draw. We do
192 * these operations together because there are natural optimizations which
193 * require them to be together. */
194
195 static mali_ptr
196 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
197 const struct pipe_draw_info *info,
198 unsigned *min_index, unsigned *max_index)
199 {
200 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
201 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
202 off_t offset = info->start * info->index_size;
203 bool needs_indices = true;
204 mali_ptr out = 0;
205
206 if (info->max_index != ~0u) {
207 *min_index = info->min_index;
208 *max_index = info->max_index;
209 needs_indices = false;
210 }
211
212 if (!info->has_user_indices) {
213 /* Only resources can be directly mapped */
214 panfrost_batch_add_bo(batch, rsrc->bo,
215 PAN_BO_ACCESS_SHARED |
216 PAN_BO_ACCESS_READ |
217 PAN_BO_ACCESS_VERTEX_TILER);
218 out = rsrc->bo->gpu + offset;
219
220 /* Check the cache */
221 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
222 info->start,
223 info->count,
224 min_index,
225 max_index);
226 } else {
227 /* Otherwise, we need to upload to transient memory */
228 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
229 out = panfrost_upload_transient(batch, ibuf8 + offset,
230 info->count *
231 info->index_size);
232 }
233
234 if (needs_indices) {
235 /* Fallback */
236 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
237
238 if (!info->has_user_indices)
239 panfrost_minmax_cache_add(rsrc->index_cache,
240 info->start, info->count,
241 *min_index, *max_index);
242 }
243
244 return out;
245 }
246
247 void
248 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
249 const struct pipe_draw_info *info,
250 enum mali_draw_mode draw_mode,
251 struct mali_vertex_tiler_postfix *vertex_postfix,
252 struct mali_vertex_tiler_prefix *tiler_prefix,
253 struct mali_vertex_tiler_postfix *tiler_postfix,
254 unsigned *vertex_count,
255 unsigned *padded_count)
256 {
257 tiler_prefix->draw_mode = draw_mode;
258
259 unsigned draw_flags = 0;
260
261 if (panfrost_writes_point_size(ctx))
262 draw_flags |= MALI_DRAW_VARYING_SIZE;
263
264 if (info->primitive_restart)
265 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
266
267 /* These don't make much sense */
268
269 draw_flags |= 0x3000;
270
271 if (info->index_size) {
272 unsigned min_index = 0, max_index = 0;
273
274 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
275 info,
276 &min_index,
277 &max_index);
278
279 /* Use the corresponding values */
280 *vertex_count = max_index - min_index + 1;
281 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
282 tiler_prefix->offset_bias_correction = -min_index;
283 tiler_prefix->index_count = MALI_POSITIVE(info->count);
284 draw_flags |= panfrost_translate_index_size(info->index_size);
285 } else {
286 tiler_prefix->indices = 0;
287 *vertex_count = ctx->vertex_count;
288 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
289 tiler_prefix->offset_bias_correction = 0;
290 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
291 }
292
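/* For illustration of the indexed path above: a draw whose indices span
 * [5, 7] with index_bias = 100 gives *vertex_count = 7 - 5 + 1 = 3,
 * offset_start = 5 + 100 = 105 and offset_bias_correction = -5, i.e.
 * attribute fetches end up relative to the smallest referenced index
 * rather than to index 0. (Worked example of the assignments above; the
 * "relative to min" reading is inferred from the math, not from hardware
 * documentation.) */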
293 tiler_prefix->unknown_draw = draw_flags;
294
295 /* Encode the padded vertex count */
296
297 if (info->instance_count > 1) {
298 *padded_count = panfrost_padded_vertex_count(*vertex_count);
299
300 unsigned shift = __builtin_ctz(ctx->padded_count);
301 unsigned k = ctx->padded_count >> (shift + 1);
302
303 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
304 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
305 } else {
306 *padded_count = *vertex_count;
307
308 /* Reset instancing state */
309 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
310 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
311 }
312 }
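/* For illustration: the (instance_shift, instance_odd) pair written above
 * encodes the padded count as an odd factor times a power of two,
 * padded == (2 * odd + 1) << shift. E.g. a padded count of 12 (0b1100)
 * gives shift = ctz(12) = 2 and odd = 12 >> 3 = 1, and indeed
 * (2 * 1 + 1) << 2 == 12. A minimal sketch of the inverse, for reference
 * only (hypothetical helper, not part of the driver): */

#if 0
static inline unsigned
pan_decode_padded_count(unsigned shift, unsigned odd)
{
        return (2 * odd + 1) << shift;
}
#endif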
313
314 static void
315 panfrost_shader_meta_init(struct panfrost_context *ctx,
316 enum pipe_shader_type st,
317 struct mali_shader_meta *meta)
318 {
319 const struct panfrost_device *dev = pan_device(ctx->base.screen);
320 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
321
322 memset(meta, 0, sizeof(*meta));
323 meta->shader = (ss->bo ? ss->bo->gpu : 0) | ss->first_tag;
324 meta->attribute_count = ss->attribute_count;
325 meta->varying_count = ss->varying_count;
326 meta->texture_count = ctx->sampler_view_count[st];
327 meta->sampler_count = ctx->sampler_count[st];
328
329 if (dev->quirks & IS_BIFROST) {
330 if (st == PIPE_SHADER_VERTEX)
331 meta->bifrost1.unk1 = 0x800000;
332 else {
333 /* First clause ATEST |= 0x4000000.
334 * Less than 32 regs |= 0x200 */
335 meta->bifrost1.unk1 = 0x950020;
336 }
337
338 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
339 if (st == PIPE_SHADER_VERTEX)
340 meta->bifrost2.preload_regs = 0xC0;
341 else
342 meta->bifrost2.preload_regs = 0x1;
343 meta->bifrost2.uniform_count = MIN2(ss->uniform_count,
344 ss->uniform_cutoff);
345 } else {
346 meta->midgard1.uniform_count = MIN2(ss->uniform_count,
347 ss->uniform_cutoff);
348 meta->midgard1.work_count = ss->work_reg_count;
349
350 /* TODO: This is not conformant on ES3 */
351 meta->midgard1.flags_hi = MALI_SUPPRESS_INF_NAN;
352
353 meta->midgard1.flags_lo = 0x220;
354 meta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
355 }
356 }
357
358 static unsigned
359 panfrost_translate_compare_func(enum pipe_compare_func in)
360 {
361 switch (in) {
362 case PIPE_FUNC_NEVER:
363 return MALI_FUNC_NEVER;
364
365 case PIPE_FUNC_LESS:
366 return MALI_FUNC_LESS;
367
368 case PIPE_FUNC_EQUAL:
369 return MALI_FUNC_EQUAL;
370
371 case PIPE_FUNC_LEQUAL:
372 return MALI_FUNC_LEQUAL;
373
374 case PIPE_FUNC_GREATER:
375 return MALI_FUNC_GREATER;
376
377 case PIPE_FUNC_NOTEQUAL:
378 return MALI_FUNC_NOTEQUAL;
379
380 case PIPE_FUNC_GEQUAL:
381 return MALI_FUNC_GEQUAL;
382
383 case PIPE_FUNC_ALWAYS:
384 return MALI_FUNC_ALWAYS;
385
386 default:
387 unreachable("Invalid func");
388 }
389 }
390
391 static unsigned
392 panfrost_translate_stencil_op(enum pipe_stencil_op in)
393 {
394 switch (in) {
395 case PIPE_STENCIL_OP_KEEP:
396 return MALI_STENCIL_KEEP;
397
398 case PIPE_STENCIL_OP_ZERO:
399 return MALI_STENCIL_ZERO;
400
401 case PIPE_STENCIL_OP_REPLACE:
402 return MALI_STENCIL_REPLACE;
403
404 case PIPE_STENCIL_OP_INCR:
405 return MALI_STENCIL_INCR;
406
407 case PIPE_STENCIL_OP_DECR:
408 return MALI_STENCIL_DECR;
409
410 case PIPE_STENCIL_OP_INCR_WRAP:
411 return MALI_STENCIL_INCR_WRAP;
412
413 case PIPE_STENCIL_OP_DECR_WRAP:
414 return MALI_STENCIL_DECR_WRAP;
415
416 case PIPE_STENCIL_OP_INVERT:
417 return MALI_STENCIL_INVERT;
418
419 default:
420 unreachable("Invalid stencil op");
421 }
422 }
423
424 static unsigned
425 translate_tex_wrap(enum pipe_tex_wrap w)
426 {
427 switch (w) {
428 case PIPE_TEX_WRAP_REPEAT:
429 return MALI_WRAP_REPEAT;
430
431 case PIPE_TEX_WRAP_CLAMP:
432 return MALI_WRAP_CLAMP;
433
434 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
435 return MALI_WRAP_CLAMP_TO_EDGE;
436
437 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
438 return MALI_WRAP_CLAMP_TO_BORDER;
439
440 case PIPE_TEX_WRAP_MIRROR_REPEAT:
441 return MALI_WRAP_MIRRORED_REPEAT;
442
443 case PIPE_TEX_WRAP_MIRROR_CLAMP:
444 return MALI_WRAP_MIRRORED_CLAMP;
445
446 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
447 return MALI_WRAP_MIRRORED_CLAMP_TO_EDGE;
448
449 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
450 return MALI_WRAP_MIRRORED_CLAMP_TO_BORDER;
451
452 default:
453 unreachable("Invalid wrap");
454 }
455 }
456
457 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
458 struct mali_sampler_descriptor *hw)
459 {
460 unsigned func = panfrost_translate_compare_func(cso->compare_func);
461 bool min_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
462 bool mag_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
463 bool mip_linear = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR;
464 unsigned min_filter = min_nearest ? MALI_SAMP_MIN_NEAREST : 0;
465 unsigned mag_filter = mag_nearest ? MALI_SAMP_MAG_NEAREST : 0;
466 unsigned mip_filter = mip_linear ?
467 (MALI_SAMP_MIP_LINEAR_1 | MALI_SAMP_MIP_LINEAR_2) : 0;
468 unsigned normalized = cso->normalized_coords ? MALI_SAMP_NORM_COORDS : 0;
469
470 *hw = (struct mali_sampler_descriptor) {
471 .filter_mode = min_filter | mag_filter | mip_filter |
472 normalized,
473 .wrap_s = translate_tex_wrap(cso->wrap_s),
474 .wrap_t = translate_tex_wrap(cso->wrap_t),
475 .wrap_r = translate_tex_wrap(cso->wrap_r),
476 .compare_func = panfrost_flip_compare_func(func),
477 .border_color = {
478 cso->border_color.f[0],
479 cso->border_color.f[1],
480 cso->border_color.f[2],
481 cso->border_color.f[3]
482 },
483 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
484 .max_lod = FIXED_16(cso->max_lod, false),
485 .lod_bias = FIXED_16(cso->lod_bias, true), /* can be negative */
486 .seamless_cube_map = cso->seamless_cube_map,
487 };
488
489 /* If necessary, we disable mipmapping in the sampler descriptor by
490 * clamping the LOD as tight as possible (from 0 to epsilon,
491 * essentially -- remember these are fixed point numbers, so
492 * epsilon=1/256) */
493
494 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
495 hw->max_lod = hw->min_lod + 1;
496 }
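/* For illustration: the epsilon above implies FIXED_16() keeps 8 fractional
 * bits, so one LSB is 1/256 of a LOD level. With mipmapping off and
 * min_lod = 0.0 the descriptor ends up with min_lod = 0 and max_lod = 1,
 * clamping the LOD to [0, 1/256] -- effectively "level 0 only". (Worked
 * example under that assumption about FIXED_16, not extra hardware
 * documentation.) */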
497
498 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
499 struct bifrost_sampler_descriptor *hw)
500 {
501 *hw = (struct bifrost_sampler_descriptor) {
502 .unk1 = 0x1,
503 .wrap_s = translate_tex_wrap(cso->wrap_s),
504 .wrap_t = translate_tex_wrap(cso->wrap_t),
505 .wrap_r = translate_tex_wrap(cso->wrap_r),
506 .unk8 = 0x8,
507 .min_filter = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST,
508 .norm_coords = cso->normalized_coords,
509 .mip_filter = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR,
510 .mag_filter = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR,
511 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
512 .max_lod = FIXED_16(cso->max_lod, false),
513 };
514
515 /* If necessary, we disable mipmapping in the sampler descriptor by
516 * clamping the LOD as tight as possible (from 0 to epsilon,
517 * essentially -- remember these are fixed point numbers, so
518 * epsilon=1/256) */
519
520 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
521 hw->max_lod = hw->min_lod + 1;
522 }
523
524 static void
525 panfrost_make_stencil_state(const struct pipe_stencil_state *in,
526 struct mali_stencil_test *out)
527 {
528 out->ref = 0; /* Gallium gets it from elsewhere */
529
530 out->mask = in->valuemask;
531 out->func = panfrost_translate_compare_func(in->func);
532 out->sfail = panfrost_translate_stencil_op(in->fail_op);
533 out->dpfail = panfrost_translate_stencil_op(in->zfail_op);
534 out->dppass = panfrost_translate_stencil_op(in->zpass_op);
535 }
536
537 static void
538 panfrost_frag_meta_rasterizer_update(struct panfrost_context *ctx,
539 struct mali_shader_meta *fragmeta)
540 {
541 if (!ctx->rasterizer) {
542 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, true);
543 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, false);
544 fragmeta->depth_units = 0.0f;
545 fragmeta->depth_factor = 0.0f;
546 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, false);
547 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, false);
548 return;
549 }
550
551 bool msaa = ctx->rasterizer->base.multisample;
552
553 /* TODO: Sample size */
554 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
555 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
556 fragmeta->depth_units = ctx->rasterizer->base.offset_units * 2.0f;
557 fragmeta->depth_factor = ctx->rasterizer->base.offset_scale;
558
559 /* XXX: Which bit is which? Does this maybe allow offsetting non-tris? */
560
561 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A,
562 ctx->rasterizer->base.offset_tri);
563 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B,
564 ctx->rasterizer->base.offset_tri);
565 }
566
567 static void
568 panfrost_frag_meta_zsa_update(struct panfrost_context *ctx,
569 struct mali_shader_meta *fragmeta)
570 {
571 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
572 int zfunc = PIPE_FUNC_ALWAYS;
573
574 if (!zsa) {
575 struct pipe_stencil_state default_stencil = {
576 .enabled = 0,
577 .func = PIPE_FUNC_ALWAYS,
578 .fail_op = MALI_STENCIL_KEEP,
579 .zfail_op = MALI_STENCIL_KEEP,
580 .zpass_op = MALI_STENCIL_KEEP,
581 .writemask = 0xFF,
582 .valuemask = 0xFF
583 };
584
585 panfrost_make_stencil_state(&default_stencil,
586 &fragmeta->stencil_front);
587 fragmeta->stencil_mask_front = default_stencil.writemask;
588 fragmeta->stencil_back = fragmeta->stencil_front;
589 fragmeta->stencil_mask_back = default_stencil.writemask;
590 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST, false);
591 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK, false);
592 } else {
593 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
594 zsa->stencil[0].enabled);
595 panfrost_make_stencil_state(&zsa->stencil[0],
596 &fragmeta->stencil_front);
597 fragmeta->stencil_mask_front = zsa->stencil[0].writemask;
598 fragmeta->stencil_front.ref = ctx->stencil_ref.ref_value[0];
599
600 /* If back-stencil is not enabled, use the front values */
601
602 if (zsa->stencil[1].enabled) {
603 panfrost_make_stencil_state(&zsa->stencil[1],
604 &fragmeta->stencil_back);
605 fragmeta->stencil_mask_back = zsa->stencil[1].writemask;
606 fragmeta->stencil_back.ref = ctx->stencil_ref.ref_value[1];
607 } else {
608 fragmeta->stencil_back = fragmeta->stencil_front;
609 fragmeta->stencil_mask_back = fragmeta->stencil_mask_front;
610 fragmeta->stencil_back.ref = fragmeta->stencil_front.ref;
611 }
612
613 if (zsa->depth.enabled)
614 zfunc = zsa->depth.func;
615
616 /* Depth state (TODO: Refactor) */
617
618 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
619 zsa->depth.writemask);
620 }
621
622 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
623 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(zfunc));
624 }
625
626 static bool
627 panfrost_fs_required(
628 struct panfrost_shader_state *fs,
629 struct panfrost_blend_final *blend,
630 unsigned rt_count)
631 {
632 /* If we generally have side effects */
633 if (fs->fs_sidefx)
634 return true;
635
636 /* If colour is written we need to execute */
637 for (unsigned i = 0; i < rt_count; ++i) {
638 if (!blend[i].no_colour)
639 return true;
640 }
641
642 /* If depth is written and not implied we need to execute.
643 * TODO: Predicate on Z/S writes being enabled */
644 return (fs->writes_depth || fs->writes_stencil);
645 }
646
647 static void
648 panfrost_frag_meta_blend_update(struct panfrost_context *ctx,
649 struct mali_shader_meta *fragmeta,
650 void *rts)
651 {
652 const struct panfrost_device *dev = pan_device(ctx->base.screen);
653 struct panfrost_shader_state *fs;
654 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
655
656 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
657 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
658 !ctx->blend->base.dither);
659
660 /* Get blending setup */
661 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
662
663 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
664 unsigned shader_offset = 0;
665 struct panfrost_bo *shader_bo = NULL;
666
667 for (unsigned c = 0; c < rt_count; ++c)
668 blend[c] = panfrost_get_blend_for_context(ctx, c, &shader_bo,
669 &shader_offset);
670
671 /* Disable shader execution if we can */
672 if (dev->quirks & MIDGARD_SHADERLESS
673 && !panfrost_fs_required(fs, blend, rt_count)) {
674 fragmeta->shader = 0;
675 fragmeta->attribute_count = 0;
676 fragmeta->varying_count = 0;
677 fragmeta->texture_count = 0;
678 fragmeta->sampler_count = 0;
679
680 /* This feature is not known to work on Bifrost */
681 fragmeta->midgard1.work_count = 1;
682 fragmeta->midgard1.uniform_count = 0;
683 fragmeta->midgard1.uniform_buffer_count = 0;
684 }
685
686 /* If there is a blend shader, work registers are shared. We impose 8
687 * work registers as a limit for blend shaders. Should be lower XXX */
688
689 if (!(dev->quirks & IS_BIFROST)) {
690 for (unsigned c = 0; c < rt_count; ++c) {
691 if (blend[c].is_shader) {
692 fragmeta->midgard1.work_count =
693 MAX2(fragmeta->midgard1.work_count, 8);
694 }
695 }
696 }
697
698 /* Even on MFBD, the shader descriptor gets blend shaders. It's *also*
699 * copied to the blend_meta appended (by convention), but this is the
700 * field actually read by the hardware. (Or maybe both are read...?).
701 * Specify the last RTi with a blend shader. */
702
703 fragmeta->blend.shader = 0;
704
705 for (signed rt = (rt_count - 1); rt >= 0; --rt) {
706 if (!blend[rt].is_shader)
707 continue;
708
709 fragmeta->blend.shader = blend[rt].shader.gpu |
710 blend[rt].shader.first_tag;
711 break;
712 }
713
714 if (dev->quirks & MIDGARD_SFBD) {
715 /* On platforms with only a single render target (SFBD), the blend
716 * information is inside the shader meta itself. We additionally
717 * need to signal CAN_DISCARD for nontrivial blend modes (so
718 * we're able to read back the destination buffer) */
719
720 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
721 blend[0].is_shader);
722
723 if (!blend[0].is_shader) {
724 fragmeta->blend.equation = *blend[0].equation.equation;
725 fragmeta->blend.constant = blend[0].equation.constant;
726 }
727
728 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
729 !blend[0].no_blending || fs->can_discard);
730 return;
731 }
732
733 if (dev->quirks & IS_BIFROST) {
734 bool no_blend = true;
735
736 for (unsigned i = 0; i < rt_count; ++i)
737 no_blend &= (blend[i].no_blending | blend[i].no_colour);
738
739 SET_BIT(fragmeta->bifrost1.unk1, MALI_BIFROST_EARLY_Z,
740 !fs->can_discard && !fs->writes_depth && no_blend);
741 }
742
743 /* Additional blend descriptor tacked on for jobs using MFBD */
744
745 for (unsigned i = 0; i < rt_count; ++i) {
746 unsigned flags = 0;
747
748 if (ctx->pipe_framebuffer.nr_cbufs > i && !blend[i].no_colour) {
749 flags = 0x200;
750
751 bool is_srgb = (ctx->pipe_framebuffer.nr_cbufs > i) &&
752 (ctx->pipe_framebuffer.cbufs[i]) &&
753 util_format_is_srgb(ctx->pipe_framebuffer.cbufs[i]->format);
754
755 SET_BIT(flags, MALI_BLEND_MRT_SHADER, blend[i].is_shader);
756 SET_BIT(flags, MALI_BLEND_LOAD_TIB, !blend[i].no_blending);
757 SET_BIT(flags, MALI_BLEND_SRGB, is_srgb);
758 SET_BIT(flags, MALI_BLEND_NO_DITHER, !ctx->blend->base.dither);
759 }
760
761 if (dev->quirks & IS_BIFROST) {
762 struct bifrost_blend_rt *brts = rts;
763
764 brts[i].flags = flags;
765
766 if (blend[i].is_shader) {
767 /* The blend shader's address needs to be at
768 * the same top 32 bit as the fragment shader.
769 * TODO: Ensure that's always the case.
770 */
771 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
772 (fs->bo->gpu & (0xffffffffull << 32)));
773 brts[i].shader = blend[i].shader.gpu;
774 brts[i].unk2 = 0x0;
775 } else if (ctx->pipe_framebuffer.nr_cbufs > i) {
776 enum pipe_format format = ctx->pipe_framebuffer.cbufs[i]->format;
777 const struct util_format_description *format_desc;
778 format_desc = util_format_description(format);
779
780 brts[i].equation = *blend[i].equation.equation;
781
782 /* TODO: this is a bit more complicated */
783 brts[i].constant = blend[i].equation.constant;
784
785 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
786
787 /* 0x19 disables blending and forces REPLACE
788 * mode (equivalent to rgb_mode = alpha_mode =
789 * x122, colour mask = 0xF). 0x1a allows
790 * blending. */
791 brts[i].unk2 = blend[i].no_blending ? 0x19 : 0x1a;
792
793 brts[i].shader_type = fs->blend_types[i];
794 } else {
795 /* Dummy attachment for depth-only */
796 brts[i].unk2 = 0x3;
797 brts[i].shader_type = fs->blend_types[i];
798 }
799 } else {
800 struct midgard_blend_rt *mrts = rts;
801 mrts[i].flags = flags;
802
803 if (blend[i].is_shader) {
804 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
805 } else {
806 mrts[i].blend.equation = *blend[i].equation.equation;
807 mrts[i].blend.constant = blend[i].equation.constant;
808 }
809 }
810 }
811 }
812
813 static void
814 panfrost_frag_shader_meta_init(struct panfrost_context *ctx,
815 struct mali_shader_meta *fragmeta,
816 void *rts)
817 {
818 const struct panfrost_device *dev = pan_device(ctx->base.screen);
819 struct panfrost_shader_state *fs;
820
821 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
822
823 fragmeta->alpha_coverage = ~MALI_ALPHA_COVERAGE(0.000000);
824 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x3010;
825 fragmeta->unknown2_4 = 0x4e0;
826
827 /* unknown2_4 has the 0x10 bit set on T6XX and T720. We don't know why this
828 * is required (independent of 32-bit/64-bit descriptors), or why it's
829 * not used on later GPU revisions. Otherwise, all shader jobs fault on
830 * these earlier chips (perhaps this is a chicken bit of some kind).
831 * More investigation is needed. */
832
833 SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
834
835 if (dev->quirks & IS_BIFROST) {
836 /* TODO */
837 } else {
838 /* Depending on whether it's legal in the given shader, we try to
839 * enable early-z testing (or forward-pixel kill?) */
840
841 SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
842 !fs->can_discard && !fs->writes_depth);
843
844 /* Add the writes Z/S flags if needed. */
845 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
846 SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
847
848 /* Any time texturing is used, derivatives are implicitly calculated,
849 * so we need to enable helper invocations */
850
851 SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
852 fs->helper_invocations);
853
854 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
855
856 bool depth_enabled = fs->writes_depth ||
857 (zsa && zsa->depth.enabled && zsa->depth.func != PIPE_FUNC_ALWAYS);
858
859 SET_BIT(fragmeta->midgard1.flags_lo, 0x400, !depth_enabled && fs->can_discard);
860 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_ZS, depth_enabled && fs->can_discard);
861 }
862
863 panfrost_frag_meta_rasterizer_update(ctx, fragmeta);
864 panfrost_frag_meta_zsa_update(ctx, fragmeta);
865 panfrost_frag_meta_blend_update(ctx, fragmeta, rts);
866 }
867
868 void
869 panfrost_emit_shader_meta(struct panfrost_batch *batch,
870 enum pipe_shader_type st,
871 struct mali_vertex_tiler_postfix *postfix)
872 {
873 struct panfrost_context *ctx = batch->ctx;
874 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
875
876 if (!ss) {
877 postfix->shader = 0;
878 return;
879 }
880
881 struct mali_shader_meta meta;
882
883 panfrost_shader_meta_init(ctx, st, &meta);
884
885 /* Add the shader BO to the batch. */
886 panfrost_batch_add_bo(batch, ss->bo,
887 PAN_BO_ACCESS_PRIVATE |
888 PAN_BO_ACCESS_READ |
889 panfrost_bo_access_for_stage(st));
890
891 mali_ptr shader_ptr;
892
893 if (st == PIPE_SHADER_FRAGMENT) {
894 struct panfrost_device *dev = pan_device(ctx->base.screen);
895 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
896 size_t desc_size = sizeof(meta);
897 void *rts = NULL;
898 struct panfrost_transfer xfer;
899 unsigned rt_size;
900
901 if (dev->quirks & MIDGARD_SFBD)
902 rt_size = 0;
903 else if (dev->quirks & IS_BIFROST)
904 rt_size = sizeof(struct bifrost_blend_rt);
905 else
906 rt_size = sizeof(struct midgard_blend_rt);
907
908 desc_size += rt_size * rt_count;
909
910 if (rt_size)
911 rts = rzalloc_size(ctx, rt_size * rt_count);
912
913 panfrost_frag_shader_meta_init(ctx, &meta, rts);
914
915 xfer = panfrost_allocate_transient(batch, desc_size);
916
917 memcpy(xfer.cpu, &meta, sizeof(meta));
918 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
919
920 if (rt_size)
921 ralloc_free(rts);
922
923 shader_ptr = xfer.gpu;
924 } else {
925 shader_ptr = panfrost_upload_transient(batch, &meta,
926 sizeof(meta));
927 }
928
929 postfix->shader = shader_ptr;
930 }
931
932 static void
933 panfrost_mali_viewport_init(struct panfrost_context *ctx,
934 struct mali_viewport *mvp)
935 {
936 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
937
938 /* Clip bounds are encoded as floats. The viewport itself is encoded as
939 * (somewhat) asymmetric ints. */
940
941 const struct pipe_scissor_state *ss = &ctx->scissor;
942
943 memset(mvp, 0, sizeof(*mvp));
944
945 /* By default, do no viewport clipping, i.e. clip to (-inf, inf) in
946 * each direction. Clipping to the viewport in theory should work, but
947 * in practice causes issues when we're not explicitly trying to
948 * scissor */
949
950 *mvp = (struct mali_viewport) {
951 .clip_minx = -INFINITY,
952 .clip_miny = -INFINITY,
953 .clip_maxx = INFINITY,
954 .clip_maxy = INFINITY,
955 };
956
957 /* Always scissor to the viewport by default. */
958 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
959 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
960
961 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
962 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
963
964 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
965 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
966
967 /* Apply the scissor test */
968
969 unsigned minx, miny, maxx, maxy;
970
971 if (ss && ctx->rasterizer && ctx->rasterizer->base.scissor) {
972 minx = MAX2(ss->minx, vp_minx);
973 miny = MAX2(ss->miny, vp_miny);
974 maxx = MIN2(ss->maxx, vp_maxx);
975 maxy = MIN2(ss->maxy, vp_maxy);
976 } else {
977 minx = vp_minx;
978 miny = vp_miny;
979 maxx = vp_maxx;
980 maxy = vp_maxy;
981 }
982
983 /* Hardware needs the min/max to be strictly ordered, so flip if we
984 * need to. The viewport transformation in the vertex shader will
985 * handle the negatives if we don't */
986
987 if (miny > maxy) {
988 unsigned temp = miny;
989 miny = maxy;
990 maxy = temp;
991 }
992
993 if (minx > maxx) {
994 unsigned temp = minx;
995 minx = maxx;
996 maxx = temp;
997 }
998
999 if (minz > maxz) {
1000 float temp = minz;
1001 minz = maxz;
1002 maxz = temp;
1003 }
1004
1005 /* Clamp to the framebuffer size as a last check */
1006
1007 minx = MIN2(ctx->pipe_framebuffer.width, minx);
1008 maxx = MIN2(ctx->pipe_framebuffer.width, maxx);
1009
1010 miny = MIN2(ctx->pipe_framebuffer.height, miny);
1011 maxy = MIN2(ctx->pipe_framebuffer.height, maxy);
1012
1013 /* Upload */
1014
1015 mvp->viewport0[0] = minx;
1016 mvp->viewport1[0] = MALI_POSITIVE(maxx);
1017
1018 mvp->viewport0[1] = miny;
1019 mvp->viewport1[1] = MALI_POSITIVE(maxy);
1020
1021 mvp->clip_minz = minz;
1022 mvp->clip_maxz = maxz;
1023 }
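/* For illustration: viewport0 holds the inclusive minimum and viewport1
 * what appears to be the inclusive maximum, i.e. MALI_POSITIVE() storing
 * (max - 1) -- note how panfrost_emit_viewport() below adds 1 back when
 * unioning the batch scissor. A full 1920x1080 viewport with scissoring
 * disabled would therefore give viewport0 = { 0, 0 } and
 * viewport1 = { 1919, 1079 }. (Worked example under that reading.) */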
1024
1025 void
1026 panfrost_emit_viewport(struct panfrost_batch *batch,
1027 struct mali_vertex_tiler_postfix *tiler_postfix)
1028 {
1029 struct panfrost_context *ctx = batch->ctx;
1030 struct mali_viewport mvp;
1031
1032 panfrost_mali_viewport_init(batch->ctx, &mvp);
1033
1034 /* Update the job, unless we're doing wallpapering (whose lack of
1035 * scissor we can ignore, since if we "miss" a tile of wallpaper, it'll
1036 * just... be faster :) */
1037
1038 if (!ctx->wallpaper_batch)
1039 panfrost_batch_union_scissor(batch, mvp.viewport0[0],
1040 mvp.viewport0[1],
1041 mvp.viewport1[0] + 1,
1042 mvp.viewport1[1] + 1);
1043
1044 tiler_postfix->viewport = panfrost_upload_transient(batch, &mvp,
1045 sizeof(mvp));
1046 }
1047
1048 static mali_ptr
1049 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
1050 enum pipe_shader_type st,
1051 struct panfrost_constant_buffer *buf,
1052 unsigned index)
1053 {
1054 struct pipe_constant_buffer *cb = &buf->cb[index];
1055 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1056
1057 if (rsrc) {
1058 panfrost_batch_add_bo(batch, rsrc->bo,
1059 PAN_BO_ACCESS_SHARED |
1060 PAN_BO_ACCESS_READ |
1061 panfrost_bo_access_for_stage(st));
1062
1063 /* Alignment guaranteed by
1064 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
1065 return rsrc->bo->gpu + cb->buffer_offset;
1066 } else if (cb->user_buffer) {
1067 return panfrost_upload_transient(batch,
1068 cb->user_buffer +
1069 cb->buffer_offset,
1070 cb->buffer_size);
1071 } else {
1072 unreachable("No constant buffer");
1073 }
1074 }
1075
1076 struct sysval_uniform {
1077 union {
1078 float f[4];
1079 int32_t i[4];
1080 uint32_t u[4];
1081 uint64_t du[2];
1082 };
1083 };
1084
1085 static void
1086 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
1087 struct sysval_uniform *uniform)
1088 {
1089 struct panfrost_context *ctx = batch->ctx;
1090 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1091
1092 uniform->f[0] = vp->scale[0];
1093 uniform->f[1] = vp->scale[1];
1094 uniform->f[2] = vp->scale[2];
1095 }
1096
1097 static void
1098 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
1099 struct sysval_uniform *uniform)
1100 {
1101 struct panfrost_context *ctx = batch->ctx;
1102 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1103
1104 uniform->f[0] = vp->translate[0];
1105 uniform->f[1] = vp->translate[1];
1106 uniform->f[2] = vp->translate[2];
1107 }
1108
1109 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
1110 enum pipe_shader_type st,
1111 unsigned int sysvalid,
1112 struct sysval_uniform *uniform)
1113 {
1114 struct panfrost_context *ctx = batch->ctx;
1115 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
1116 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
1117 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
1118 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
1119
1120 assert(dim);
1121 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
1122
1123 if (dim > 1)
1124 uniform->i[1] = u_minify(tex->texture->height0,
1125 tex->u.tex.first_level);
1126
1127 if (dim > 2)
1128 uniform->i[2] = u_minify(tex->texture->depth0,
1129 tex->u.tex.first_level);
1130
1131 if (is_array)
1132 uniform->i[dim] = tex->texture->array_size;
1133 }
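/* For illustration: a 256x128 2D array texture with 16 layers, viewed with
 * first_level = 2, produces uniform->i = { 64, 32, 16 } -- width and height
 * minified by the base level, with the array size stored one component past
 * the last spatial dimension. (Worked example of the code above.) */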
1134
1135 static void
1136 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
1137 enum pipe_shader_type st,
1138 unsigned ssbo_id,
1139 struct sysval_uniform *uniform)
1140 {
1141 struct panfrost_context *ctx = batch->ctx;
1142
1143 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
1144 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
1145
1146 /* Compute address */
1147 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
1148
1149 panfrost_batch_add_bo(batch, bo,
1150 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
1151 panfrost_bo_access_for_stage(st));
1152
1153 /* Upload address and size as sysval */
1154 uniform->du[0] = bo->gpu + sb.buffer_offset;
1155 uniform->u[2] = sb.buffer_size;
1156 }
1157
1158 static void
1159 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1160 enum pipe_shader_type st,
1161 unsigned samp_idx,
1162 struct sysval_uniform *uniform)
1163 {
1164 struct panfrost_context *ctx = batch->ctx;
1165 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1166
1167 uniform->f[0] = sampl->min_lod;
1168 uniform->f[1] = sampl->max_lod;
1169 uniform->f[2] = sampl->lod_bias;
1170
1171 /* Even without any errata, Midgard represents "no mipmapping" as
1172 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1173 * panfrost_create_sampler_state which also explains our choice of
1174 * epsilon value (again to keep behaviour consistent) */
1175
1176 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1177 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1178 }
1179
1180 static void
1181 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1182 struct sysval_uniform *uniform)
1183 {
1184 struct panfrost_context *ctx = batch->ctx;
1185
1186 uniform->u[0] = ctx->compute_grid->grid[0];
1187 uniform->u[1] = ctx->compute_grid->grid[1];
1188 uniform->u[2] = ctx->compute_grid->grid[2];
1189 }
1190
1191 static void
1192 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1193 struct panfrost_shader_state *ss,
1194 enum pipe_shader_type st)
1195 {
1196 struct sysval_uniform *uniforms = (void *)buf;
1197
1198 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1199 int sysval = ss->sysval[i];
1200
1201 switch (PAN_SYSVAL_TYPE(sysval)) {
1202 case PAN_SYSVAL_VIEWPORT_SCALE:
1203 panfrost_upload_viewport_scale_sysval(batch,
1204 &uniforms[i]);
1205 break;
1206 case PAN_SYSVAL_VIEWPORT_OFFSET:
1207 panfrost_upload_viewport_offset_sysval(batch,
1208 &uniforms[i]);
1209 break;
1210 case PAN_SYSVAL_TEXTURE_SIZE:
1211 panfrost_upload_txs_sysval(batch, st,
1212 PAN_SYSVAL_ID(sysval),
1213 &uniforms[i]);
1214 break;
1215 case PAN_SYSVAL_SSBO:
1216 panfrost_upload_ssbo_sysval(batch, st,
1217 PAN_SYSVAL_ID(sysval),
1218 &uniforms[i]);
1219 break;
1220 case PAN_SYSVAL_NUM_WORK_GROUPS:
1221 panfrost_upload_num_work_groups_sysval(batch,
1222 &uniforms[i]);
1223 break;
1224 case PAN_SYSVAL_SAMPLER:
1225 panfrost_upload_sampler_sysval(batch, st,
1226 PAN_SYSVAL_ID(sysval),
1227 &uniforms[i]);
1228 break;
1229 default:
1230 assert(0);
1231 }
1232 }
1233 }
1234
1235 static const void *
1236 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1237 unsigned index)
1238 {
1239 struct pipe_constant_buffer *cb = &buf->cb[index];
1240 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1241
1242 if (rsrc)
1243 return rsrc->bo->cpu;
1244 else if (cb->user_buffer)
1245 return cb->user_buffer;
1246 else
1247 unreachable("No constant buffer");
1248 }
1249
1250 void
1251 panfrost_emit_const_buf(struct panfrost_batch *batch,
1252 enum pipe_shader_type stage,
1253 struct mali_vertex_tiler_postfix *postfix)
1254 {
1255 struct panfrost_context *ctx = batch->ctx;
1256 struct panfrost_shader_variants *all = ctx->shader[stage];
1257
1258 if (!all)
1259 return;
1260
1261 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1262
1263 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1264
1265 /* Uniforms are implicitly UBO #0 */
1266 bool has_uniforms = buf->enabled_mask & (1 << 0);
1267
1268 /* Allocate room for the sysval and the uniforms */
1269 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1270 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1271 size_t size = sys_size + uniform_size;
1272 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1273 size);
1274
1275 /* Upload sysvals requested by the shader */
1276 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1277
1278 /* Upload uniforms */
1279 if (has_uniforms && uniform_size) {
1280 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1281 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1282 }
1283
1284 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1285 * uploaded */
1286
1287 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1288 assert(ubo_count >= 1);
1289
1290 size_t sz = sizeof(uint64_t) * ubo_count;
1291 uint64_t ubos[PAN_MAX_CONST_BUFFERS];
1292 int uniform_count = ss->uniform_count;
1293
1294 /* Upload uniforms as a UBO */
1295 ubos[0] = MALI_MAKE_UBO(2 + uniform_count, transfer.gpu);
1296
1297 /* The rest are honest-to-goodness UBOs */
1298
1299 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1300 size_t usz = buf->cb[ubo].buffer_size;
1301 bool enabled = buf->enabled_mask & (1 << ubo);
1302 bool empty = usz == 0;
1303
1304 if (!enabled || empty) {
1305 /* Stub out disabled UBOs to catch accesses */
1306 ubos[ubo] = MALI_MAKE_UBO(0, 0xDEAD0000);
1307 continue;
1308 }
1309
1310 mali_ptr gpu = panfrost_map_constant_buffer_gpu(batch, stage,
1311 buf, ubo);
1312
1313 unsigned bytes_per_field = 16;
1314 unsigned aligned = ALIGN_POT(usz, bytes_per_field);
1315 ubos[ubo] = MALI_MAKE_UBO(aligned / bytes_per_field, gpu);
1316 }
1317
1318 mali_ptr ubufs = panfrost_upload_transient(batch, ubos, sz);
1319 postfix->uniforms = transfer.gpu;
1320 postfix->uniform_buffers = ubufs;
1321
1322 buf->dirty_mask = 0;
1323 }
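/* For illustration: MALI_MAKE_UBO() takes a size in 16-byte fields, so a
 * 100-byte UBO above is advertised as ALIGN_POT(100, 16) / 16 == 7 fields,
 * while disabled or empty slots get a zero-field entry at the poison
 * address 0xDEAD0000 to catch stray accesses. (Worked example of the
 * arithmetic above, nothing more.) */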
1324
1325 void
1326 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1327 const struct pipe_grid_info *info,
1328 struct midgard_payload_vertex_tiler *vtp)
1329 {
1330 struct panfrost_context *ctx = batch->ctx;
1331 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1332 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1333 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1334 128));
1335 unsigned shared_size = single_size * info->grid[0] * info->grid[1] *
1336 info->grid[2] * 4;
1337 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1338 shared_size,
1339 1);
1340
1341 struct mali_shared_memory shared = {
1342 .shared_memory = bo->gpu,
1343 .shared_workgroup_count =
1344 util_logbase2_ceil(info->grid[0]) +
1345 util_logbase2_ceil(info->grid[1]) +
1346 util_logbase2_ceil(info->grid[2]),
1347 .shared_unk1 = 0x2,
1348 .shared_shift = util_logbase2(single_size) - 1
1349 };
1350
1351 vtp->postfix.shared_memory = panfrost_upload_transient(batch, &shared,
1352 sizeof(shared));
1353 }
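/* For illustration, assuming ss->shared_size = 200 and an 8x8x1 grid: the
 * per-workgroup size is rounded up to util_next_power_of_two(200) = 256
 * bytes, the BO request is 256 * 8 * 8 * 1 * 4 = 65536 bytes,
 * shared_workgroup_count = 3 + 3 + 0 = 6 and shared_shift =
 * log2(256) - 1 = 7. (A worked instance of the arithmetic above; why the
 * extra "* 4" and "- 1" are needed is a hardware detail not explained
 * here.) */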
1354
1355 static mali_ptr
1356 panfrost_get_tex_desc(struct panfrost_batch *batch,
1357 enum pipe_shader_type st,
1358 struct panfrost_sampler_view *view)
1359 {
1360 if (!view)
1361 return (mali_ptr) 0;
1362
1363 struct pipe_sampler_view *pview = &view->base;
1364 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1365
1366 /* Add the BO to the job so it's retained until the job is done. */
1367
1368 panfrost_batch_add_bo(batch, rsrc->bo,
1369 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1370 panfrost_bo_access_for_stage(st));
1371
1372 panfrost_batch_add_bo(batch, view->midgard_bo,
1373 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1374 panfrost_bo_access_for_stage(st));
1375
1376 return view->midgard_bo->gpu;
1377 }
1378
1379 void
1380 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1381 enum pipe_shader_type stage,
1382 struct mali_vertex_tiler_postfix *postfix)
1383 {
1384 struct panfrost_context *ctx = batch->ctx;
1385 struct panfrost_device *device = pan_device(ctx->base.screen);
1386
1387 if (!ctx->sampler_view_count[stage])
1388 return;
1389
1390 if (device->quirks & IS_BIFROST) {
1391 struct bifrost_texture_descriptor *descriptors;
1392
1393 descriptors = malloc(sizeof(struct bifrost_texture_descriptor) *
1394 ctx->sampler_view_count[stage]);
1395
1396 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1397 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1398 struct pipe_sampler_view *pview = &view->base;
1399 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1400
1401 /* Add the BOs to the job so they are retained until the job is done. */
1402
1403 panfrost_batch_add_bo(batch, rsrc->bo,
1404 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1405 panfrost_bo_access_for_stage(stage));
1406
1407 panfrost_batch_add_bo(batch, view->bifrost_bo,
1408 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1409 panfrost_bo_access_for_stage(stage));
1410
1411 memcpy(&descriptors[i], view->bifrost_descriptor, sizeof(*view->bifrost_descriptor));
1412 }
1413
1414 postfix->textures = panfrost_upload_transient(batch,
1415 descriptors,
1416 sizeof(struct bifrost_texture_descriptor) *
1417 ctx->sampler_view_count[stage]);
1418
1419 free(descriptors);
1420 } else {
1421 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1422
1423 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i)
1424 trampolines[i] = panfrost_get_tex_desc(batch, stage,
1425 ctx->sampler_views[stage][i]);
1426
1427 postfix->textures = panfrost_upload_transient(batch,
1428 trampolines,
1429 sizeof(uint64_t) *
1430 ctx->sampler_view_count[stage]);
1431 }
1432 }
1433
1434 void
1435 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1436 enum pipe_shader_type stage,
1437 struct mali_vertex_tiler_postfix *postfix)
1438 {
1439 struct panfrost_context *ctx = batch->ctx;
1440 struct panfrost_device *device = pan_device(ctx->base.screen);
1441
1442 if (!ctx->sampler_count[stage])
1443 return;
1444
1445 if (device->quirks & IS_BIFROST) {
1446 size_t desc_size = sizeof(struct bifrost_sampler_descriptor);
1447 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1448 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1449 transfer_size);
1450 struct bifrost_sampler_descriptor *desc = (struct bifrost_sampler_descriptor *)transfer.cpu;
1451
1452 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1453 desc[i] = ctx->samplers[stage][i]->bifrost_hw;
1454
1455 postfix->sampler_descriptor = transfer.gpu;
1456 } else {
1457 size_t desc_size = sizeof(struct mali_sampler_descriptor);
1458 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1459 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1460 transfer_size);
1461 struct mali_sampler_descriptor *desc = (struct mali_sampler_descriptor *)transfer.cpu;
1462
1463 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1464 desc[i] = ctx->samplers[stage][i]->midgard_hw;
1465
1466 postfix->sampler_descriptor = transfer.gpu;
1467 }
1468 }
1469
1470 void
1471 panfrost_emit_vertex_attr_meta(struct panfrost_batch *batch,
1472 struct mali_vertex_tiler_postfix *vertex_postfix)
1473 {
1474 struct panfrost_context *ctx = batch->ctx;
1475
1476 if (!ctx->vertex)
1477 return;
1478
1479 struct panfrost_vertex_state *so = ctx->vertex;
1480
1481 panfrost_vertex_state_upd_attr_offs(ctx, vertex_postfix);
1482 vertex_postfix->attribute_meta = panfrost_upload_transient(batch, so->hw,
1483 sizeof(*so->hw) *
1484 PAN_MAX_ATTRIBUTE);
1485 }
1486
1487 void
1488 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1489 struct mali_vertex_tiler_postfix *vertex_postfix)
1490 {
1491 struct panfrost_context *ctx = batch->ctx;
1492 struct panfrost_vertex_state *so = ctx->vertex;
1493
1494 /* Staged mali_attr, and index into them. i =/= k, depending on the
1495 * vertex buffer mask and instancing. Twice as much room is allocated,
1496 * for a worst case of NPOT_DIVIDEs, which take up an extra slot */
1497 union mali_attr attrs[PIPE_MAX_ATTRIBS * 2];
1498 unsigned k = 0;
1499
1500 for (unsigned i = 0; i < so->num_elements; ++i) {
1501 /* We map a mali_attr to be 1:1 with the mali_attr_meta, which
1502 * means duplicating some vertex buffers (who cares? aside from
1503 * maybe some caching implications but I somehow doubt that
1504 * matters) */
1505
1506 struct pipe_vertex_element *elem = &so->pipe[i];
1507 unsigned vbi = elem->vertex_buffer_index;
1508
1509 /* The exception to 1:1 mapping is that we can have multiple
1510 * entries (NPOT divisors), so we fix up anyway */
1511
1512 so->hw[i].index = k;
1513
1514 if (!(ctx->vb_mask & (1 << vbi)))
1515 continue;
1516
1517 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1518 struct panfrost_resource *rsrc;
1519
1520 rsrc = pan_resource(buf->buffer.resource);
1521 if (!rsrc)
1522 continue;
1523
1524 /* Align to 64 bytes by masking off the lower bits. This
1525 * will be adjusted back when we fixup the src_offset in
1526 * mali_attr_meta */
1527
1528 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1529 mali_ptr addr = raw_addr & ~63;
1530 unsigned chopped_addr = raw_addr - addr;
1531
1532 /* Add a dependency of the batch on the vertex buffer */
1533 panfrost_batch_add_bo(batch, rsrc->bo,
1534 PAN_BO_ACCESS_SHARED |
1535 PAN_BO_ACCESS_READ |
1536 PAN_BO_ACCESS_VERTEX_TILER);
1537
1538 /* Set common fields */
1539 attrs[k].elements = addr;
1540 attrs[k].stride = buf->stride;
1541
1542 /* Since we advanced the base pointer, we shrink the buffer
1543 * size */
1544 attrs[k].size = rsrc->base.width0 - buf->buffer_offset;
1545
1546 /* We need to add the extra size we masked off (for
1547 * correctness) so the data doesn't get clamped away */
1548 attrs[k].size += chopped_addr;
1549
1550 /* For the non-instanced case, make sure we initialize */
1551 attrs[k].shift = attrs[k].extra_flags = 0;
1552
1553 /* Instancing uses a dramatically different code path than
1554 * linear, so dispatch for the actual emission now that the
1555 * common code is finished */
1556
1557 unsigned divisor = elem->instance_divisor;
1558
1559 if (divisor && ctx->instance_count == 1) {
1560 /* Silly corner case where there's a divisor(=1) but
1561 * there's no legitimate instancing. So we want *every*
1562 * attribute to be the same. So set stride to zero so
1563 * we don't go anywhere. */
1564
1565 attrs[k].size = attrs[k].stride + chopped_addr;
1566 attrs[k].stride = 0;
1567 attrs[k++].elements |= MALI_ATTR_LINEAR;
1568 } else if (ctx->instance_count <= 1) {
1569 /* Normal, non-instanced attributes */
1570 attrs[k++].elements |= MALI_ATTR_LINEAR;
1571 } else {
1572 unsigned instance_shift = vertex_postfix->instance_shift;
1573 unsigned instance_odd = vertex_postfix->instance_odd;
1574
1575 k += panfrost_vertex_instanced(ctx->padded_count,
1576 instance_shift,
1577 instance_odd,
1578 divisor, &attrs[k]);
1579 }
1580 }
1581
1582 /* Add special gl_VertexID/gl_InstanceID buffers */
1583
1584 panfrost_vertex_id(ctx->padded_count, &attrs[k]);
1585 so->hw[PAN_VERTEX_ID].index = k++;
1586 panfrost_instance_id(ctx->padded_count, &attrs[k]);
1587 so->hw[PAN_INSTANCE_ID].index = k++;
1588
1589 /* Upload whatever we emitted and go */
1590
1591 vertex_postfix->attributes = panfrost_upload_transient(batch, attrs,
1592 k * sizeof(*attrs));
1593 }
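/* For illustration of the 64-byte alignment above: a vertex buffer at
 * rsrc->bo->gpu + buffer_offset = 0x10000024 yields addr = 0x10000000 and
 * chopped_addr = 0x24, so the record points 36 bytes early and its size
 * grows by 36 bytes; per the comment above, those 36 bytes are expected to
 * be added back into src_offset when the attribute metadata is fixed up
 * (presumably in panfrost_vertex_state_upd_attr_offs()). (Worked example,
 * not a statement about the fixup's implementation.) */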
1594
1595 static mali_ptr
1596 panfrost_emit_varyings(struct panfrost_batch *batch, union mali_attr *slot,
1597 unsigned stride, unsigned count)
1598 {
1599 /* Fill out the descriptor */
1600 slot->stride = stride;
1601 slot->size = stride * count;
1602 slot->shift = slot->extra_flags = 0;
1603
1604 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1605 slot->size);
1606
1607 slot->elements = transfer.gpu | MALI_ATTR_LINEAR;
1608
1609 return transfer.gpu;
1610 }
1611
1612 static void
1613 panfrost_emit_streamout(struct panfrost_batch *batch, union mali_attr *slot,
1614 unsigned stride, unsigned offset, unsigned count,
1615 struct pipe_stream_output_target *target)
1616 {
1617 /* Fill out the descriptor */
1618 slot->stride = stride * 4;
1619 slot->shift = slot->extra_flags = 0;
1620
1621 unsigned max_size = target->buffer_size;
1622 unsigned expected_size = slot->stride * count;
1623
1624 slot->size = MIN2(max_size, expected_size);
1625
1626 /* Grab the BO and bind it to the batch */
1627 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1628
1629 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1630 * the perspective of the TILER and FRAGMENT.
1631 */
1632 panfrost_batch_add_bo(batch, bo,
1633 PAN_BO_ACCESS_SHARED |
1634 PAN_BO_ACCESS_RW |
1635 PAN_BO_ACCESS_VERTEX_TILER |
1636 PAN_BO_ACCESS_FRAGMENT);
1637
1638 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * slot->stride);
1639 slot->elements = addr;
1640 }
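/* For illustration: gallium expresses stream-output strides in dwords,
 * hence the "* 4" above. Capturing a vec4 (stride 4 dwords) for 100
 * vertices gives slot->stride = 16 bytes and expected_size = 1600 bytes,
 * clamped to the target's buffer_size when the buffer is smaller. (Worked
 * example of the arithmetic above.) */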
1641
1642 /* Given a shader and buffer indices, link varying metadata together */
1643
1644 static bool
1645 is_special_varying(gl_varying_slot loc)
1646 {
1647 switch (loc) {
1648 case VARYING_SLOT_POS:
1649 case VARYING_SLOT_PSIZ:
1650 case VARYING_SLOT_PNTC:
1651 case VARYING_SLOT_FACE:
1652 return true;
1653 default:
1654 return false;
1655 }
1656 }
1657
1658 static void
1659 panfrost_emit_varying_meta(void *outptr, struct panfrost_shader_state *ss,
1660 signed general, signed gl_Position,
1661 signed gl_PointSize, signed gl_PointCoord,
1662 signed gl_FrontFacing)
1663 {
1664 struct mali_attr_meta *out = (struct mali_attr_meta *) outptr;
1665
1666 for (unsigned i = 0; i < ss->varying_count; ++i) {
1667 gl_varying_slot location = ss->varyings_loc[i];
1668 int index = -1;
1669
1670 switch (location) {
1671 case VARYING_SLOT_POS:
1672 index = gl_Position;
1673 break;
1674 case VARYING_SLOT_PSIZ:
1675 index = gl_PointSize;
1676 break;
1677 case VARYING_SLOT_PNTC:
1678 index = gl_PointCoord;
1679 break;
1680 case VARYING_SLOT_FACE:
1681 index = gl_FrontFacing;
1682 break;
1683 default:
1684 index = general;
1685 break;
1686 }
1687
1688 assert(index >= 0);
1689 out[i].index = index;
1690 }
1691 }
1692
1693 static bool
1694 has_point_coord(unsigned mask, gl_varying_slot loc)
1695 {
1696 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1697 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1698 else if (loc == VARYING_SLOT_PNTC)
1699 return (mask & (1 << 8));
1700 else
1701 return false;
1702 }
1703
1704 /* Helpers for manipulating stream out information so we can pack varyings
1705 * accordingly. Compute the src_offset for a given captured varying */
1706
1707 static struct pipe_stream_output *
1708 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1709 {
1710 for (unsigned i = 0; i < info->num_outputs; ++i) {
1711 if (info->output[i].register_index == loc)
1712 return &info->output[i];
1713 }
1714
1715 unreachable("Varying not captured");
1716 }
1717
1718 void
1719 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1720 unsigned vertex_count,
1721 struct mali_vertex_tiler_postfix *vertex_postfix,
1722 struct mali_vertex_tiler_postfix *tiler_postfix,
1723 union midgard_primitive_size *primitive_size)
1724 {
1725 /* Load the shaders */
1726 struct panfrost_context *ctx = batch->ctx;
1727 struct panfrost_shader_state *vs, *fs;
1728 unsigned int num_gen_varyings = 0;
1729 size_t vs_size, fs_size;
1730
1731 /* Allocate the varying descriptor */
1732
1733 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1734 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1735 vs_size = sizeof(struct mali_attr_meta) * vs->varying_count;
1736 fs_size = sizeof(struct mali_attr_meta) * fs->varying_count;
1737
1738 struct panfrost_transfer trans = panfrost_allocate_transient(batch,
1739 vs_size +
1740 fs_size);
1741
1742 struct pipe_stream_output_info *so = &vs->stream_output;
1743
1744 /* Check if this varying is linked by us. This is the case for
1745 * general-purpose, non-captured varyings. If it is, link it. If it's
1746 * not, use the provided stream out information to determine the
1747 * offset, since it was already linked for us. */
1748
1749 for (unsigned i = 0; i < vs->varying_count; i++) {
1750 gl_varying_slot loc = vs->varyings_loc[i];
1751
1752 bool special = is_special_varying(loc);
1753 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1754
1755 if (captured) {
1756 struct pipe_stream_output *o = pan_get_so(so, loc);
1757
1758 unsigned dst_offset = o->dst_offset * 4; /* dwords */
1759 vs->varyings[i].src_offset = dst_offset;
1760 } else if (!special) {
1761 vs->varyings[i].src_offset = 16 * (num_gen_varyings++);
1762 }
1763 }
1764
1765 /* For the captured varyings above, src_offset came from the stream out
1766  * info: that layout is defined by the stream out state, not by us */
1767
1768 /* Link up with fragment varyings */
1769 bool reads_point_coord = fs->reads_point_coord;
1770
1771 for (unsigned i = 0; i < fs->varying_count; i++) {
1772 gl_varying_slot loc = fs->varyings_loc[i];
1773 unsigned src_offset;
1774 signed vs_idx = -1;
1775
1776 /* Link up */
1777 for (unsigned j = 0; j < vs->varying_count; ++j) {
1778 if (vs->varyings_loc[j] == loc) {
1779 vs_idx = j;
1780 break;
1781 }
1782 }
1783
1784 /* Either assign or reuse */
1785 if (vs_idx >= 0)
1786 src_offset = vs->varyings[vs_idx].src_offset;
1787 else
1788 src_offset = 16 * (num_gen_varyings++);
1789
1790 fs->varyings[i].src_offset = src_offset;
1791
1792 if (has_point_coord(fs->point_sprite_mask, loc))
1793 reads_point_coord = true;
1794 }
1795
1796 memcpy(trans.cpu, vs->varyings, vs_size);
1797 memcpy(trans.cpu + vs_size, fs->varyings, fs_size);
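/* The linked attribute records are staged now; the buffer indices they
 * reference are assigned by panfrost_emit_varying_meta below and then
 * patched for streamout capture and point sprite replacement. */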
1798
1799 union mali_attr varyings[PIPE_MAX_ATTRIBS] = {0};
1800
1801 /* Figure out how many streamout buffers could be bound */
1802 unsigned so_count = ctx->streamout.num_targets;
1803 for (unsigned i = 0; i < vs->varying_count; i++) {
1804 gl_varying_slot loc = vs->varyings_loc[i];
1805
1806 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1807 if (!captured) continue;
1808
1809 struct pipe_stream_output *o = pan_get_so(so, loc);
1810 so_count = MAX2(so_count, o->output_buffer + 1);
1811 }
1812
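/* Varying buffers are allocated with the streamout buffers first (indices
 * [0, so_count)), then the general varying buffer, then one buffer per
 * special varying that is actually used. As a hypothetical example, with
 * one streamout buffer bound, a vertex shader writing gl_PointSize and a
 * fragment shader reading gl_PointCoord (but not gl_FrontFacing or
 * gl_FragCoord), the layout would be:
 *
 *   0: streamout buffer 0
 *   1: general varyings
 *   2: gl_Position
 *   3: gl_PointSize
 *   4: gl_PointCoord
 */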
1813 signed idx = so_count;
1814 signed general = idx++;
1815 signed gl_Position = idx++;
1816 signed gl_PointSize = vs->writes_point_size ? (idx++) : -1;
1817 signed gl_PointCoord = reads_point_coord ? (idx++) : -1;
1818 signed gl_FrontFacing = fs->reads_face ? (idx++) : -1;
1819 signed gl_FragCoord = fs->reads_frag_coord ? (idx++) : -1;
1820
1821 /* Emit the stream out buffers */
1822
1823 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1824 ctx->vertex_count);
1825
1826 for (unsigned i = 0; i < so_count; ++i) {
1827 if (i < ctx->streamout.num_targets) {
1828 panfrost_emit_streamout(batch, &varyings[i],
1829 so->stride[i],
1830 ctx->streamout.offsets[i],
1831 out_count,
1832 ctx->streamout.targets[i]);
1833 } else {
1834 /* Emit a dummy buffer */
1835 panfrost_emit_varyings(batch, &varyings[i],
1836 so->stride[i] * 4,
1837 out_count);
1838
1839 /* Clear the attribute type */
1840 varyings[i].elements &= ~0xF;
1841 }
1842 }
1843
1844 panfrost_emit_varyings(batch, &varyings[general],
1845 num_gen_varyings * 16,
1846 vertex_count);
1847
1848 mali_ptr varyings_p;
1849
1850 /* fp32 vec4 gl_Position */
1851 varyings_p = panfrost_emit_varyings(batch, &varyings[gl_Position],
1852 sizeof(float) * 4, vertex_count);
1853 tiler_postfix->position_varying = varyings_p;
1854
1855
1856 if (panfrost_writes_point_size(ctx)) {
1857 varyings_p = panfrost_emit_varyings(batch,
1858 &varyings[gl_PointSize],
1859 2, vertex_count);
1860 primitive_size->pointer = varyings_p;
1861 }
1862
1863 if (reads_point_coord)
1864 varyings[gl_PointCoord].elements = MALI_VARYING_POINT_COORD;
1865
1866 if (fs->reads_face)
1867 varyings[gl_FrontFacing].elements = MALI_VARYING_FRONT_FACING;
1868
1869 if (fs->reads_frag_coord)
1870 varyings[gl_FragCoord].elements = MALI_VARYING_FRAG_COORD;
1871
1872 struct panfrost_device *device = pan_device(ctx->base.screen);
1873 assert(!(device->quirks & IS_BIFROST) || !(reads_point_coord));
1874
1875 /* Let's go ahead and link varying meta to the buffer in question,
1876  * now that that information is available. VARYING_SLOT_POS is mapped
1877  * to gl_FragCoord for fragment shaders but to gl_Position for vertex
1878  * shaders. */
1879
1880 panfrost_emit_varying_meta(trans.cpu, vs, general, gl_Position,
1881 gl_PointSize, gl_PointCoord,
1882 gl_FrontFacing);
1883
1884 panfrost_emit_varying_meta(trans.cpu + vs_size, fs, general,
1885 gl_FragCoord, gl_PointSize,
1886 gl_PointCoord, gl_FrontFacing);
1887
1888 /* Replace streamout */
1889
1890 struct mali_attr_meta *ovs = (struct mali_attr_meta *)trans.cpu;
1891 struct mali_attr_meta *ofs = ovs + vs->varying_count;
1892
1893 for (unsigned i = 0; i < vs->varying_count; i++) {
1894 gl_varying_slot loc = vs->varyings_loc[i];
1895
1896 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1897 if (!captured)
1898 continue;
1899
1900 struct pipe_stream_output *o = pan_get_so(so, loc);
1901 ovs[i].index = o->output_buffer;
1902
1903 assert(o->stream == 0);
1904 ovs[i].format = (vs->varyings[i].format & ~MALI_NR_CHANNELS(4))
1905 | MALI_NR_CHANNELS(o->num_components);
1906
1907 if (device->quirks & HAS_SWIZZLES)
1908 ovs[i].swizzle = panfrost_get_default_swizzle(o->num_components);
1909 else
1910 ovs[i].swizzle = panfrost_bifrost_swizzle(o->num_components);
1911
1912 /* Link to the fragment */
1913 signed fs_idx = -1;
1914
1915 /* Link up */
1916 for (unsigned j = 0; j < fs->varying_count; ++j) {
1917 if (fs->varyings_loc[j] == loc) {
1918 fs_idx = j;
1919 break;
1920 }
1921 }
1922
1923 if (fs_idx >= 0) {
1924 ofs[fs_idx].index = ovs[i].index;
1925 ofs[fs_idx].format = ovs[i].format;
1926 ofs[fs_idx].swizzle = ovs[i].swizzle;
1927 }
1928 }
1929
1930 /* Replace point sprite */
1931 for (unsigned i = 0; i < fs->varying_count; i++) {
1932 /* If we have a point sprite replacement, handle that here. We
1933  * have to translate the location first. TODO: flip Y in the shader
1934  * instead; we already key the shader, this is just a time crunch. */
1935
1936 if (has_point_coord(fs->point_sprite_mask,
1937 fs->varyings_loc[i])) {
1938 ofs[i].index = gl_PointCoord;
1939
1940 /* Swizzle out the z/w to 0/1 */
1941 ofs[i].format = MALI_RG16F;
1942 ofs[i].swizzle = panfrost_get_default_swizzle(2);
1943 }
1944 }
1945
1946 /* Fix up unaligned addresses */
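/* Streamout buffer addresses are not necessarily 64-byte aligned. For each
 * such record, move any misalignment out of the address bits and into the
 * src_offset of every attribute that reads the buffer, growing the recorded
 * size to compensate. */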
1947 for (unsigned i = 0; i < so_count; ++i) {
1948 if (varyings[i].elements < MALI_RECORD_SPECIAL)
1949 continue;
1950
1951 unsigned align = (varyings[i].elements & 63);
1952
1953 /* While we're at it, the SO buffers are linear */
1954
1955 if (!align) {
1956 varyings[i].elements |= MALI_ATTR_LINEAR;
1957 continue;
1958 }
1959
1960 /* We need to adjust alignment */
1961 varyings[i].elements &= ~63;
1962 varyings[i].elements |= MALI_ATTR_LINEAR;
1963 varyings[i].size += align;
1964
1965 for (unsigned v = 0; v < vs->varying_count; ++v) {
1966 if (ovs[v].index != i)
1967 continue;
1968
1969 ovs[v].src_offset = vs->varyings[v].src_offset + align;
1970 }
1971
1972 for (unsigned f = 0; f < fs->varying_count; ++f) {
1973 if (ofs[f].index != i)
1974 continue;
1975
1976 ofs[f].src_offset = fs->varyings[f].src_offset + align;
1977 }
1978 }
1979
1980 varyings_p = panfrost_upload_transient(batch, varyings,
1981 idx * sizeof(*varyings));
1982 vertex_postfix->varyings = varyings_p;
1983 tiler_postfix->varyings = varyings_p;
1984
1985 vertex_postfix->varying_meta = trans.gpu;
1986 tiler_postfix->varying_meta = trans.gpu + vs_size;
1987 }
1988
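/* Wrap the prepared prefix/postfix structures in the Bifrost or Midgard
 * payload layout as appropriate and enqueue the vertex and tiler jobs,
 * skipping the tiler job entirely when rasterizer discard is enabled. */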
1989 void
1990 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
1991 struct mali_vertex_tiler_prefix *vertex_prefix,
1992 struct mali_vertex_tiler_postfix *vertex_postfix,
1993 struct mali_vertex_tiler_prefix *tiler_prefix,
1994 struct mali_vertex_tiler_postfix *tiler_postfix,
1995 union midgard_primitive_size *primitive_size)
1996 {
1997 struct panfrost_context *ctx = batch->ctx;
1998 struct panfrost_device *device = pan_device(ctx->base.screen);
1999 bool wallpapering = ctx->wallpaper_batch && batch->tiler_dep;
2000 struct bifrost_payload_vertex bifrost_vertex = {0,};
2001 struct bifrost_payload_tiler bifrost_tiler = {0,};
2002 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
2003 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
2004 void *vp, *tp;
2005 size_t vp_size, tp_size;
2006
2007 if (device->quirks & IS_BIFROST) {
2008 bifrost_vertex.prefix = *vertex_prefix;
2009 bifrost_vertex.postfix = *vertex_postfix;
2010 vp = &bifrost_vertex;
2011 vp_size = sizeof(bifrost_vertex);
2012
2013 bifrost_tiler.prefix = *tiler_prefix;
2014 bifrost_tiler.tiler.primitive_size = *primitive_size;
2015 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
2016 bifrost_tiler.postfix = *tiler_postfix;
2017 tp = &bifrost_tiler;
2018 tp_size = sizeof(bifrost_tiler);
2019 } else {
2020 midgard_vertex.prefix = *vertex_prefix;
2021 midgard_vertex.postfix = *vertex_postfix;
2022 vp = &midgard_vertex;
2023 vp_size = sizeof(midgard_vertex);
2024
2025 midgard_tiler.prefix = *tiler_prefix;
2026 midgard_tiler.postfix = *tiler_postfix;
2027 midgard_tiler.primitive_size = *primitive_size;
2028 tp = &midgard_tiler;
2029 tp_size = sizeof(midgard_tiler);
2030 }
2031
2032 if (wallpapering) {
2033 /* Inject in reverse order, with "predicted" job indices.
2034 * THIS IS A HACK XXX */
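/* The tiler job is injected before the vertex job it depends on, so it
 * presumably references the index that vertex job is expected to receive
 * (batch->job_index + 2) rather than a real, already-assigned index. */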
2035 panfrost_new_job(batch, JOB_TYPE_TILER, false,
2036 batch->job_index + 2, tp, tp_size, true);
2037 panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
2038 vp, vp_size, true);
2039 return;
2040 }
2041
2042 /* If rasterizer discard is enabled, only submit the vertex job */
2043
2044 bool rasterizer_discard = ctx->rasterizer &&
2045 ctx->rasterizer->base.rasterizer_discard;
2046
2047 unsigned vertex = panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
2048 vp, vp_size, false);
2049
2050 if (rasterizer_discard)
2051 return;
2052
2053 panfrost_new_job(batch, JOB_TYPE_TILER, false, vertex, tp, tp_size,
2054 false);
2055 }
2056
2057 /* TODO: stop hardcoding this */
2058 mali_ptr
2059 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2060 {
2061 uint16_t locations[] = {
2062 128, 128,
2063 0, 256,
2064 0, 256,
2065 0, 256,
2066 0, 256,
2067 0, 256,
2068 0, 256,
2069 0, 256,
2070 0, 256,
2071 0, 256,
2072 0, 256,
2073 0, 256,
2074 0, 256,
2075 0, 256,
2076 0, 256,
2077 0, 256,
2078 0, 256,
2079 0, 256,
2080 0, 256,
2081 0, 256,
2082 0, 256,
2083 0, 256,
2084 0, 256,
2085 0, 256,
2086 0, 256,
2087 0, 256,
2088 0, 256,
2089 0, 256,
2090 0, 256,
2091 0, 256,
2092 0, 256,
2093 0, 256,
2094 128, 128,
2095 0, 0,
2096 0, 0,
2097 0, 0,
2098 0, 0,
2099 0, 0,
2100 0, 0,
2101 0, 0,
2102 0, 0,
2103 0, 0,
2104 0, 0,
2105 0, 0,
2106 0, 0,
2107 0, 0,
2108 0, 0,
2109 0, 0,
2110 };
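/* 48 (x, y) pairs, 96 halfwords in total, uploaded verbatim; the layout the
 * hardware expects is hardcoded here for now (see the TODO above). */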
2111
2112 return panfrost_upload_transient(batch, locations, 96 * sizeof(uint16_t));
2113 }