panfrost: Share MRT blend flag calculation with Bifrost
[mesa.git] / src / gallium / drivers / panfrost / pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_allocate.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
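/* On Bifrost the postfix's shared_memory field points at a thread-local
 * storage descriptor (stack/scratchpad) for the batch rather than at a
 * framebuffer; this helper uploads that descriptor. (Descriptive note added
 * here; the Midgard path below attaches the framebuffer instead.) */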
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 unsigned shift = panfrost_get_stack_shift(batch->stack_size);
62 struct mali_shared_memory shared = {
63 .stack_shift = shift,
64 .scratchpad = panfrost_batch_get_scratchpad(batch, shift, dev->thread_tls_alloc, dev->core_count)->gpu,
65 .shared_workgroup_count = ~0,
66 };
67 postfix->shared_memory = panfrost_upload_transient(batch, &shared, sizeof(shared));
68 }
69
70 static void
71 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
72 struct mali_vertex_tiler_postfix *postfix)
73 {
74 struct panfrost_device *dev = pan_device(ctx->base.screen);
75 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
76
77 /* If we haven't already, reserve space for the framebuffer */
78
79 if (!batch->framebuffer.gpu) {
80 unsigned size = (dev->quirks & MIDGARD_SFBD) ?
81 sizeof(struct mali_single_framebuffer) :
82 sizeof(struct mali_framebuffer);
83
84 batch->framebuffer = panfrost_allocate_transient(batch, size);
85
86 /* Tag the pointer */
87 if (!(dev->quirks & MIDGARD_SFBD))
88 batch->framebuffer.gpu |= MALI_MFBD;
89 }
90
91 postfix->shared_memory = batch->framebuffer.gpu;
92 }
93
94 static void
95 panfrost_vt_update_rasterizer(struct panfrost_context *ctx,
96 struct mali_vertex_tiler_prefix *prefix,
97 struct mali_vertex_tiler_postfix *postfix)
98 {
99 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
100
101 postfix->gl_enables |= 0x7;
102 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
103 rasterizer && rasterizer->base.front_ccw);
104 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
105 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_FRONT));
106 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
107 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_BACK));
108 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
109 rasterizer && rasterizer->base.flatshade_first);
110 }
111
112 void
113 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
114 struct mali_vertex_tiler_prefix *prefix,
115 union midgard_primitive_size *primitive_size)
116 {
117 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
118
119 if (!panfrost_writes_point_size(ctx)) {
120 bool points = prefix->draw_mode == MALI_POINTS;
121 float val = 0.0f;
122
123 if (rasterizer)
124 val = points ?
125 rasterizer->base.point_size :
126 rasterizer->base.line_width;
127
128 primitive_size->constant = val;
129 }
130 }
131
132 static void
133 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
134 struct mali_vertex_tiler_postfix *postfix)
135 {
136 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
137 if (ctx->occlusion_query)
138 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
139 else
140 postfix->occlusion_counter = 0;
141 }
142
143 void
144 panfrost_vt_init(struct panfrost_context *ctx,
145 enum pipe_shader_type stage,
146 struct mali_vertex_tiler_prefix *prefix,
147 struct mali_vertex_tiler_postfix *postfix)
148 {
149 struct panfrost_device *device = pan_device(ctx->base.screen);
150
151 if (!ctx->shader[stage])
152 return;
153
154 memset(prefix, 0, sizeof(*prefix));
155 memset(postfix, 0, sizeof(*postfix));
156
157 if (device->quirks & IS_BIFROST) {
158 postfix->gl_enables = 0x2;
159 panfrost_vt_emit_shared_memory(ctx, postfix);
160 } else {
161 postfix->gl_enables = 0x6;
162 panfrost_vt_attach_framebuffer(ctx, postfix);
163 }
164
165 if (stage == PIPE_SHADER_FRAGMENT) {
166 panfrost_vt_update_occlusion_query(ctx, postfix);
167 panfrost_vt_update_rasterizer(ctx, prefix, postfix);
168 }
169 }
170
171 static unsigned
172 panfrost_translate_index_size(unsigned size)
173 {
174 switch (size) {
175 case 1:
176 return MALI_DRAW_INDEXED_UINT8;
177
178 case 2:
179 return MALI_DRAW_INDEXED_UINT16;
180
181 case 4:
182 return MALI_DRAW_INDEXED_UINT32;
183
184 default:
185 unreachable("Invalid index size");
186 }
187 }
188
189 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
190  * good for the duration of the draw (transient), though it could last longer. Also get
191 * the bounds on the index buffer for the range accessed by the draw. We do
192 * these operations together because there are natural optimizations which
193 * require them to be together. */
194
195 static mali_ptr
196 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
197 const struct pipe_draw_info *info,
198 unsigned *min_index, unsigned *max_index)
199 {
200 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
201 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
202 off_t offset = info->start * info->index_size;
203 bool needs_indices = true;
204 mali_ptr out = 0;
205
206 if (info->max_index != ~0u) {
207 *min_index = info->min_index;
208 *max_index = info->max_index;
209 needs_indices = false;
210 }
211
212 if (!info->has_user_indices) {
213 /* Only resources can be directly mapped */
214 panfrost_batch_add_bo(batch, rsrc->bo,
215 PAN_BO_ACCESS_SHARED |
216 PAN_BO_ACCESS_READ |
217 PAN_BO_ACCESS_VERTEX_TILER);
218 out = rsrc->bo->gpu + offset;
219
220 /* Check the cache */
221 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
222 info->start,
223 info->count,
224 min_index,
225 max_index);
226 } else {
227 /* Otherwise, we need to upload to transient memory */
228 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
229 out = panfrost_upload_transient(batch, ibuf8 + offset,
230 info->count *
231 info->index_size);
232 }
233
234 if (needs_indices) {
235 /* Fallback */
236 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
237
238 if (!info->has_user_indices)
239 panfrost_minmax_cache_add(rsrc->index_cache,
240 info->start, info->count,
241 *min_index, *max_index);
242 }
243
244 return out;
245 }
246
247 void
248 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
249 const struct pipe_draw_info *info,
250 enum mali_draw_mode draw_mode,
251 struct mali_vertex_tiler_postfix *vertex_postfix,
252 struct mali_vertex_tiler_prefix *tiler_prefix,
253 struct mali_vertex_tiler_postfix *tiler_postfix,
254 unsigned *vertex_count,
255 unsigned *padded_count)
256 {
257 tiler_prefix->draw_mode = draw_mode;
258
259 unsigned draw_flags = 0;
260
261 if (panfrost_writes_point_size(ctx))
262 draw_flags |= MALI_DRAW_VARYING_SIZE;
263
264 if (info->primitive_restart)
265 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
266
267 /* These don't make much sense */
268
269 draw_flags |= 0x3000;
270
271 if (info->index_size) {
272 unsigned min_index = 0, max_index = 0;
273
274 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
275 info,
276 &min_index,
277 &max_index);
278
279 /* Use the corresponding values */
280 *vertex_count = max_index - min_index + 1;
281 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
282 tiler_prefix->offset_bias_correction = -min_index;
283 tiler_prefix->index_count = MALI_POSITIVE(info->count);
284 draw_flags |= panfrost_translate_index_size(info->index_size);
285 } else {
286 tiler_prefix->indices = 0;
287 *vertex_count = ctx->vertex_count;
288 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
289 tiler_prefix->offset_bias_correction = 0;
290 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
291 }
292
293 tiler_prefix->unknown_draw = draw_flags;
294
295 /* Encode the padded vertex count */
296
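/* Note on the encoding below (an explanatory sketch, not from the original
 * source): the padded count is stored as (2 * instance_odd + 1) << instance_shift.
 * __builtin_ctz() recovers the power-of-two part, and the remaining odd factor
 * m is stored halved, i.e. instance_odd = (m - 1) / 2. */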
297 if (info->instance_count > 1) {
298 *padded_count = panfrost_padded_vertex_count(*vertex_count);
299
300 unsigned shift = __builtin_ctz(ctx->padded_count);
301 unsigned k = ctx->padded_count >> (shift + 1);
302
303 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
304 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
305 } else {
306 *padded_count = *vertex_count;
307
308 /* Reset instancing state */
309 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
310 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
311 }
312 }
313
314 static void
315 panfrost_shader_meta_init(struct panfrost_context *ctx,
316 enum pipe_shader_type st,
317 struct mali_shader_meta *meta)
318 {
319 const struct panfrost_device *dev = pan_device(ctx->base.screen);
320 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
321
322 memset(meta, 0, sizeof(*meta));
323 meta->shader = (ss->bo ? ss->bo->gpu : 0) | ss->first_tag;
324 meta->attribute_count = ss->attribute_count;
325 meta->varying_count = ss->varying_count;
326 meta->texture_count = ctx->sampler_view_count[st];
327 meta->sampler_count = ctx->sampler_count[st];
328
329 if (dev->quirks & IS_BIFROST) {
330 if (st == PIPE_SHADER_VERTEX)
331 meta->bifrost1.unk1 = 0x800000;
332 else {
333 /* First clause ATEST |= 0x4000000.
334 * Less than 32 regs |= 0x200 */
335 meta->bifrost1.unk1 = 0x958020;
336 }
337
338 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
339 if (st == PIPE_SHADER_VERTEX)
340 meta->bifrost2.preload_regs = 0xC0;
341 else
342 meta->bifrost2.preload_regs = 0x1;
343 meta->bifrost2.uniform_count = MIN2(ss->uniform_count,
344 ss->uniform_cutoff);
345 } else {
346 meta->midgard1.uniform_count = MIN2(ss->uniform_count,
347 ss->uniform_cutoff);
348 meta->midgard1.work_count = ss->work_reg_count;
349 meta->midgard1.flags_hi = 0x8; /* XXX */
350 meta->midgard1.flags_lo = 0x220;
351 meta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
352 }
353 }
354
355 static unsigned
356 panfrost_translate_compare_func(enum pipe_compare_func in)
357 {
358 switch (in) {
359 case PIPE_FUNC_NEVER:
360 return MALI_FUNC_NEVER;
361
362 case PIPE_FUNC_LESS:
363 return MALI_FUNC_LESS;
364
365 case PIPE_FUNC_EQUAL:
366 return MALI_FUNC_EQUAL;
367
368 case PIPE_FUNC_LEQUAL:
369 return MALI_FUNC_LEQUAL;
370
371 case PIPE_FUNC_GREATER:
372 return MALI_FUNC_GREATER;
373
374 case PIPE_FUNC_NOTEQUAL:
375 return MALI_FUNC_NOTEQUAL;
376
377 case PIPE_FUNC_GEQUAL:
378 return MALI_FUNC_GEQUAL;
379
380 case PIPE_FUNC_ALWAYS:
381 return MALI_FUNC_ALWAYS;
382
383 default:
384 unreachable("Invalid func");
385 }
386 }
387
388 static unsigned
389 panfrost_translate_stencil_op(enum pipe_stencil_op in)
390 {
391 switch (in) {
392 case PIPE_STENCIL_OP_KEEP:
393 return MALI_STENCIL_KEEP;
394
395 case PIPE_STENCIL_OP_ZERO:
396 return MALI_STENCIL_ZERO;
397
398 case PIPE_STENCIL_OP_REPLACE:
399 return MALI_STENCIL_REPLACE;
400
401 case PIPE_STENCIL_OP_INCR:
402 return MALI_STENCIL_INCR;
403
404 case PIPE_STENCIL_OP_DECR:
405 return MALI_STENCIL_DECR;
406
407 case PIPE_STENCIL_OP_INCR_WRAP:
408 return MALI_STENCIL_INCR_WRAP;
409
410 case PIPE_STENCIL_OP_DECR_WRAP:
411 return MALI_STENCIL_DECR_WRAP;
412
413 case PIPE_STENCIL_OP_INVERT:
414 return MALI_STENCIL_INVERT;
415
416 default:
417 unreachable("Invalid stencil op");
418 }
419 }
420
421 static unsigned
422 translate_tex_wrap(enum pipe_tex_wrap w)
423 {
424 switch (w) {
425 case PIPE_TEX_WRAP_REPEAT:
426 return MALI_WRAP_REPEAT;
427
428 case PIPE_TEX_WRAP_CLAMP:
429 return MALI_WRAP_CLAMP;
430
431 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
432 return MALI_WRAP_CLAMP_TO_EDGE;
433
434 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
435 return MALI_WRAP_CLAMP_TO_BORDER;
436
437 case PIPE_TEX_WRAP_MIRROR_REPEAT:
438 return MALI_WRAP_MIRRORED_REPEAT;
439
440 case PIPE_TEX_WRAP_MIRROR_CLAMP:
441 return MALI_WRAP_MIRRORED_CLAMP;
442
443 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
444 return MALI_WRAP_MIRRORED_CLAMP_TO_EDGE;
445
446 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
447 return MALI_WRAP_MIRRORED_CLAMP_TO_BORDER;
448
449 default:
450 unreachable("Invalid wrap");
451 }
452 }
453
454 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
455 struct mali_sampler_descriptor *hw)
456 {
457 unsigned func = panfrost_translate_compare_func(cso->compare_func);
458 bool min_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
459 bool mag_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
460 bool mip_linear = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR;
461 unsigned min_filter = min_nearest ? MALI_SAMP_MIN_NEAREST : 0;
462 unsigned mag_filter = mag_nearest ? MALI_SAMP_MAG_NEAREST : 0;
463 unsigned mip_filter = mip_linear ?
464 (MALI_SAMP_MIP_LINEAR_1 | MALI_SAMP_MIP_LINEAR_2) : 0;
465 unsigned normalized = cso->normalized_coords ? MALI_SAMP_NORM_COORDS : 0;
466
467 *hw = (struct mali_sampler_descriptor) {
468 .filter_mode = min_filter | mag_filter | mip_filter |
469 normalized,
470 .wrap_s = translate_tex_wrap(cso->wrap_s),
471 .wrap_t = translate_tex_wrap(cso->wrap_t),
472 .wrap_r = translate_tex_wrap(cso->wrap_r),
473 .compare_func = panfrost_flip_compare_func(func),
474 .border_color = {
475 cso->border_color.f[0],
476 cso->border_color.f[1],
477 cso->border_color.f[2],
478 cso->border_color.f[3]
479 },
480 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
481 .max_lod = FIXED_16(cso->max_lod, false),
482 .lod_bias = FIXED_16(cso->lod_bias, true), /* can be negative */
483 .seamless_cube_map = cso->seamless_cube_map,
484 };
485
486 /* If necessary, we disable mipmapping in the sampler descriptor by
487 * clamping the LOD as tight as possible (from 0 to epsilon,
488 * essentially -- remember these are fixed point numbers, so
489 * epsilon=1/256) */
490
491 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
492 hw->max_lod = hw->min_lod + 1;
493 }
494
495 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
496 struct bifrost_sampler_descriptor *hw)
497 {
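/* Note the opposite polarity of the filter bits below: min_filter is set
 * when the minification filter is nearest, while mag_filter is set when the
 * magnification filter is linear (describing the code as written; the exact
 * hardware meaning is assumed). */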
498 *hw = (struct bifrost_sampler_descriptor) {
499 .unk1 = 0x1,
500 .wrap_s = translate_tex_wrap(cso->wrap_s),
501 .wrap_t = translate_tex_wrap(cso->wrap_t),
502 .wrap_r = translate_tex_wrap(cso->wrap_r),
503 .unk8 = 0x8,
504 .min_filter = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST,
505 .norm_coords = cso->normalized_coords,
506 .mip_filter = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR,
507 .mag_filter = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR,
508 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
509 .max_lod = FIXED_16(cso->max_lod, false),
510 };
511
512 /* If necessary, we disable mipmapping in the sampler descriptor by
513 * clamping the LOD as tight as possible (from 0 to epsilon,
514 * essentially -- remember these are fixed point numbers, so
515 * epsilon=1/256) */
516
517 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
518 hw->max_lod = hw->min_lod + 1;
519 }
520
521 static void
522 panfrost_make_stencil_state(const struct pipe_stencil_state *in,
523 struct mali_stencil_test *out)
524 {
525 out->ref = 0; /* Gallium gets it from elsewhere */
526
527 out->mask = in->valuemask;
528 out->func = panfrost_translate_compare_func(in->func);
529 out->sfail = panfrost_translate_stencil_op(in->fail_op);
530 out->dpfail = panfrost_translate_stencil_op(in->zfail_op);
531 out->dppass = panfrost_translate_stencil_op(in->zpass_op);
532 }
533
534 static void
535 panfrost_frag_meta_rasterizer_update(struct panfrost_context *ctx,
536 struct mali_shader_meta *fragmeta)
537 {
538 if (!ctx->rasterizer) {
539 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, true);
540 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, false);
541 fragmeta->depth_units = 0.0f;
542 fragmeta->depth_factor = 0.0f;
543 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, false);
544 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, false);
545 return;
546 }
547
548 bool msaa = ctx->rasterizer->base.multisample;
549
550 /* TODO: Sample size */
551 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
552 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
553 fragmeta->depth_units = ctx->rasterizer->base.offset_units * 2.0f;
554 fragmeta->depth_factor = ctx->rasterizer->base.offset_scale;
555
556 /* XXX: Which bit is which? Does this maybe allow offsetting not-tri? */
557
558 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A,
559 ctx->rasterizer->base.offset_tri);
560 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B,
561 ctx->rasterizer->base.offset_tri);
562 }
563
564 static void
565 panfrost_frag_meta_zsa_update(struct panfrost_context *ctx,
566 struct mali_shader_meta *fragmeta)
567 {
568 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
569 int zfunc = PIPE_FUNC_ALWAYS;
570
571 if (!zsa) {
572 struct pipe_stencil_state default_stencil = {
573 .enabled = 0,
574 .func = PIPE_FUNC_ALWAYS,
575 .fail_op = MALI_STENCIL_KEEP,
576 .zfail_op = MALI_STENCIL_KEEP,
577 .zpass_op = MALI_STENCIL_KEEP,
578 .writemask = 0xFF,
579 .valuemask = 0xFF
580 };
581
582 panfrost_make_stencil_state(&default_stencil,
583 &fragmeta->stencil_front);
584 fragmeta->stencil_mask_front = default_stencil.writemask;
585 fragmeta->stencil_back = fragmeta->stencil_front;
586 fragmeta->stencil_mask_back = default_stencil.writemask;
587 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST, false);
588 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK, false);
589 } else {
590 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
591 zsa->stencil[0].enabled);
592 panfrost_make_stencil_state(&zsa->stencil[0],
593 &fragmeta->stencil_front);
594 fragmeta->stencil_mask_front = zsa->stencil[0].writemask;
595 fragmeta->stencil_front.ref = ctx->stencil_ref.ref_value[0];
596
597 /* If back-stencil is not enabled, use the front values */
598
599 if (zsa->stencil[1].enabled) {
600 panfrost_make_stencil_state(&zsa->stencil[1],
601 &fragmeta->stencil_back);
602 fragmeta->stencil_mask_back = zsa->stencil[1].writemask;
603 fragmeta->stencil_back.ref = ctx->stencil_ref.ref_value[1];
604 } else {
605 fragmeta->stencil_back = fragmeta->stencil_front;
606 fragmeta->stencil_mask_back = fragmeta->stencil_mask_front;
607 fragmeta->stencil_back.ref = fragmeta->stencil_front.ref;
608 }
609
610 if (zsa->depth.enabled)
611 zfunc = zsa->depth.func;
612
613 /* Depth state (TODO: Refactor) */
614
615 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
616 zsa->depth.writemask);
617 }
618
619 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
620 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(zfunc));
621 }
622
623 static bool
624 panfrost_fs_required(
625 struct panfrost_shader_state *fs,
626 struct panfrost_blend_final *blend,
627 unsigned rt_count)
628 {
629 /* If we generally have side effects */
630 if (fs->fs_sidefx)
631 return true;
632
633 /* If colour is written we need to execute */
634 for (unsigned i = 0; i < rt_count; ++i) {
635 if (!blend[i].no_colour)
636 return true;
637 }
638
639 /* If depth is written and not implied we need to execute.
640 * TODO: Predicate on Z/S writes being enabled */
641 return (fs->writes_depth || fs->writes_stencil);
642 }
643
644 static void
645 panfrost_frag_meta_blend_update(struct panfrost_context *ctx,
646 struct mali_shader_meta *fragmeta,
647 void *rts)
648 {
649 const struct panfrost_device *dev = pan_device(ctx->base.screen);
650 struct panfrost_shader_state *fs;
651 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
652
653 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
654 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
655 !ctx->blend->base.dither);
656
657 /* Get blending setup */
658 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
659
660 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
661 unsigned shader_offset = 0;
662 struct panfrost_bo *shader_bo = NULL;
663
664 for (unsigned c = 0; c < rt_count; ++c)
665 blend[c] = panfrost_get_blend_for_context(ctx, c, &shader_bo,
666 &shader_offset);
667
668 /* Disable shader execution if we can */
669 if (dev->quirks & MIDGARD_SHADERLESS
670 && !panfrost_fs_required(fs, blend, rt_count)) {
671 fragmeta->shader = 0;
672 fragmeta->attribute_count = 0;
673 fragmeta->varying_count = 0;
674 fragmeta->texture_count = 0;
675 fragmeta->sampler_count = 0;
676
677 /* This feature is not known to work on Bifrost */
678 fragmeta->midgard1.work_count = 1;
679 fragmeta->midgard1.uniform_count = 0;
680 fragmeta->midgard1.uniform_buffer_count = 0;
681 }
682
683 /* If there is a blend shader, work registers are shared. We impose 8
684 * work registers as a limit for blend shaders. Should be lower XXX */
685
686 if (!(dev->quirks & IS_BIFROST)) {
687 for (unsigned c = 0; c < rt_count; ++c) {
688 if (blend[c].is_shader) {
689 fragmeta->midgard1.work_count =
690 MAX2(fragmeta->midgard1.work_count, 8);
691 }
692 }
693 }
694
695 /* Even on MFBD, the shader descriptor gets blend shaders. It's *also*
696 * copied to the blend_meta appended (by convention), but this is the
697 * field actually read by the hardware. (Or maybe both are read...?).
698 * Specify the last RTi with a blend shader. */
699
700 fragmeta->blend.shader = 0;
701
702 for (signed rt = (rt_count - 1); rt >= 0; --rt) {
703 if (!blend[rt].is_shader)
704 continue;
705
706 fragmeta->blend.shader = blend[rt].shader.gpu |
707 blend[rt].shader.first_tag;
708 break;
709 }
710
711 if (dev->quirks & MIDGARD_SFBD) {
712 /* On platforms with only a single render target (SFBD), the blend
713 * information is inside the shader meta itself. We additionally
714 * need to signal CAN_DISCARD for nontrivial blend modes (so
715 * we're able to read back the destination buffer) */
716
717 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
718 blend[0].is_shader);
719
720 if (!blend[0].is_shader) {
721 fragmeta->blend.equation = *blend[0].equation.equation;
722 fragmeta->blend.constant = blend[0].equation.constant;
723 }
724
725 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
726 !blend[0].no_blending || fs->can_discard);
727 return;
728 }
729
730 /* Additional blend descriptor tacked on for jobs using MFBD */
731
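/* The per-RT flags below are computed once and then written into either the
 * Bifrost or the Midgard MRT descriptor, so both paths stay in sync -- this is
 * the MRT blend flag calculation shared with Bifrost referenced by the commit. */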
732 for (unsigned i = 0; i < rt_count; ++i) {
733 unsigned flags = 0;
734
735 if (ctx->pipe_framebuffer.nr_cbufs > i && !blend[i].no_colour) {
736 flags = 0x200;
737
738 bool is_srgb = (ctx->pipe_framebuffer.nr_cbufs > i) &&
739 (ctx->pipe_framebuffer.cbufs[i]) &&
740 util_format_is_srgb(ctx->pipe_framebuffer.cbufs[i]->format);
741
742 SET_BIT(flags, MALI_BLEND_MRT_SHADER, blend[i].is_shader);
743 SET_BIT(flags, MALI_BLEND_LOAD_TIB, !blend[i].no_blending);
744 SET_BIT(flags, MALI_BLEND_SRGB, is_srgb);
745 SET_BIT(flags, MALI_BLEND_NO_DITHER, !ctx->blend->base.dither);
746 }
747
748 if (dev->quirks & IS_BIFROST) {
749 struct bifrost_blend_rt *brts = rts;
750
751 brts[i].flags = flags;
752
753 if (blend[i].is_shader) {
754 /* The blend shader's address needs to be at
755  * the same top 32 bits as the fragment shader.
756 * TODO: Ensure that's always the case.
757 */
758 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
759 (fs->bo->gpu & (0xffffffffull << 32)));
760 brts[i].shader = blend[i].shader.gpu;
761 brts[i].unk2 = 0x0;
762 } else if (ctx->pipe_framebuffer.nr_cbufs > i) {
763 enum pipe_format format = ctx->pipe_framebuffer.cbufs[i]->format;
764 const struct util_format_description *format_desc;
765 format_desc = util_format_description(format);
766
767 brts[i].equation = *blend[i].equation.equation;
768
769 /* TODO: this is a bit more complicated */
770 brts[i].constant = blend[i].equation.constant;
771
772 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
773 brts[i].unk2 = 0x19;
774
775 brts[i].shader_type = fs->blend_types[i];
776 } else {
777 /* Dummy attachment for depth-only */
778 brts[i].unk2 = 0x3;
779 brts[i].shader_type = fs->blend_types[i];
780 }
781 } else {
782 struct midgard_blend_rt *mrts = rts;
783 mrts[i].flags = flags;
784
785 if (blend[i].is_shader) {
786 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
787 } else {
788 mrts[i].blend.equation = *blend[i].equation.equation;
789 mrts[i].blend.constant = blend[i].equation.constant;
790 }
791 }
792 }
793 }
794
795 static void
796 panfrost_frag_shader_meta_init(struct panfrost_context *ctx,
797 struct mali_shader_meta *fragmeta,
798 void *rts)
799 {
800 const struct panfrost_device *dev = pan_device(ctx->base.screen);
801 struct panfrost_shader_state *fs;
802
803 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
804
805 fragmeta->alpha_coverage = ~MALI_ALPHA_COVERAGE(0.000000);
806 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x3010;
807 fragmeta->unknown2_4 = 0x4e0;
808
809 /* unknown2_4 has 0x10 bit set on T6XX and T720. We don't know why this
810 * is required (independent of 32-bit/64-bit descriptors), or why it's
811 * not used on later GPU revisions. Otherwise, all shader jobs fault on
812 * these earlier chips (perhaps this is a chicken bit of some kind).
813 * More investigation is needed. */
814
815 SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
816
817 if (dev->quirks & IS_BIFROST) {
818 /* TODO */
819 } else {
820 /* Depending on whether it's legal to do so in the given shader, we try to
821 * enable early-z testing (or forward-pixel kill?) */
822
823 SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
824 !fs->can_discard && !fs->writes_depth);
825
826 /* Add the writes Z/S flags if needed. */
827 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
828 SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
829
830 /* Any time texturing is used, derivatives are implicitly calculated,
831 * so we need to enable helper invocations */
832
833 SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
834 fs->helper_invocations);
835
836 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
837
838 bool depth_enabled = fs->writes_depth ||
839 (zsa && zsa->depth.enabled && zsa->depth.func != PIPE_FUNC_ALWAYS);
840
841 SET_BIT(fragmeta->midgard1.flags_lo, 0x400, !depth_enabled && fs->can_discard);
842 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_ZS, depth_enabled && fs->can_discard);
843 }
844
845 panfrost_frag_meta_rasterizer_update(ctx, fragmeta);
846 panfrost_frag_meta_zsa_update(ctx, fragmeta);
847 panfrost_frag_meta_blend_update(ctx, fragmeta, rts);
848 }
849
850 void
851 panfrost_emit_shader_meta(struct panfrost_batch *batch,
852 enum pipe_shader_type st,
853 struct mali_vertex_tiler_postfix *postfix)
854 {
855 struct panfrost_context *ctx = batch->ctx;
856 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
857
858 if (!ss) {
859 postfix->shader = 0;
860 return;
861 }
862
863 struct mali_shader_meta meta;
864
865 panfrost_shader_meta_init(ctx, st, &meta);
866
867 /* Add the shader BO to the batch. */
868 panfrost_batch_add_bo(batch, ss->bo,
869 PAN_BO_ACCESS_PRIVATE |
870 PAN_BO_ACCESS_READ |
871 panfrost_bo_access_for_stage(st));
872
873 mali_ptr shader_ptr;
874
875 if (st == PIPE_SHADER_FRAGMENT) {
876 struct panfrost_device *dev = pan_device(ctx->base.screen);
877 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
878 size_t desc_size = sizeof(meta);
879 void *rts = NULL;
880 struct panfrost_transfer xfer;
881 unsigned rt_size;
882
883 if (dev->quirks & MIDGARD_SFBD)
884 rt_size = 0;
885 else if (dev->quirks & IS_BIFROST)
886 rt_size = sizeof(struct bifrost_blend_rt);
887 else
888 rt_size = sizeof(struct midgard_blend_rt);
889
890 desc_size += rt_size * rt_count;
891
892 if (rt_size)
893 rts = rzalloc_size(ctx, rt_size * rt_count);
894
895 panfrost_frag_shader_meta_init(ctx, &meta, rts);
896
897 xfer = panfrost_allocate_transient(batch, desc_size);
898
899 memcpy(xfer.cpu, &meta, sizeof(meta));
900 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
901
902 if (rt_size)
903 ralloc_free(rts);
904
905 shader_ptr = xfer.gpu;
906 } else {
907 shader_ptr = panfrost_upload_transient(batch, &meta,
908 sizeof(meta));
909 }
910
911 postfix->shader = shader_ptr;
912 }
913
914 static void
915 panfrost_mali_viewport_init(struct panfrost_context *ctx,
916 struct mali_viewport *mvp)
917 {
918 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
919
920 /* Clip bounds are encoded as floats. The viewport itself is encoded as
921 * (somewhat) asymmetric ints. */
922
923 const struct pipe_scissor_state *ss = &ctx->scissor;
924
925 memset(mvp, 0, sizeof(*mvp));
926
927 /* By default, do no viewport clipping, i.e. clip to (-inf, inf) in
928 * each direction. Clipping to the viewport in theory should work, but
929 * in practice causes issues when we're not explicitly trying to
930 * scissor */
931
932 *mvp = (struct mali_viewport) {
933 .clip_minx = -INFINITY,
934 .clip_miny = -INFINITY,
935 .clip_maxx = INFINITY,
936 .clip_maxy = INFINITY,
937 };
938
939 /* Always scissor to the viewport by default. */
940 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
941 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
942
943 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
944 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
945
946 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
947 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
948
949 /* Apply the scissor test */
950
951 unsigned minx, miny, maxx, maxy;
952
953 if (ss && ctx->rasterizer && ctx->rasterizer->base.scissor) {
954 minx = MAX2(ss->minx, vp_minx);
955 miny = MAX2(ss->miny, vp_miny);
956 maxx = MIN2(ss->maxx, vp_maxx);
957 maxy = MIN2(ss->maxy, vp_maxy);
958 } else {
959 minx = vp_minx;
960 miny = vp_miny;
961 maxx = vp_maxx;
962 maxy = vp_maxy;
963 }
964
965 /* Hardware needs the min/max to be strictly ordered, so flip if we
966 * need to. The viewport transformation in the vertex shader will
967 * handle the negatives if we don't */
968
969 if (miny > maxy) {
970 unsigned temp = miny;
971 miny = maxy;
972 maxy = temp;
973 }
974
975 if (minx > maxx) {
976 unsigned temp = minx;
977 minx = maxx;
978 maxx = temp;
979 }
980
981 if (minz > maxz) {
982 float temp = minz;
983 minz = maxz;
984 maxz = temp;
985 }
986
987 /* Clamp to the framebuffer size as a last check */
988
989 minx = MIN2(ctx->pipe_framebuffer.width, minx);
990 maxx = MIN2(ctx->pipe_framebuffer.width, maxx);
991
992 miny = MIN2(ctx->pipe_framebuffer.height, miny);
993 maxy = MIN2(ctx->pipe_framebuffer.height, maxy);
994
995 /* Upload */
996
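/* viewport1 holds the inclusive maximum; MALI_POSITIVE() encodes it as the
 * value minus one, which panfrost_emit_viewport() adds back when unioning the
 * batch scissor. */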
997 mvp->viewport0[0] = minx;
998 mvp->viewport1[0] = MALI_POSITIVE(maxx);
999
1000 mvp->viewport0[1] = miny;
1001 mvp->viewport1[1] = MALI_POSITIVE(maxy);
1002
1003 mvp->clip_minz = minz;
1004 mvp->clip_maxz = maxz;
1005 }
1006
1007 void
1008 panfrost_emit_viewport(struct panfrost_batch *batch,
1009 struct mali_vertex_tiler_postfix *tiler_postfix)
1010 {
1011 struct panfrost_context *ctx = batch->ctx;
1012 struct mali_viewport mvp;
1013
1014 panfrost_mali_viewport_init(batch->ctx, &mvp);
1015
1016 /* Update the job, unless we're doing wallpapering (whose lack of
1017 * scissor we can ignore, since if we "miss" a tile of wallpaper, it'll
1018 * just... be faster :) */
1019
1020 if (!ctx->wallpaper_batch)
1021 panfrost_batch_union_scissor(batch, mvp.viewport0[0],
1022 mvp.viewport0[1],
1023 mvp.viewport1[0] + 1,
1024 mvp.viewport1[1] + 1);
1025
1026 tiler_postfix->viewport = panfrost_upload_transient(batch, &mvp,
1027 sizeof(mvp));
1028 }
1029
1030 static mali_ptr
1031 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
1032 enum pipe_shader_type st,
1033 struct panfrost_constant_buffer *buf,
1034 unsigned index)
1035 {
1036 struct pipe_constant_buffer *cb = &buf->cb[index];
1037 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1038
1039 if (rsrc) {
1040 panfrost_batch_add_bo(batch, rsrc->bo,
1041 PAN_BO_ACCESS_SHARED |
1042 PAN_BO_ACCESS_READ |
1043 panfrost_bo_access_for_stage(st));
1044
1045 /* Alignment guaranteed by
1046 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
1047 return rsrc->bo->gpu + cb->buffer_offset;
1048 } else if (cb->user_buffer) {
1049 return panfrost_upload_transient(batch,
1050 cb->user_buffer +
1051 cb->buffer_offset,
1052 cb->buffer_size);
1053 } else {
1054 unreachable("No constant buffer");
1055 }
1056 }
1057
1058 struct sysval_uniform {
1059 union {
1060 float f[4];
1061 int32_t i[4];
1062 uint32_t u[4];
1063 uint64_t du[2];
1064 };
1065 };
1066
1067 static void
1068 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
1069 struct sysval_uniform *uniform)
1070 {
1071 struct panfrost_context *ctx = batch->ctx;
1072 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1073
1074 uniform->f[0] = vp->scale[0];
1075 uniform->f[1] = vp->scale[1];
1076 uniform->f[2] = vp->scale[2];
1077 }
1078
1079 static void
1080 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
1081 struct sysval_uniform *uniform)
1082 {
1083 struct panfrost_context *ctx = batch->ctx;
1084 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1085
1086 uniform->f[0] = vp->translate[0];
1087 uniform->f[1] = vp->translate[1];
1088 uniform->f[2] = vp->translate[2];
1089 }
1090
1091 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
1092 enum pipe_shader_type st,
1093 unsigned int sysvalid,
1094 struct sysval_uniform *uniform)
1095 {
1096 struct panfrost_context *ctx = batch->ctx;
1097 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
1098 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
1099 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
1100 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
1101
1102 assert(dim);
1103 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
1104
1105 if (dim > 1)
1106 uniform->i[1] = u_minify(tex->texture->height0,
1107 tex->u.tex.first_level);
1108
1109 if (dim > 2)
1110 uniform->i[2] = u_minify(tex->texture->depth0,
1111 tex->u.tex.first_level);
1112
1113 if (is_array)
1114 uniform->i[dim] = tex->texture->array_size;
1115 }
1116
1117 static void
1118 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
1119 enum pipe_shader_type st,
1120 unsigned ssbo_id,
1121 struct sysval_uniform *uniform)
1122 {
1123 struct panfrost_context *ctx = batch->ctx;
1124
1125 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
1126 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
1127
1128 /* Compute address */
1129 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
1130
1131 panfrost_batch_add_bo(batch, bo,
1132 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
1133 panfrost_bo_access_for_stage(st));
1134
1135 /* Upload address and size as sysval */
1136 uniform->du[0] = bo->gpu + sb.buffer_offset;
1137 uniform->u[2] = sb.buffer_size;
1138 }
1139
1140 static void
1141 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1142 enum pipe_shader_type st,
1143 unsigned samp_idx,
1144 struct sysval_uniform *uniform)
1145 {
1146 struct panfrost_context *ctx = batch->ctx;
1147 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1148
1149 uniform->f[0] = sampl->min_lod;
1150 uniform->f[1] = sampl->max_lod;
1151 uniform->f[2] = sampl->lod_bias;
1152
1153 /* Even without any errata, Midgard represents "no mipmapping" as
1154 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1155 * panfrost_create_sampler_state which also explains our choice of
1156 * epsilon value (again to keep behaviour consistent) */
1157
1158 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1159 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1160 }
1161
1162 static void
1163 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1164 struct sysval_uniform *uniform)
1165 {
1166 struct panfrost_context *ctx = batch->ctx;
1167
1168 uniform->u[0] = ctx->compute_grid->grid[0];
1169 uniform->u[1] = ctx->compute_grid->grid[1];
1170 uniform->u[2] = ctx->compute_grid->grid[2];
1171 }
1172
1173 static void
1174 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1175 struct panfrost_shader_state *ss,
1176 enum pipe_shader_type st)
1177 {
1178 struct sysval_uniform *uniforms = (void *)buf;
1179
1180 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1181 int sysval = ss->sysval[i];
1182
1183 switch (PAN_SYSVAL_TYPE(sysval)) {
1184 case PAN_SYSVAL_VIEWPORT_SCALE:
1185 panfrost_upload_viewport_scale_sysval(batch,
1186 &uniforms[i]);
1187 break;
1188 case PAN_SYSVAL_VIEWPORT_OFFSET:
1189 panfrost_upload_viewport_offset_sysval(batch,
1190 &uniforms[i]);
1191 break;
1192 case PAN_SYSVAL_TEXTURE_SIZE:
1193 panfrost_upload_txs_sysval(batch, st,
1194 PAN_SYSVAL_ID(sysval),
1195 &uniforms[i]);
1196 break;
1197 case PAN_SYSVAL_SSBO:
1198 panfrost_upload_ssbo_sysval(batch, st,
1199 PAN_SYSVAL_ID(sysval),
1200 &uniforms[i]);
1201 break;
1202 case PAN_SYSVAL_NUM_WORK_GROUPS:
1203 panfrost_upload_num_work_groups_sysval(batch,
1204 &uniforms[i]);
1205 break;
1206 case PAN_SYSVAL_SAMPLER:
1207 panfrost_upload_sampler_sysval(batch, st,
1208 PAN_SYSVAL_ID(sysval),
1209 &uniforms[i]);
1210 break;
1211 default:
1212 assert(0);
1213 }
1214 }
1215 }
1216
1217 static const void *
1218 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1219 unsigned index)
1220 {
1221 struct pipe_constant_buffer *cb = &buf->cb[index];
1222 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1223
1224 if (rsrc)
1225 return rsrc->bo->cpu;
1226 else if (cb->user_buffer)
1227 return cb->user_buffer;
1228 else
1229 unreachable("No constant buffer");
1230 }
1231
1232 void
1233 panfrost_emit_const_buf(struct panfrost_batch *batch,
1234 enum pipe_shader_type stage,
1235 struct mali_vertex_tiler_postfix *postfix)
1236 {
1237 struct panfrost_context *ctx = batch->ctx;
1238 struct panfrost_shader_variants *all = ctx->shader[stage];
1239
1240 if (!all)
1241 return;
1242
1243 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1244
1245 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1246
1247 /* Uniforms are implicitly UBO #0 */
1248 bool has_uniforms = buf->enabled_mask & (1 << 0);
1249
1250 /* Allocate room for the sysvals and the uniforms */
1251 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1252 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1253 size_t size = sys_size + uniform_size;
1254 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1255 size);
1256
1257 /* Upload sysvals requested by the shader */
1258 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1259
1260 /* Upload uniforms */
1261 if (has_uniforms && uniform_size) {
1262 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1263 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1264 }
1265
1266 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1267 * uploaded */
1268
1269 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1270 assert(ubo_count >= 1);
1271
1272 size_t sz = sizeof(uint64_t) * ubo_count;
1273 uint64_t ubos[PAN_MAX_CONST_BUFFERS];
1274 int uniform_count = ss->uniform_count;
1275
1276 /* Upload uniforms as a UBO */
1277 ubos[0] = MALI_MAKE_UBO(2 + uniform_count, transfer.gpu);
1278
1279 /* The rest are honest-to-goodness UBOs */
1280
1281 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1282 size_t usz = buf->cb[ubo].buffer_size;
1283 bool enabled = buf->enabled_mask & (1 << ubo);
1284 bool empty = usz == 0;
1285
1286 if (!enabled || empty) {
1287 /* Stub out disabled UBOs to catch accesses */
1288 ubos[ubo] = MALI_MAKE_UBO(0, 0xDEAD0000);
1289 continue;
1290 }
1291
1292 mali_ptr gpu = panfrost_map_constant_buffer_gpu(batch, stage,
1293 buf, ubo);
1294
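/* UBO sizes are encoded in 16-byte fields; e.g. a 100-byte buffer rounds up
 * to 112 bytes = 7 fields. */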
1295 unsigned bytes_per_field = 16;
1296 unsigned aligned = ALIGN_POT(usz, bytes_per_field);
1297 ubos[ubo] = MALI_MAKE_UBO(aligned / bytes_per_field, gpu);
1298 }
1299
1300 mali_ptr ubufs = panfrost_upload_transient(batch, ubos, sz);
1301 postfix->uniforms = transfer.gpu;
1302 postfix->uniform_buffers = ubufs;
1303
1304 buf->dirty_mask = 0;
1305 }
1306
1307 void
1308 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1309 const struct pipe_grid_info *info,
1310 struct midgard_payload_vertex_tiler *vtp)
1311 {
1312 struct panfrost_context *ctx = batch->ctx;
1313 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1314 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1315 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1316 128));
1317 unsigned shared_size = single_size * info->grid[0] * info->grid[1] *
1318 info->grid[2] * 4;
1319 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1320 shared_size,
1321 1);
1322
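/* single_size rounds the per-workgroup shared size up to a power of two (at
 * least 128 bytes); shared_size then covers every workgroup in the grid with
 * an extra 4x factor of slack, and the descriptor fields below express the
 * workgroup count and size in log2 form. */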
1323 struct mali_shared_memory shared = {
1324 .shared_memory = bo->gpu,
1325 .shared_workgroup_count =
1326 util_logbase2_ceil(info->grid[0]) +
1327 util_logbase2_ceil(info->grid[1]) +
1328 util_logbase2_ceil(info->grid[2]),
1329 .shared_unk1 = 0x2,
1330 .shared_shift = util_logbase2(single_size) - 1
1331 };
1332
1333 vtp->postfix.shared_memory = panfrost_upload_transient(batch, &shared,
1334 sizeof(shared));
1335 }
1336
1337 static mali_ptr
1338 panfrost_get_tex_desc(struct panfrost_batch *batch,
1339 enum pipe_shader_type st,
1340 struct panfrost_sampler_view *view)
1341 {
1342 if (!view)
1343 return (mali_ptr) 0;
1344
1345 struct pipe_sampler_view *pview = &view->base;
1346 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1347
1348 /* Add the BO to the job so it's retained until the job is done. */
1349
1350 panfrost_batch_add_bo(batch, rsrc->bo,
1351 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1352 panfrost_bo_access_for_stage(st));
1353
1354 panfrost_batch_add_bo(batch, view->midgard_bo,
1355 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1356 panfrost_bo_access_for_stage(st));
1357
1358 return view->midgard_bo->gpu;
1359 }
1360
1361 void
1362 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1363 enum pipe_shader_type stage,
1364 struct mali_vertex_tiler_postfix *postfix)
1365 {
1366 struct panfrost_context *ctx = batch->ctx;
1367 struct panfrost_device *device = pan_device(ctx->base.screen);
1368
1369 if (!ctx->sampler_view_count[stage])
1370 return;
1371
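/* Bifrost consumes a flat array of texture descriptors copied into transient
 * memory, whereas Midgard consumes an array of 64-bit GPU pointers
 * ("trampolines") to descriptors that already live in their own BOs. */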
1372 if (device->quirks & IS_BIFROST) {
1373 struct bifrost_texture_descriptor *descriptors;
1374
1375 descriptors = malloc(sizeof(struct bifrost_texture_descriptor) *
1376 ctx->sampler_view_count[stage]);
1377
1378 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1379 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1380 struct pipe_sampler_view *pview = &view->base;
1381 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1382
1383 /* Add the BOs to the job so they are retained until the job is done. */
1384
1385 panfrost_batch_add_bo(batch, rsrc->bo,
1386 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1387 panfrost_bo_access_for_stage(stage));
1388
1389 panfrost_batch_add_bo(batch, view->bifrost_bo,
1390 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1391 panfrost_bo_access_for_stage(stage));
1392
1393 memcpy(&descriptors[i], view->bifrost_descriptor, sizeof(*view->bifrost_descriptor));
1394 }
1395
1396 postfix->textures = panfrost_upload_transient(batch,
1397 descriptors,
1398 sizeof(struct bifrost_texture_descriptor) *
1399 ctx->sampler_view_count[stage]);
1400
1401 free(descriptors);
1402 } else {
1403 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1404
1405 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i)
1406 trampolines[i] = panfrost_get_tex_desc(batch, stage,
1407 ctx->sampler_views[stage][i]);
1408
1409 postfix->textures = panfrost_upload_transient(batch,
1410 trampolines,
1411 sizeof(uint64_t) *
1412 ctx->sampler_view_count[stage]);
1413 }
1414 }
1415
1416 void
1417 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1418 enum pipe_shader_type stage,
1419 struct mali_vertex_tiler_postfix *postfix)
1420 {
1421 struct panfrost_context *ctx = batch->ctx;
1422 struct panfrost_device *device = pan_device(ctx->base.screen);
1423
1424 if (!ctx->sampler_count[stage])
1425 return;
1426
1427 if (device->quirks & IS_BIFROST) {
1428 size_t desc_size = sizeof(struct bifrost_sampler_descriptor);
1429 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1430 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1431 transfer_size);
1432 struct bifrost_sampler_descriptor *desc = (struct bifrost_sampler_descriptor *)transfer.cpu;
1433
1434 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1435 desc[i] = ctx->samplers[stage][i]->bifrost_hw;
1436
1437 postfix->sampler_descriptor = transfer.gpu;
1438 } else {
1439 size_t desc_size = sizeof(struct mali_sampler_descriptor);
1440 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1441 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1442 transfer_size);
1443 struct mali_sampler_descriptor *desc = (struct mali_sampler_descriptor *)transfer.cpu;
1444
1445 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1446 desc[i] = ctx->samplers[stage][i]->midgard_hw;
1447
1448 postfix->sampler_descriptor = transfer.gpu;
1449 }
1450 }
1451
1452 void
1453 panfrost_emit_vertex_attr_meta(struct panfrost_batch *batch,
1454 struct mali_vertex_tiler_postfix *vertex_postfix)
1455 {
1456 struct panfrost_context *ctx = batch->ctx;
1457
1458 if (!ctx->vertex)
1459 return;
1460
1461 struct panfrost_vertex_state *so = ctx->vertex;
1462
1463 panfrost_vertex_state_upd_attr_offs(ctx, vertex_postfix);
1464 vertex_postfix->attribute_meta = panfrost_upload_transient(batch, so->hw,
1465 sizeof(*so->hw) *
1466 PAN_MAX_ATTRIBUTE);
1467 }
1468
1469 void
1470 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1471 struct mali_vertex_tiler_postfix *vertex_postfix)
1472 {
1473 struct panfrost_context *ctx = batch->ctx;
1474 struct panfrost_vertex_state *so = ctx->vertex;
1475
1476 /* Staged mali_attr, and index into them. i =/= k, depending on the
1477 * vertex buffer mask and instancing. Twice as much room is allocated,
1478  * for a worst case of NPOT_DIVIDEs which take up an extra slot */
1479 union mali_attr attrs[PIPE_MAX_ATTRIBS * 2];
1480 unsigned k = 0;
1481
1482 for (unsigned i = 0; i < so->num_elements; ++i) {
1483 /* We map a mali_attr to be 1:1 with the mali_attr_meta, which
1484 * means duplicating some vertex buffers (who cares? aside from
1485 * maybe some caching implications but I somehow doubt that
1486 * matters) */
1487
1488 struct pipe_vertex_element *elem = &so->pipe[i];
1489 unsigned vbi = elem->vertex_buffer_index;
1490
1491 /* The exception to 1:1 mapping is that we can have multiple
1492 * entries (NPOT divisors), so we fixup anyways */
1493
1494 so->hw[i].index = k;
1495
1496 if (!(ctx->vb_mask & (1 << vbi)))
1497 continue;
1498
1499 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1500 struct panfrost_resource *rsrc;
1501
1502 rsrc = pan_resource(buf->buffer.resource);
1503 if (!rsrc)
1504 continue;
1505
1506 /* Align to 64 bytes by masking off the lower bits. This
1507 * will be adjusted back when we fixup the src_offset in
1508 * mali_attr_meta */
1509
1510 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1511 mali_ptr addr = raw_addr & ~63;
1512 unsigned chopped_addr = raw_addr - addr;
1513
1514 /* Add a dependency of the batch on the vertex buffer */
1515 panfrost_batch_add_bo(batch, rsrc->bo,
1516 PAN_BO_ACCESS_SHARED |
1517 PAN_BO_ACCESS_READ |
1518 PAN_BO_ACCESS_VERTEX_TILER);
1519
1520 /* Set common fields */
1521 attrs[k].elements = addr;
1522 attrs[k].stride = buf->stride;
1523
1524 /* Since we advanced the base pointer, we shrink the buffer
1525 * size */
1526 attrs[k].size = rsrc->base.width0 - buf->buffer_offset;
1527
1528 /* We need to add the extra size we masked off (for
1529 * correctness) so the data doesn't get clamped away */
1530 attrs[k].size += chopped_addr;
1531
1532 /* For non-instancing make sure we initialize */
1533 attrs[k].shift = attrs[k].extra_flags = 0;
1534
1535 /* Instancing uses a dramatically different code path than
1536 * linear, so dispatch for the actual emission now that the
1537 * common code is finished */
1538
1539 unsigned divisor = elem->instance_divisor;
1540
1541 if (divisor && ctx->instance_count == 1) {
1542 /* Silly corner case where there's a divisor(=1) but
1543 * there's no legitimate instancing. So we want *every*
1544 * attribute to be the same. So set stride to zero so
1545 * we don't go anywhere. */
1546
1547 attrs[k].size = attrs[k].stride + chopped_addr;
1548 attrs[k].stride = 0;
1549 attrs[k++].elements |= MALI_ATTR_LINEAR;
1550 } else if (ctx->instance_count <= 1) {
1551 /* Normal, non-instanced attributes */
1552 attrs[k++].elements |= MALI_ATTR_LINEAR;
1553 } else {
1554 unsigned instance_shift = vertex_postfix->instance_shift;
1555 unsigned instance_odd = vertex_postfix->instance_odd;
1556
1557 k += panfrost_vertex_instanced(ctx->padded_count,
1558 instance_shift,
1559 instance_odd,
1560 divisor, &attrs[k]);
1561 }
1562 }
1563
1564 /* Add special gl_VertexID/gl_InstanceID buffers */
1565
1566 panfrost_vertex_id(ctx->padded_count, &attrs[k]);
1567 so->hw[PAN_VERTEX_ID].index = k++;
1568 panfrost_instance_id(ctx->padded_count, &attrs[k]);
1569 so->hw[PAN_INSTANCE_ID].index = k++;
1570
1571 /* Upload whatever we emitted and go */
1572
1573 vertex_postfix->attributes = panfrost_upload_transient(batch, attrs,
1574 k * sizeof(*attrs));
1575 }
1576
1577 static mali_ptr
1578 panfrost_emit_varyings(struct panfrost_batch *batch, union mali_attr *slot,
1579 unsigned stride, unsigned count)
1580 {
1581 /* Fill out the descriptor */
1582 slot->stride = stride;
1583 slot->size = stride * count;
1584 slot->shift = slot->extra_flags = 0;
1585
1586 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1587 slot->size);
1588
1589 slot->elements = transfer.gpu | MALI_ATTR_LINEAR;
1590
1591 return transfer.gpu;
1592 }
1593
1594 static void
1595 panfrost_emit_streamout(struct panfrost_batch *batch, union mali_attr *slot,
1596 unsigned stride, unsigned offset, unsigned count,
1597 struct pipe_stream_output_target *target)
1598 {
1599 /* Fill out the descriptor */
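/* Gallium stream-output strides are counted in dwords; the descriptor wants
 * bytes, hence the * 4. */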
1600 slot->stride = stride * 4;
1601 slot->shift = slot->extra_flags = 0;
1602
1603 unsigned max_size = target->buffer_size;
1604 unsigned expected_size = slot->stride * count;
1605
1606 slot->size = MIN2(max_size, expected_size);
1607
1608 /* Grab the BO and bind it to the batch */
1609 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1610
1611 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1612 * the perspective of the TILER and FRAGMENT.
1613 */
1614 panfrost_batch_add_bo(batch, bo,
1615 PAN_BO_ACCESS_SHARED |
1616 PAN_BO_ACCESS_RW |
1617 PAN_BO_ACCESS_VERTEX_TILER |
1618 PAN_BO_ACCESS_FRAGMENT);
1619
1620 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * slot->stride);
1621 slot->elements = addr;
1622 }
1623
1624 /* Given a shader and buffer indices, link varying metadata together */
1625
1626 static bool
1627 is_special_varying(gl_varying_slot loc)
1628 {
1629 switch (loc) {
1630 case VARYING_SLOT_POS:
1631 case VARYING_SLOT_PSIZ:
1632 case VARYING_SLOT_PNTC:
1633 case VARYING_SLOT_FACE:
1634 return true;
1635 default:
1636 return false;
1637 }
1638 }
1639
1640 static void
1641 panfrost_emit_varying_meta(void *outptr, struct panfrost_shader_state *ss,
1642 signed general, signed gl_Position,
1643 signed gl_PointSize, signed gl_PointCoord,
1644 signed gl_FrontFacing)
1645 {
1646 struct mali_attr_meta *out = (struct mali_attr_meta *) outptr;
1647
1648 for (unsigned i = 0; i < ss->varying_count; ++i) {
1649 gl_varying_slot location = ss->varyings_loc[i];
1650 int index = -1;
1651
1652 switch (location) {
1653 case VARYING_SLOT_POS:
1654 index = gl_Position;
1655 break;
1656 case VARYING_SLOT_PSIZ:
1657 index = gl_PointSize;
1658 break;
1659 case VARYING_SLOT_PNTC:
1660 index = gl_PointCoord;
1661 break;
1662 case VARYING_SLOT_FACE:
1663 index = gl_FrontFacing;
1664 break;
1665 default:
1666 index = general;
1667 break;
1668 }
1669
1670 assert(index >= 0);
1671 out[i].index = index;
1672 }
1673 }
1674
1675 static bool
1676 has_point_coord(unsigned mask, gl_varying_slot loc)
1677 {
1678 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1679 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1680 else if (loc == VARYING_SLOT_PNTC)
1681 return (mask & (1 << 8));
1682 else
1683 return false;
1684 }
1685
1686 /* Helpers for manipulating stream out information so we can pack varyings
1687 * accordingly. Compute the src_offset for a given captured varying */
1688
1689 static struct pipe_stream_output *
1690 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1691 {
1692 for (unsigned i = 0; i < info->num_outputs; ++i) {
1693 if (info->output[i].register_index == loc)
1694 return &info->output[i];
1695 }
1696
1697 unreachable("Varying not captured");
1698 }
1699
1700 void
1701 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1702 unsigned vertex_count,
1703 struct mali_vertex_tiler_postfix *vertex_postfix,
1704 struct mali_vertex_tiler_postfix *tiler_postfix,
1705 union midgard_primitive_size *primitive_size)
1706 {
1707 /* Load the shaders */
1708 struct panfrost_context *ctx = batch->ctx;
1709 struct panfrost_shader_state *vs, *fs;
1710 unsigned int num_gen_varyings = 0;
1711 size_t vs_size, fs_size;
1712
1713 /* Allocate the varying descriptor */
1714
1715 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1716 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1717 vs_size = sizeof(struct mali_attr_meta) * vs->varying_count;
1718 fs_size = sizeof(struct mali_attr_meta) * fs->varying_count;
1719
1720 struct panfrost_transfer trans = panfrost_allocate_transient(batch,
1721 vs_size +
1722 fs_size);
1723
1724 struct pipe_stream_output_info *so = &vs->stream_output;
1725
1726 /* Check if this varying is linked by us. This is the case for
1727 * general-purpose, non-captured varyings. If it is, link it. If it's
1728 * not, use the provided stream out information to determine the
1729 * offset, since it was already linked for us. */
1730
1731 for (unsigned i = 0; i < vs->varying_count; i++) {
1732 gl_varying_slot loc = vs->varyings_loc[i];
1733
1734 bool special = is_special_varying(loc);
1735 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1736
1737 if (captured) {
1738 struct pipe_stream_output *o = pan_get_so(so, loc);
1739
1740 unsigned dst_offset = o->dst_offset * 4; /* dwords */
1741 vs->varyings[i].src_offset = dst_offset;
1742 } else if (!special) {
1743 vs->varyings[i].src_offset = 16 * (num_gen_varyings++);
1744 }
1745 }
1746
1747 /* Conversely, we need to set src_offset for the captured varyings.
1748 * Here, the layout is defined by the stream out info, not us */
1749
1750 /* Link up with fragment varyings */
1751 bool reads_point_coord = fs->reads_point_coord;
1752
1753 for (unsigned i = 0; i < fs->varying_count; i++) {
1754 gl_varying_slot loc = fs->varyings_loc[i];
1755 unsigned src_offset;
1756 signed vs_idx = -1;
1757
1758 /* Link up */
1759 for (unsigned j = 0; j < vs->varying_count; ++j) {
1760 if (vs->varyings_loc[j] == loc) {
1761 vs_idx = j;
1762 break;
1763 }
1764 }
1765
1766 /* Either assign or reuse */
1767 if (vs_idx >= 0)
1768 src_offset = vs->varyings[vs_idx].src_offset;
1769 else
1770 src_offset = 16 * (num_gen_varyings++);
1771
1772 fs->varyings[i].src_offset = src_offset;
1773
1774 if (has_point_coord(fs->point_sprite_mask, loc))
1775 reads_point_coord = true;
1776 }
1777
1778 memcpy(trans.cpu, vs->varyings, vs_size);
1779 memcpy(trans.cpu + vs_size, fs->varyings, fs_size);
1780
1781 union mali_attr varyings[PIPE_MAX_ATTRIBS] = {0};
1782
1783 /* Figure out how many streamout buffers could be bound */
1784 unsigned so_count = ctx->streamout.num_targets;
1785 for (unsigned i = 0; i < vs->varying_count; i++) {
1786 gl_varying_slot loc = vs->varyings_loc[i];
1787
1788 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1789 if (!captured) continue;
1790
1791 struct pipe_stream_output *o = pan_get_so(so, loc);
1792 so_count = MAX2(so_count, o->output_buffer + 1);
1793 }
1794
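        /* Lay out the varying buffer indices: stream out buffers first, then
         * the general-purpose varying buffer, then one record for each
         * special varying actually in use (-1 marks an unused special) */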
1795 signed idx = so_count;
1796 signed general = idx++;
1797 signed gl_Position = idx++;
1798 signed gl_PointSize = vs->writes_point_size ? (idx++) : -1;
1799 signed gl_PointCoord = reads_point_coord ? (idx++) : -1;
1800 signed gl_FrontFacing = fs->reads_face ? (idx++) : -1;
1801 signed gl_FragCoord = fs->reads_frag_coord ? (idx++) : -1;
1802
1803 /* Emit the stream out buffers */
1804
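        /* Number of vertices streamout will actually write, which depends on
         * the primitive type being drawn */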
1805 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1806 ctx->vertex_count);
1807
1808 for (unsigned i = 0; i < so_count; ++i) {
1809 if (i < ctx->streamout.num_targets) {
1810 panfrost_emit_streamout(batch, &varyings[i],
1811 so->stride[i],
1812 ctx->streamout.offsets[i],
1813 out_count,
1814 ctx->streamout.targets[i]);
1815 } else {
1816 /* Emit a dummy buffer */
1817 panfrost_emit_varyings(batch, &varyings[i],
1818 so->stride[i] * 4,
1819 out_count);
1820
1821 /* Clear the attribute type */
1822 varyings[i].elements &= ~0xF;
1823 }
1824 }
1825
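        /* The general-purpose varying buffer packs one 16-byte (vec4) slot
         * per linked varying per vertex, matching the src_offsets assigned
         * above */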
1826 panfrost_emit_varyings(batch, &varyings[general],
1827 num_gen_varyings * 16,
1828 vertex_count);
1829
1830 mali_ptr varyings_p;
1831
1832 /* fp32 vec4 gl_Position */
1833 varyings_p = panfrost_emit_varyings(batch, &varyings[gl_Position],
1834 sizeof(float) * 4, vertex_count);
1835 tiler_postfix->position_varying = varyings_p;
1836
1837
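        /* fp16 gl_PointSize, 2 bytes per vertex, fed to the tiler as the
         * primitive size pointer rather than as an ordinary varying */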
1838 if (panfrost_writes_point_size(ctx)) {
1839 varyings_p = panfrost_emit_varyings(batch,
1840 &varyings[gl_PointSize],
1841 2, vertex_count);
1842 primitive_size->pointer = varyings_p;
1843 }
1844
1845 if (reads_point_coord)
1846 varyings[gl_PointCoord].elements = MALI_VARYING_POINT_COORD;
1847
1848 if (fs->reads_face)
1849 varyings[gl_FrontFacing].elements = MALI_VARYING_FRONT_FACING;
1850
1851 if (fs->reads_frag_coord)
1852 varyings[gl_FragCoord].elements = MALI_VARYING_FRAG_COORD;
1853
1854 struct panfrost_device *device = pan_device(ctx->base.screen);
1855 assert(!(device->quirks & IS_BIFROST) || !(reads_point_coord));
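        /* Bifrost is not expected to take the special gl_PointCoord path
         * above; the assert guards that assumption */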
1856
1857 /* Let's go ahead and link varying meta to the buffer in question, now
1858 * that that information is available. VARYING_SLOT_POS is mapped to
1859  * gl_FragCoord for fragment shaders but gl_Position for vertex
1860  * shaders. */
1861
1862 panfrost_emit_varying_meta(trans.cpu, vs, general, gl_Position,
1863 gl_PointSize, gl_PointCoord,
1864 gl_FrontFacing);
1865
1866 panfrost_emit_varying_meta(trans.cpu + vs_size, fs, general,
1867 gl_FragCoord, gl_PointSize,
1868 gl_PointCoord, gl_FrontFacing);
1869
1870 /* Replace streamout */
1871
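        /* Captured varyings read from the transform feedback buffer they
         * were assigned to, with the format trimmed to the number of
         * captured components */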
1872 struct mali_attr_meta *ovs = (struct mali_attr_meta *)trans.cpu;
1873 struct mali_attr_meta *ofs = ovs + vs->varying_count;
1874
1875 for (unsigned i = 0; i < vs->varying_count; i++) {
1876 gl_varying_slot loc = vs->varyings_loc[i];
1877
1878 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1879 if (!captured)
1880 continue;
1881
1882 struct pipe_stream_output *o = pan_get_so(so, loc);
1883 ovs[i].index = o->output_buffer;
1884
1885 assert(o->stream == 0);
1886 ovs[i].format = (vs->varyings[i].format & ~MALI_NR_CHANNELS(4))
1887 | MALI_NR_CHANNELS(o->num_components);
1888
1889 if (device->quirks & HAS_SWIZZLES)
1890 ovs[i].swizzle = panfrost_get_default_swizzle(o->num_components);
1891 else
1892 ovs[i].swizzle = panfrost_bifrost_swizzle(o->num_components);
1893
1894 /* Link to the fragment */
1895 signed fs_idx = -1;
1896
1897 /* Link up */
1898 for (unsigned j = 0; j < fs->varying_count; ++j) {
1899 if (fs->varyings_loc[j] == loc) {
1900 fs_idx = j;
1901 break;
1902 }
1903 }
1904
1905 if (fs_idx >= 0) {
1906 ofs[fs_idx].index = ovs[i].index;
1907 ofs[fs_idx].format = ovs[i].format;
1908 ofs[fs_idx].swizzle = ovs[i].swizzle;
1909 }
1910 }
1911
1912 /* Replace point sprite */
1913 for (unsigned i = 0; i < fs->varying_count; i++) {
1914                 /* If we have a point sprite replacement, handle that here.
1915                  * We have to translate the location first. TODO: flip Y in
1916                  * the shader; we're already keying, it's just a time crunch */
1917
1918 if (has_point_coord(fs->point_sprite_mask,
1919 fs->varyings_loc[i])) {
1920 ofs[i].index = gl_PointCoord;
1921
1922 /* Swizzle out the z/w to 0/1 */
1923 ofs[i].format = MALI_RG16F;
1924 ofs[i].swizzle = panfrost_get_default_swizzle(2);
1925 }
1926 }
1927
1928 /* Fix up unaligned addresses */
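        /* The record base is kept 64-byte aligned; any misalignment was left
         * in the low bits of .elements, so fold it into the record size and
         * into each consumer's src_offset instead */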
1929 for (unsigned i = 0; i < so_count; ++i) {
1930 if (varyings[i].elements < MALI_RECORD_SPECIAL)
1931 continue;
1932
1933 unsigned align = (varyings[i].elements & 63);
1934
1935 /* While we're at it, the SO buffers are linear */
1936
1937 if (!align) {
1938 varyings[i].elements |= MALI_ATTR_LINEAR;
1939 continue;
1940 }
1941
1942 /* We need to adjust alignment */
1943 varyings[i].elements &= ~63;
1944 varyings[i].elements |= MALI_ATTR_LINEAR;
1945 varyings[i].size += align;
1946
1947 for (unsigned v = 0; v < vs->varying_count; ++v) {
1948 if (ovs[v].index != i)
1949 continue;
1950
1951 ovs[v].src_offset = vs->varyings[v].src_offset + align;
1952 }
1953
1954 for (unsigned f = 0; f < fs->varying_count; ++f) {
1955 if (ofs[f].index != i)
1956 continue;
1957
1958 ofs[f].src_offset = fs->varyings[f].src_offset + align;
1959 }
1960 }
1961
1962 varyings_p = panfrost_upload_transient(batch, varyings,
1963 idx * sizeof(*varyings));
1964 vertex_postfix->varyings = varyings_p;
1965 tiler_postfix->varyings = varyings_p;
1966
1967 vertex_postfix->varying_meta = trans.gpu;
1968 tiler_postfix->varying_meta = trans.gpu + vs_size;
1969 }
1970
1971 void
1972 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
1973 struct mali_vertex_tiler_prefix *vertex_prefix,
1974 struct mali_vertex_tiler_postfix *vertex_postfix,
1975 struct mali_vertex_tiler_prefix *tiler_prefix,
1976 struct mali_vertex_tiler_postfix *tiler_postfix,
1977 union midgard_primitive_size *primitive_size)
1978 {
1979 struct panfrost_context *ctx = batch->ctx;
1980 struct panfrost_device *device = pan_device(ctx->base.screen);
1981 bool wallpapering = ctx->wallpaper_batch && batch->tiler_dep;
1982 struct bifrost_payload_vertex bifrost_vertex = {0,};
1983 struct bifrost_payload_tiler bifrost_tiler = {0,};
1984 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
1985 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
1986 void *vp, *tp;
1987 size_t vp_size, tp_size;
1988
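        /* Bifrost and Midgard use different vertex/tiler payload layouts;
         * build whichever matches this device and note its size for job
         * submission */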
1989 if (device->quirks & IS_BIFROST) {
1990 bifrost_vertex.prefix = *vertex_prefix;
1991 bifrost_vertex.postfix = *vertex_postfix;
1992 vp = &bifrost_vertex;
1993 vp_size = sizeof(bifrost_vertex);
1994
1995 bifrost_tiler.prefix = *tiler_prefix;
1996 bifrost_tiler.tiler.primitive_size = *primitive_size;
1997 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
1998 bifrost_tiler.postfix = *tiler_postfix;
1999 tp = &bifrost_tiler;
2000 tp_size = sizeof(bifrost_tiler);
2001 } else {
2002 midgard_vertex.prefix = *vertex_prefix;
2003 midgard_vertex.postfix = *vertex_postfix;
2004 vp = &midgard_vertex;
2005 vp_size = sizeof(midgard_vertex);
2006
2007 midgard_tiler.prefix = *tiler_prefix;
2008 midgard_tiler.postfix = *tiler_postfix;
2009 midgard_tiler.primitive_size = *primitive_size;
2010 tp = &midgard_tiler;
2011 tp_size = sizeof(midgard_tiler);
2012 }
2013
2014 if (wallpapering) {
2015 /* Inject in reverse order, with "predicted" job indices.
2016 * THIS IS A HACK XXX */
2017 panfrost_new_job(batch, JOB_TYPE_TILER, false,
2018 batch->job_index + 2, tp, tp_size, true);
2019 panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
2020 vp, vp_size, true);
2021 return;
2022 }
2023
2024         /* If rasterizer discard is enabled, only submit the vertex job */
2025
2026 bool rasterizer_discard = ctx->rasterizer &&
2027 ctx->rasterizer->base.rasterizer_discard;
2028
2029 unsigned vertex = panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
2030 vp, vp_size, false);
2031
2032 if (rasterizer_discard)
2033 return;
2034
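        /* The tiler job consumes the vertex job's output, so pass the vertex
         * job index as its dependency */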
2035 panfrost_new_job(batch, JOB_TYPE_TILER, false, vertex, tp, tp_size,
2036 false);
2037 }
2038
2039 /* TODO: stop hardcoding this */
2040 mali_ptr
2041 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2042 {
2043 uint16_t locations[] = {
2044 128, 128,
2045 0, 256,
2046 0, 256,
2047 0, 256,
2048 0, 256,
2049 0, 256,
2050 0, 256,
2051 0, 256,
2052 0, 256,
2053 0, 256,
2054 0, 256,
2055 0, 256,
2056 0, 256,
2057 0, 256,
2058 0, 256,
2059 0, 256,
2060 0, 256,
2061 0, 256,
2062 0, 256,
2063 0, 256,
2064 0, 256,
2065 0, 256,
2066 0, 256,
2067 0, 256,
2068 0, 256,
2069 0, 256,
2070 0, 256,
2071 0, 256,
2072 0, 256,
2073 0, 256,
2074 0, 256,
2075 0, 256,
2076 128, 128,
2077 0, 0,
2078 0, 0,
2079 0, 0,
2080 0, 0,
2081 0, 0,
2082 0, 0,
2083 0, 0,
2084 0, 0,
2085 0, 0,
2086 0, 0,
2087 0, 0,
2088 0, 0,
2089 0, 0,
2090 0, 0,
2091 0, 0,
2092 };
2093
2094 return panfrost_upload_transient(batch, locations, 96 * sizeof(uint16_t));
2095 }